In [1]:
import scrapy
import pandas as pd

from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.selector import Selector
from scrapy.http import Request
from scrapy.utils.log import configure_logging

from w3lib.html import remove_tags

In [2]:
# Empty results frame: one column per field that PiataAZ.continue_parse
# stores for every scraped advertisement (column order is preserved).
df = pd.DataFrame(
    columns=[
        'parking',
        'date',
        'floor_plan',
        'timestamp',
        'floor_tiles',
        'description',
        'time',
        'area',
        'publisher',
        'title',
        'currency',
        'year',
        'price',
        'contact_name',
        'insulated_glass',
        'bathrooms',
        'parquet',
        'contact_url',
        'images',
        'views',
        'quarter',
        'street',
        'city',
        'url',
        'wall_tiles',
        'rooms',
        'central_heating',
    ]
)

In [3]:
# Site root; PiataAZ.parse prepends it to site-relative hrefs (those starting with "/").
url = "https://www.piata-az.ro"

In [4]:
class PiataAZ(scrapy.Spider):
    """Spider that scrapes apartment advertisements from piata-az.ro.

    ``parse`` handles the start page and schedules advertisement detail pages;
    ``continue_parse`` extracts the fields of one advertisement and appends
    them as a new row to the module-level ``df`` DataFrame.
    """

    name = "piata"
    allowed_domains = ["piata-az.ro"]
    # Listing page for a full crawl (disabled; a single ad is used instead):
    #     start_urls = ['https://www.piata-az.ro/imobiliare/cluj-napoca']
    start_urls = ['https://www.piata-az.ro/apartament-vanzare-3-camere-cluj-napoca-marasti-608096']

    def parse(self, response):
        """Yield one request per advertisement page, handled by ``continue_parse``.

        NOTE(review): the links collected from ``response`` are currently
        discarded and replaced by one hard-coded advertisement URL (see the
        override below).  Remove that override, and switch ``start_urls`` back
        to the listing page, to crawl every advertisement.
        """
        self.logger.info('1. A response from %s has just arrived!', response.url)

        hrefs = response.xpath('//div[@class="announcement-wrapper announcement-space"]/div//a/@href').extract()
        # Keep only site-relative links and make them absolute with the
        # module-level ``url``.  startswith() is used instead of href[0] so an
        # empty href attribute cannot raise IndexError.
        continue_urls = [url + href for href in hrefs if href.startswith("/")]
        # Drop the entries at even positions (0, 2, 4, ...); the original
        # author marked them as redundant duplicates.
        del continue_urls[::2]

        # Debug override: scrape only this single advertisement.
        continue_urls = ['https://www.piata-az.ro/apartament-vanzare-3-camere-cluj-napoca-marasti-608096']
        for continue_url in continue_urls:
            yield scrapy.Request(response.urljoin(continue_url), self.continue_parse)

    def continue_parse(self, response):
        """Extract one advertisement from ``response`` and append it to ``df``.

        Fields read from the details table, the contact block, the location
        links and the image gallery are optional and stored as ``None`` when
        absent.  Price, currency, date/time, views, title and description are
        indexed directly, so an ``IndexError`` is raised (and no row is added)
        when one of those nodes is missing from the page.
        """
        global df
        self.logger.info('2. A response from %s has just arrived!', response.url)
        page_url = response.url

        html_info_table = response.xpath('//div[@class="section-annoucement-details"]/div/ul/li/div').extract()

        # The extracted <div> cells alternate: label, value, label, value, ...
        keys = [remove_tags(cell) for cell in html_info_table[0::2]]
        values = [remove_tags(cell) for cell in html_info_table[1::2]]
        info_table = dict(zip(keys, values))

        parking = info_table.get('parcare')
        floor_tiles = info_table.get('gresie')
        area = info_table.get('suprafata')
        publisher = info_table.get('Pers. fizica sau agentie')
        bathrooms = info_table.get('bai')
        parquet = info_table.get('parchet')
        wall_tiles = info_table.get('faianta')
        rooms = info_table.get('camere')
        central_heating = info_table.get('centrala termica')
        insulated_glass = info_table.get('termopan')
        floor_plan = info_table.get('compartimentare')
        year = info_table.get('an constructie')
        # A street value of "-" is treated the same as a missing street.
        street = info_table.get('strada')
        if street == "-":
            street = None

        price = remove_tags(response.xpath('//div[@class="sidebar--details__top__price"]/strong').extract()[0]).strip()
        currency = remove_tags(response.xpath('//div[@class="sidebar--details__top__price"]/b').extract()[0])

        city_neighborhood_details = response.xpath('//div[@class="sidebar--details__bottom"]/ul/li/a/text()').extract()
        city = city_neighborhood_details[0] if len(city_neighborhood_details) >= 1 else None
        # The quarter is only taken when exactly two location links are present.
        quarter = city_neighborhood_details[1] if len(city_neighborhood_details) == 2 else None

        contact_links = response.xpath('//div[@class="sidebar sidebar--contact"]//a/@href').extract()
        contact_url = contact_links[0] if contact_links else None
        contact_names = response.xpath('//div[@class="name-wrapper pull-left"]/b/text()').extract()
        contact_name = contact_names[0] if contact_names else None

        date_time = response.xpath('//div[@class="announcement-detail__date-time pull-right"]/span/text()').extract()
        date = date_time[0]
        time = date_time[1]
        # Reverse the dot-separated date parts, join them with "-", then
        # append the time plus a ":00" seconds suffix.
        timestamp = "-".join(date.split(".")[::-1]) + " " + time + ":00"

        views_text = response.xpath(
            '//div[@class="announcement-detail clearfix"]/ul/li[@class="announcement-detail__list__views"]/text()'
        ).extract()[0]
        # Only the first space-separated token of the views text is kept.
        views = views_text.strip().split(" ")[0]

        title = response.xpath('//div[@class="col-md-12"]/h1/text()').extract()[0]
        description = response.xpath('//div[@class="offer-details__description"]/text()').extract()[0].strip()
        if "href" in description:
            print("!FOUND")

        image_urls = response.xpath('//a[@class="fancybox"]/@href').extract()
        images = ",".join(image_urls) if image_urls else None

        record = {
            'parking': parking,
            'date': date,
            'floor_plan': floor_plan,
            'timestamp': timestamp,
            'floor_tiles': floor_tiles,
            'description': description,
            'time': time,
            'area': area,
            'publisher': publisher,
            'title': title,
            'currency': currency,
            'year': year,
            'price': price,
            'contact_name': contact_name,
            'insulated_glass': insulated_glass,
            'bathrooms': bathrooms,
            'parquet': parquet,
            'contact_url': contact_url,
            'images': images,
            'views': views,
            'quarter': quarter,
            'street': street,
            'city': city,
            'url': page_url,
            'wall_tiles': wall_tiles,
            'rooms': rooms,
            'central_heating': central_heating,
        }
        # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
        # concatenating a one-row frame is the supported equivalent.
        df = pd.concat([df, pd.DataFrame([record])], ignore_index=True)
        

In [5]:
# Configure Scrapy's logging before the crawl is started.
configure_logging()
# Schedule the PiataAZ spider on a CrawlerRunner.
runner = CrawlerRunner()
runner.crawl(PiataAZ)
# runner.crawl(MySpider2)
# Whatever way the joined result `d` resolves (success or failure), stop the
# reactor so that reactor.run() below returns.
d = runner.join()
d.addBoth(lambda _: reactor.stop())

# Start the Twisted reactor; the scheduled crawl runs inside this call.
# NOTE(review): a Twisted reactor cannot be started a second time in the same
# process, so re-running this cell most likely requires a kernel restart.
reactor.run()

2020-04-03 16:23:14 [scrapy.crawler] INFO: Overridden settings:
{}
2020-04-03 16:23:14 [scrapy.extensions.telnet] INFO: Telnet Password: 8b768dbfc6d7eafa
2020-04-03 16:23:14 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.logstats.LogStats']
2020-04-03 16:23:14 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
 'scrap

In [6]:
# Preview the scraped advertisements; head() keeps the cell output small once
# many rows have been collected instead of dumping the whole frame.
df.head()

Unnamed: 0,parking,date,floor_plan,timestamp,floor_tiles,description,time,area,publisher,title,...,contact_url,images,views,quarter,street,city,url,wall_tiles,rooms,central_heating
