In [1]:
import scrapy
import pandas as pd

from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

from w3lib.html import remove_tags

In [2]:
# Empty results table: one row per scraped announcement, filled in by the spider.
df = pd.DataFrame(
    columns=[
        "parking", "date", "floor_plan", "timestamp", "floor_tiles",
        "description", "time", "area", "publisher", "title",
        "currency", "year", "price", "contact_name", "insulated_glass",
        "bathrooms", "parquet", "contact_url", "images", "views",
        "quarter", "street", "city", "url", "wall_tiles",
        "rooms", "central_heating",
    ]
)

In [3]:
# Site root (scheme + host) of the classifieds portal being scraped.
url = 'https://www.piata-az.ro'

In [4]:
class PiataAZ(scrapy.Spider):
    """Spider collecting real-estate announcements for Cluj-Napoca from piata-az.ro.

    ``parse`` handles the listing page and schedules one request per
    announcement link; ``continue_parse`` extracts the fields of a single
    announcement page and appends them as one row to the notebook-level ``df``.
    """

    name = "piata"
    allowed_domains = ["piata-az.ro"]
    start_urls = ['https://www.piata-az.ro/imobiliare/cluj-napoca']

    def parse(self, response):
        """Schedule a ``continue_parse`` request for the announcements on a listing page.

        Only site-relative links (``href`` starting with ``/``) are followed;
        they are resolved against the URL of ``response``.

        Yields:
            scrapy.Request: one request per retained announcement link.
        """
        self.logger.info('1. A response from %s has just arrived!', response.url)

        hrefs = response.xpath('//div[@class="announcement-wrapper announcement-space"]/div//a/@href').extract()
        # startswith() rather than href[0]: an empty href is skipped instead of raising IndexError.
        continue_urls = [href for href in hrefs if href.startswith("/")]
        # Drops the links at even positions (0, 2, 4, ...). The original author notes that
        # every second link is redundant -- presumably each announcement is linked twice
        # in the listing markup; verify if the page layout changes.
        del continue_urls[::2]

        for continue_url in continue_urls:
            # urljoin() turns the site-relative path into an absolute URL, so no
            # notebook-level base-URL variable is needed here.
            yield scrapy.Request(response.urljoin(continue_url), self.continue_parse)

    def continue_parse(self, response):
        """Extract one announcement page and append it as a row to the global ``df``.

        Entries of the details table, the city/quarter, the contact data and the
        image list are optional and stored as ``None`` when absent. Price,
        currency, date/time, view count, title and description are required.

        Raises:
            IndexError: if one of the required elements is missing from the page;
                in that case no row is added to ``df``.
        """
        global df
        self.logger.info('2. A response from %s has just arrived!', response.url)
        url = response.url

        html_info_table = response.xpath('//div[@class="section-annoucement-details"]/div/ul/li/div').extract()

        # The extracted cells alternate label, value, label, value, ...
        cells = [remove_tags(cell) for cell in html_info_table]
        info_table = dict(zip(cells[0::2], cells[1::2]))

        parking = info_table.get('parcare')
        floor_tiles = info_table.get('gresie')
        area = info_table.get('suprafata')
        publisher = info_table.get('Pers. fizica sau agentie')
        bathrooms = info_table.get('bai')
        parquet = info_table.get('parchet')
        wall_tiles = info_table.get('faianta')
        rooms = info_table.get('camere')
        central_heating = info_table.get('centrala termica')
        insulated_glass = info_table.get('termopan')
        floor_plan = info_table.get('compartimentare')
        year = info_table.get('an constructie')
        # A street given as "-" is treated the same as a missing street.
        street = info_table.get('strada')
        if street == "-":
            street = None

        price = remove_tags(response.xpath('//div[@class="sidebar--details__top__price"]/strong').extract()[0]).strip()
        currency = remove_tags(response.xpath('//div[@class="sidebar--details__top__price"]/b').extract()[0])

        city_neighborhood_details = response.xpath('//div[@class="sidebar--details__bottom"]/ul/li/a/text()').extract()
        city = city_neighborhood_details[0] if len(city_neighborhood_details) >= 1 else None
        # The quarter is only taken when exactly two entries (city, quarter) are present.
        quarter = city_neighborhood_details[1] if len(city_neighborhood_details) == 2 else None

        contact_links = response.xpath('//div[@class="sidebar sidebar--contact"]//a/@href').extract()
        contact_url = contact_links[0] if contact_links else None
        contact_names = response.xpath('//div[@class="name-wrapper pull-left"]/b/text()').extract()
        contact_name = contact_names[0] if contact_names else None

        date_time = response.xpath('//div[@class="announcement-detail__date-time pull-right"]/span/text()').extract()
        date = date_time[0]
        time = date_time[1]
        # "dd.mm.yyyy" + "HH:MM" -> "yyyy-mm-dd HH:MM:00"
        timestamp = ("-".join(date.split(".")[::-1])) + " " + time + ":00"

        views = response.xpath('//div[@class="announcement-detail clearfix"]/ul/li[@class="announcement-detail__list__views"]/text()')\
                .extract()[0].strip().split(" ")[0]

        title = response.xpath('//div[@class="col-md-12"]/h1/text()').extract()[0]
        description = response.xpath('//div[@class="offer-details__description"]/text()').extract()[0].strip()
        description_href = response.xpath('//div[@class="offer-details__description"]//a/@href').extract()
        # Prefer the first link in the description when it is longer than the sidebar
        # link. The sidebar link may be missing (None): then the description link is
        # the only candidate, and len(None) must not be evaluated.
        if description_href and (contact_url is None or len(description_href[0]) > len(contact_url)):
            contact_url = description_href[0]

        image_urls = response.xpath('//a[@class="fancybox"]/@href').extract()
        images = ",".join(image_urls) if image_urls else None

        record = {
            'parking': parking,
            'date': date,
            'floor_plan': floor_plan,
            'timestamp': timestamp,
            'floor_tiles': floor_tiles,
            'description': description,
            'time': time,
            'area': area,
            'publisher': publisher,
            'title': title,
            'currency': currency,
            'year': year,
            'price': price,
            'contact_name': contact_name,
            'insulated_glass': insulated_glass,
            'bathrooms': bathrooms,
            'parquet': parquet,
            'contact_url': contact_url,
            'images': images,
            'views': views,
            'quarter': quarter,
            'street': street,
            'city': city,
            'url': url,
            'wall_tiles': wall_tiles,
            'rooms': rooms,
            'central_heating': central_heating,
        }
        # DataFrame.append was removed in pandas 2.0; pd.concat with a one-row frame
        # is the supported way to add the record and keeps a fresh 0..n-1 index.
        df = pd.concat([df, pd.DataFrame([record])], ignore_index=True)
        

In [5]:
# Set up Scrapy's logging so that crawl progress is reported in the cell output.
configure_logging()
# CrawlerRunner only schedules crawls; starting and stopping the Twisted reactor is done below.
runner = CrawlerRunner()
runner.crawl(PiataAZ)
# Further spiders could be queued here with additional runner.crawl(...) calls.
# join() returns a Deferred that fires once every scheduled crawl has finished.
d = runner.join()
# Stop the reactor on success or failure, so that reactor.run() below returns.
d.addBoth(lambda _: reactor.stop())

# Blocks until reactor.stop() is called by the callback above.
# NOTE(review): a Twisted reactor cannot be restarted, so re-running this cell in the
# same kernel presumably fails with ReactorNotRestartable -- restart the kernel first.
reactor.run()

2020-04-06 10:49:09 [scrapy.crawler] INFO: Overridden settings:
{}
2020-04-06 10:49:09 [scrapy.extensions.telnet] INFO: Telnet Password: c1d598316badc01f
2020-04-06 10:49:09 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.logstats.LogStats']
2020-04-06 10:49:09 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
 'scrap

2020-04-06 10:49:14 [piata] INFO: 2. A response from https://www.piata-az.ro/apartament-inchiriat-2-camere-cluj-napoca-floresti-628088 has just arrived!
2020-04-06 10:49:15 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.piata-az.ro/apartament-vanzare-2-camere-cluj-napoca-centru-630276> (referer: https://www.piata-az.ro/imobiliare/cluj-napoca)
2020-04-06 10:49:15 [piata] INFO: 2. A response from https://www.piata-az.ro/apartament-vanzare-2-camere-cluj-napoca-centru-630276 has just arrived!
2020-04-06 10:49:15 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.piata-az.ro/spatiu-comercial-hala-vanzare-cluj-napoca-centru-631974> (referer: https://www.piata-az.ro/imobiliare/cluj-napoca)
2020-04-06 10:49:15 [piata] INFO: 2. A response from https://www.piata-az.ro/spatiu-comercial-hala-vanzare-cluj-napoca-centru-631974 has just arrived!
2020-04-06 10:49:16 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.piata-az.ro/teren-de-vanzare-cluj-napoca-sopor-583755> (re

In [6]:
# Preview the first ten scraped announcements.
df.head(10)

Unnamed: 0,parking,date,floor_plan,timestamp,floor_tiles,description,time,area,publisher,title,...,contact_url,images,views,quarter,street,city,url,wall_tiles,rooms,central_heating
0,nu,31.03.2020,Semidecomandat,2020-03-31 11:35:00,da,Capital Imobiliare propune spre închiriere un ...,11:35,90 mp,Agentie,Închiriere apartament cu 4 camere semidecomand...,...,http://capitalimobiliare.ro,/uploads/announces/2019/11/608605/6407_big__5d...,239,Europa,,Cluj-Napoca,https://www.piata-az.ro/apartament-inchiriat-4...,da,4 sau mai multe,da
1,da,06.04.2020,Decomandat,2020-04-06 09:53:00,da,"APARTAMENT DECOMANDAT,SPATIOS,SE VINDE UTILAT ...",09:53,55 mp,Persoana fizica,VAND APARTAMENT 2 CAMERE,...,,/uploads/announces/2020/03/630305/3547_IMG-201...,70,Iris,B-DUL MUNCII,Cluj-Napoca,https://www.piata-az.ro/apartament-vanzare-2-c...,da,2,da
2,da,06.04.2020,Decomandat,2020-04-06 09:53:00,da,"APARTAMENT DECOMANDAT,SPATIOS,SE VINDE UTILAT ...",09:53,55 mp,Persoana fizica,VAND APARTAMENT 2 CAMERE DECOMANDATE CU 2 BALC...,...,,/uploads/announces/2020/03/630329/3839_IMG-201...,75,Iris,B-DUL MUNCII,Cluj-Napoca,https://www.piata-az.ro/apartament-vanzare-2-c...,da,2,da
3,da,06.04.2020,Decomandat,2020-04-06 09:54:00,da,"APARTAMENT DECOMANDAT,SPATIOS,SE VINDE UTILAT ...",09:54,55 mp,Persoana fizica,VAND APARTAMENT UTILAT MODERN LA PRET AVANTAJOS,...,,/uploads/announces/2020/03/630303/5210_IMG-201...,72,Iris,B-DUL MUNCII,Cluj-Napoca,https://www.piata-az.ro/apartament-vanzare-2-c...,da,2,da
4,nu,06.04.2020,,2020-04-06 09:55:00,,"2 CAMERE CU ETAJ ,BALCON,FOISOARE,TEWRASE,5600...",09:55,80 mp,Persoana fizica,VAND CASA LA CAIANU MIC TIP CABANA LA CAIANU ...,...,,/uploads/announces/2019/09/572859/7241_U4mbRie...,111,,,Cluj-Napoca,https://www.piata-az.ro/cabana-cazare-turism-c...,,,
5,nu,06.04.2020,Decomandat,2020-04-06 09:54:00,nu,"APARTAMENT DECOMANDAT,SPATIOS,SE VINDE UTILAT ...",09:54,55 mp,Persoana fizica,"VAND AP 2 CAMERE DECOMANDAT,2 BALCOANE ORIENTA...",...,,/uploads/announces/2020/03/630334/9302_IMG-201...,78,Iris,B-DUL MUNCII,Cluj-Napoca,https://www.piata-az.ro/apartament-vanzare-2-c...,nu,2,nu
6,da,06.04.2020,Decomandat,2020-04-06 09:54:00,da,"APARTAMENT DECOMANDAT,SPATIOS,SE VINDE UTILAT ...",09:54,55 mp,Persoana fizica,VAND AP 2 CAM SPATIOASE DECOMANDAT CU 2 BALCOA...,...,,/uploads/announces/2020/03/630332/2935_IMG-201...,72,Iris,B-DUL MUNCII,Cluj-Napoca,https://www.piata-az.ro/apartament-vanzare-2-c...,da,2,da
7,nu,06.03.2020,,2020-03-06 14:13:00,nu,"Vand apartament 18, str. Ploiesti nr. 9, et. 1...",14:13,51 mp,Persoana fizica,"Vand apartament 18, str. Ploiesti nr. 9, et. 1...",...,,,10962,Centru,Ploiesti nr. 9,Cluj-Napoca,https://www.piata-az.ro/apartament-vanzare-2-c...,nu,2,nu
8,nu,11.03.2020,Decomandat,2020-03-11 15:32:00,da,"De vanzare apartament cu 2 camere, cu o supraf...",15:32,50 mp,Agentie,Apartament 2 camere de vanzare in Cluj Napoca...,...,http://www.axaimobiliarecluj.ro/vanzare-aparta...,"/uploads/announces/2020/03/629257/551473.jpg,/...",422,Manastur,,Cluj-Napoca,https://www.piata-az.ro/apartament-vanzare-2-c...,da,2,da
9,,06.04.2020,,2020-04-06 09:55:00,,"CABANA CU ETAJ,BALCON,2 CAMERE,CURENT APA SI 5...",09:55,,,VAND CASA LA CAIANU MIC TIP CABANA LA CAIANU ...,...,,/uploads/announces/2019/09/572841/5621_TgNkE4y...,201,,,Cluj-Napoca,https://www.piata-az.ro/casa-la-tara-de-vanzar...,,,
