In [None]:
from lxml import html
from lxml.cssselect import CSSSelector
import requests
from collections import namedtuple
import time
import random
import logging
from scrapers.scaper import *
import json

In [None]:
# NOTE(review): not referenced by any cell visible in this notebook —
# presumably intended as a cache of fetched pages; confirm before removing.
pages_cache = dict()

# One shared format for both handlers: timestamp, logger name, level, message.
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger('subito')
logger.setLevel(logging.DEBUG)

# logging.getLogger returns the same logger object for a given name, so
# re-running this cell would otherwise attach another pair of handlers and
# every message would be emitted several times. Detach and close whatever
# is already attached before adding fresh handlers.
for _stale_handler in list(logger.handlers):
    logger.removeHandler(_stale_handler)
    _stale_handler.close()

# Console output: INFO and above.
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)

# File output: everything from DEBUG up, written to subito.log.
fh = logging.FileHandler('subito.log')
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)

logger.addHandler(ch)
logger.addHandler(fh)

In [None]:
# Listing-page URL template; the '{}' placeholder receives the page number.
base_url = (
    'http://www.subito.it/annunci-lombardia/vendita/appartamenti/milano/milano/'
    '?sqs=4&o={}'
)
# Highest page number for which a listing URL is generated.
max_counter = 100

In [None]:
def list_page_url_generator(base_url, max_counter):
    """Yield the listing-page URLs for pages 1 through ``max_counter``.

    ``base_url`` is a ``str.format`` template with a single positional
    placeholder; each yielded URL is that template filled with the page
    number (as a string). Nothing is yielded when ``max_counter`` is below 1.
    """
    for page_number in range(1, max_counter + 1):
        yield base_url.format(str(page_number))

In [None]:
def extract_record(elem):
    """Extract the listing fields from one ``<article>`` element.

    Args:
        elem: an element exposing ``xpath`` and ``cssselect`` (an lxml
            element in this notebook).

    Returns:
        dict of lists, one list per field: ``ids``, ``titles`` and ``urls``
        come from the ``name``/``title``/``href`` attributes of the title
        links at ``div/div[2]/h2/a``; ``prices``, ``categories`` and
        ``specs`` are the stripped text of the matching ``span.item_*``
        elements; ``datetime`` is the ``datetime`` attribute of each
        ``<time>``; ``location`` is the stripped full text content of each
        ``span.item_location``.

    Raises:
        ValueError: when the element has no title link; the caller uses
            this to drop such elements (the message labels them as
            advertising).
    """
    links = elem.xpath('div/div[2]/h2/a')
    if not links:
        raise ValueError("advertising element")

    def _stripped_texts(selector):
        # ``.text`` is None when an element has no direct text; fall back to
        # '' so a single empty span cannot abort the whole record with an
        # AttributeError (the caller only handles ValueError).
        return [(e.text or '').strip() for e in elem.cssselect(selector)]

    return {
        'ids': [link.attrib['name'] for link in links],
        'titles': [link.attrib['title'] for link in links],
        'urls': [link.attrib['href'] for link in links],
        'prices': _stripped_texts('span.item_price'),
        'categories': _stripped_texts('span.item_category'),
        # Previously computed and then discarded; now part of the record.
        'specs': _stripped_texts('span.item_specs'),
        'datetime': [e.attrib['datetime'] for e in elem.cssselect('time')],
        'location': [e.text_content().strip()
                     for e in elem.cssselect('span.item_location')],
    }

In [None]:
def extract_records(url):
    """Parse the listing page at ``url`` and return one record per article.

    Every ``<article>`` element of the parsed page is passed to
    ``extract_record``; elements it rejects with ``ValueError`` are logged
    at DEBUG level and left out of the returned list.
    """
    collected = []
    for article in parse(logger, url).xpath('//article'):
        try:
            record = extract_record(article)
        except ValueError:
            logger.debug('removed record with content %s', article.text_content())
        else:
            collected.append(record)
    return collected

In [None]:
def extract_detail_page(url):
    """Parse an ad's detail page and return its details table as a dict.

    The page is loaded with ``parse(logger, url)``. Each ``<tr>`` found under
    ``//*[@id="ad_details"]/div[1]/table`` contributes one entry: the key is
    the text of the row's first child and the value is the stripped text of
    its second child.

    Rows with fewer than two children are skipped, and a value cell without
    direct text yields ``''``, so an irregular row cannot abort the scrape.

    Returns:
        ``{'details': {label: value, ...}}``
    """
    tree = parse(logger, url)

    map_details = {}
    for row in tree.xpath('//*[@id="ad_details"]/div[1]/table/tr'):
        cells = list(row)
        if len(cells) < 2:
            # Not a label/value pair; nothing to record.
            continue
        label_cell, value_cell = cells[0], cells[1]
        # ``.text`` is None when the cell has no direct text.
        map_details[label_cell.text] = (value_cell.text or '').strip()

    logger.info('extracted details for page %s', url)
    return {
        'details': map_details
    }

In [None]:
def extract(max_pages=1):
    """Scrape listing pages and merge every listing with its detail page.

    Listing URLs come from ``list_page_url_generator(base_url, max_counter)``
    using the module-level ``base_url`` and ``max_counter``. For each record
    on a page, every URL in ``record['urls']`` is fetched with
    ``extract_detail_page`` and merged into a copy of the record.

    Args:
        max_pages: maximum number of listing pages to process. The default
            of 1 keeps the previous behaviour, where the loop always stopped
            after the first page. Pass ``None`` to continue until
            ``max_counter`` pages have been read or a page yields no records.

    Returns:
        list of dicts, one per (record, detail URL) pair, each holding the
        record's fields plus the ``details`` entry of its detail page.
    """
    results = []
    page_urls = list_page_url_generator(base_url, max_counter)
    for page_number, url in enumerate(page_urls, start=1):
        if max_pages is not None and page_number > max_pages:
            break
        logger.info('extracing records from url %s', url)
        records = extract_records(url)
        if len(records) == 0:
            # An empty page is taken as the end of the listing.
            break
        logger.info('extracting details for %d records', len(records))
        for r in records:
            for u in r['urls']:
                # Detail-page keys are layered on top of the listing record.
                results.append(dict(r, **extract_detail_page(u)))
    return results

In [None]:
# Run the scrape and keep the merged records in `results`.
results = extract()

In [None]:
# Dump the second record (index 1) as JSON to inspect its structure.
print(json.dumps(results[1]))

In [None]:
# Number of records collected.
len(results)

## Test

In [None]:
generator = list_page_url_generator(base_url,max_counter)

In [None]:
url = next(generator)

In [None]:
url

In [None]:
records = extract_records(url)

In [None]:
records

In [None]:
page_tree = parse(logger,url)

In [None]:
elements = page_tree.xpath('//article')

In [None]:
elem = elements[0]

In [None]:
a = elem.xpath('div/div[2]/h2/a')

In [None]:
url = 'http://www.subito.it/appartamenti/mm-passante-ferroviario-bovisa-milano-164145080.htm'

In [None]:
page = parse(logger, url)

In [None]:
descriptions = [e.text for e in page.cssselect('div.description')]

In [None]:
descriptions

In [None]:
rows = tree.xpath('//*[@id="ad_details"]/div[1]/table/tr')

In [None]:
r = rows[0]

In [None]:
r.getchildren()[0].text

In [None]:
r.getchildren()[1].text