Housing prices prediction - Barcelona
==================

### Downloading data from website

Download listing data from the idealista website and store it in a MongoDB database.

In [157]:
from selenium import webdriver

def newBrowser():
    """Start a new Firefox WebDriver session and return it."""
    driver = webdriver.Firefox()
    return driver
         


In [135]:
from lxml import html
from lxml.cssselect import CSSSelector
import time

def loadPage(page):
    """Navigate the global `browser` to `page` and return the resulting page source.

    Relies on the module-level `browser` having been created beforehand.
    """
    browser.get(page)
    source = browser.page_source
    return source

def loadPageTree(page_source):
    """Parse an HTML string into an lxml document tree."""
    return html.document_fromstring(page_source)

def firstHouse(tree):
    """Return the href of the first `.item-link` element in `tree`, or None if there is none."""
    links = tree.cssselect('.item-link')
    return links[0].get("href") if links else None

def getListOfHouses(tree):
    """Collect the href of every `.item-link` element in `tree`, in document order."""
    hrefs = []
    for link in tree.cssselect('.item-link'):
        hrefs.append(link.get("href"))
    return hrefs
    
def isLastPage(tree):
    """Return True when `tree` contains no `.icon-arrow-right-after` element."""
    next_arrows = tree.cssselect(".icon-arrow-right-after")
    return next_arrows == []
    


In [99]:
def getHouseData(tree):
    """Extract the fields of one house listing from its parsed detail page.

    Every lookup is made against the `tree` argument, so the result never
    depends on a tree left over in the kernel's global namespace.

    Parameters
    ----------
    tree : object exposing `cssselect(selector)`; the matched elements must
        expose `cssselect`, `text_content()` and `get(attr)`.

    Returns
    -------
    dict with the keys:
        "price"            text of the first `.info-data-price` element, or "".
        "features"         texts of the even-positioned <span> elements inside
                           the first `.info-features` element, or [].
        "description"      text of the first `.adCommentsLanguage` element, or "".
        "details_house"    <li> texts of the first `.details-property_features` block.
        "details_building" <li> texts of all remaining such blocks.
        "energy_class"     `title` attribute of the first <span> in the first
                           details block, or "".
        "address"          texts of the first three <li> elements under `#headerMap`.
    """
    data = {}

    price = tree.cssselect(".info-data-price")
    data["price"] = price[0].text_content() if price else ""

    features = tree.cssselect(".info-features")
    if features:
        # Only spans at even positions are kept; presumably the odd ones
        # repeat part of the preceding span -- TODO confirm against the page.
        data["features"] = [
            span.text_content()
            for idx, span in enumerate(features[0].cssselect("span"))
            if idx % 2 == 0
        ]
    else:
        data["features"] = []

    descr = tree.cssselect(".adCommentsLanguage")
    data["description"] = descr[0].text_content() if descr else ""

    details_house = []
    details_building = []
    energy_class = ""
    details = tree.cssselect(".details-property_features")
    if details:
        # The first block is stored as the house details, every further block
        # is flattened into the building details.
        details_house = [it.text_content() for it in details[0].cssselect("li")]
        details_building = [
            it.text_content()
            for block in details[1:]
            for it in block.cssselect("li")
        ]
        energy = details[0].cssselect("span")
        energy_class = energy[0].get("title") if energy else ""
    data["details_house"] = details_house
    data["details_building"] = details_building
    data["energy_class"] = energy_class

    address = tree.cssselect("#headerMap")
    data["address"] = (
        [it.text_content() for it in address[0].cssselect("li")][0:3] if address else []
    )

    return data


In [185]:
import pymongo

# Connection to MongoDB.
# MongoClient() connects lazily, so constructing it does not prove that a
# server is reachable; the "ping" command forces a round trip so that
# "connected" is only printed when the server actually answered.
try:
    conn = pymongo.MongoClient()
    conn.admin.command("ping")
    print("connected")
except pymongo.errors.ConnectionFailure as e:
    print ("Could not connect to MongoDB: %s" % e )
    # Re-raise: continuing would only fail below with a less helpful error
    # (an unbound `conn`, or a timeout on the first database operation).
    raise

# Database and collection that saveToMongo() writes to.
db = conn["idealista"]
collection = db["houses2"]



connected


In [149]:
def saveToMongo(houseUrl,data):
    """Upsert `data` into the global `collection`, keyed by the house id.

    The id is the third "/"-separated piece of `houseUrl`
    (e.g. "/inmueble/82612668/" -> "82612668"). It is also written into
    `data["id"]`, so the caller's dict is modified.
    """
    pieces = houseUrl.split("/")
    house_id = pieces[2]
    data["id"] = house_id
    selector = { "id" : house_id }
    collection.replace_one(selector, data, upsert = True)

In [205]:
# Create the global WebDriver session that loadPage() and mainLoop() read as `browser`.
browser = newBrowser()

In [161]:
# Point the browser at the idealista listing index for Barcelona.
browser.get("https://www.idealista.com/venta-viviendas/barcelona-barcelona/")

In [202]:
import random

# Global accumulator: mainLoop2() appends the listing hrefs it finds to this list.
listHouses = []

def getNextHouseFromSource(page):
    """Find the href of the "next" navigation button in raw page source.

    Plain-text fallback for when the parsed tree does not yield the link.

    Parameters
    ----------
    page : str
        Raw HTML source of the page.

    Returns
    -------
    str or None
        The href value without its surrounding quotes, or None when the
        button markup is absent or the attribute is not terminated.
    """
    text_next = 'class="btn nav next icon-arrow-right-after" href="'
    pstart = page.find(text_next)
    if pstart == -1:
        return None
    pstart += len(text_next)
    # The slice must stop AT the closing quote; stopping one character later
    # would return the href with a trailing '"' and produce a broken URL.
    pend = page.find('"', pstart)
    if pend == -1:
        # Truncated markup: there is no complete href to return.
        return None
    return page[pstart:pend]

def getNextHouse(page,tree):
    """Return the href of the "next" arrow link.

    The parsed `tree` is consulted first. If it has no
    `.icon-arrow-right-after` element, or that element has no href, the raw
    `page` source is searched via getNextHouseFromSource().
    """
    arrows = tree.cssselect(".icon-arrow-right-after")
    if arrows:
        href = arrows[0].get("href")
        if href is not None:
            return href
    return getNextHouseFromSource(page)

def pageFormat(start,end,page):
    """Build the listing URL for the price band [start, end] and a result page.

    Page 1 has no page suffix; later pages get "pagina-<page>.htm" appended.
    """
    if page>1:
        suffix = "pagina-{}.htm".format(page)
    else:
        suffix = ""
    base = "https://www.idealista.com/venta-viviendas/barcelona-barcelona/"
    return base + "con-precio-hasta_{},precio-desde_{}/{}".format(end, start, suffix)

def mainLoop2(startPrice=50000,incPrice=50000,finalPrice=3000000,startPage=1):
    """Walk the listing result pages price band by price band.

    Starting at `startPrice`, each band spans `incPrice`. Every result page
    of a band is loaded, its house hrefs are printed and appended to the
    global `listHouses`, and the loop sleeps a random 11-30 s between
    requests. When a page is the last of its band the next band starts at
    page 1. `startPage` only applies to the first band. The loop ends once
    the band's lower price reaches `finalPrice`.
    """
    global listHouses
    lowPrice, currentPage = startPrice, startPage
    while lowPrice<finalPrice:
        url = pageFormat(lowPrice, lowPrice+incPrice, currentPage)
        print(url)
        tree = loadPageTree(loadPage(url))
        hrefs = getListOfHouses(tree)
        print(hrefs)
        listHouses.extend(hrefs)
        # Randomised pause between consecutive page requests.
        time.sleep(random.randint(11,20)+10*random.random())
        if not isLastPage(tree):
            currentPage += 1
            continue
        # Band exhausted: move on to the next price band, back on page 1.
        lowPrice += incPrice
        currentPage = 1
    print("done")
        
    
    
# Not a good solution: the browser must be kept open if you want to resume.
# If the browser closes you have to start over, because no list of houses is kept.
def mainLoop(from_start=True):
    """Scrape house pages one after another, following each page's "next" link.

    With `from_start` true, the listing index is loaded and the first house
    link on it is the starting point. Otherwise the page currently shown in
    the global `browser` is used to find the next house. Each visited house
    is parsed with getHouseData() and stored with saveToMongo(), followed by
    a random 43-69 s pause.
    """
    if from_start:
        source = loadPage("https://www.idealista.com/venta-viviendas/barcelona-barcelona/")
        tree = loadPageTree(source)
        time.sleep(3)
        next_house = firstHouse(tree)
    else:
        # Resume from whatever page the browser is currently displaying.
        source = browser.page_source
        tree = loadPageTree(source)
        next_house = getNextHouse(source, tree)

    while next_house:
        print(next_house,end=" ")
        source = loadPage("https://www.idealista.com"+next_house)
        tree = loadPageTree(source)
        saveToMongo(next_house, getHouseData(tree))
        # Randomised pause between consecutive house requests.
        time.sleep(random.randint(43,59)+10*random.random())
        next_house = getNextHouse(source, tree)
    print("done")
    
    

 
    

In [204]:
# Display the current WebDriver object (shows its session id).
browser

<selenium.webdriver.firefox.webdriver.WebDriver (session="f39f712f-70ba-4cf5-bf7d-55eacc0d6e0a")>

In [203]:
# Crawl the price-banded result pages with the default arguments; hrefs accumulate in listHouses.
mainLoop2()

https://www.idealista.com/venta-viviendas/barcelona-barcelona/con-precio-hasta_100000,precio-desde_50000/
['/inmueble/82612668/', '/inmueble/87706449/', '/inmueble/87756990/', '/inmueble/87695282/', '/inmueble/87519763/', '/inmueble/87774604/', '/inmueble/87850465/', '/inmueble/84664399/', '/inmueble/86935109/', '/inmueble/87515466/', '/inmueble/85764466/', '/inmueble/87903080/', '/inmueble/37124609/', '/inmueble/85711258/', '/inmueble/87957922/', '/inmueble/87934541/', '/inmueble/87955509/', '/inmueble/87744198/', '/inmueble/87939083/', '/inmueble/87802356/', '/inmueble/81902652/', '/inmueble/87538964/', '/inmueble/86568857/', '/inmueble/82840221/', '/inmueble/87393330/', '/inmueble/87918338/', '/inmueble/87598310/', '/inmueble/87518241/', '/inmueble/83272755/', '/inmueble/87933063/']
https://www.idealista.com/venta-viviendas/barcelona-barcelona/con-precio-hasta_100000,precio-desde_50000/pagina-2.html
[]
https://www.idealista.com/venta-viviendas/barcelona-barcelona/con-precio-hasta_15

NoSuchWindowException: Message: Browsing context has been discarded


In [189]:
# Fetch a single stored document to inspect what was saved.
collection.find_one()

{'_id': ObjectId('5de777e67aaa0f4e190bfc21'),
 'price': '349.000 €',
 'features': [' 91 m² ', ' 5 hab. ', ' 4ª planta con ascensor '],
 'description': ' "Espacioso piso a reformar situado en un edificio de 1930 en el acogedor barrio de la Nova Esquerra de l’Eixample. Se accede al edificio por una amplia entrada con ascensor. La escalera tiene ventana al exterior en cada rellano, por lo cual es también luminosa y aireada. Una vez en la vivienda nos damos cuenta de la luz que tiene, ya que se trata de un piso pasante y todas las estancias tienen luz natural, ya sea de la calle, del patio de manzana o del patio de luces descubierto. Actualmente dispone de una amplia entrada, 5 habitaciones (1 de ellas ideal para despacho o vestidor), 1 baño, cocina independiente, lavadero y salón-comedor. Las posibilidades de reforma son diferentes e interesantes según los espacios que se deseen crear por ser las paredes mayoritariamente tabiques.La propiedad está situada en La Nova Esquerra de l’Eixample

In [182]:
# Scratch check: href of the first "next" arrow in the global tree `t`.
# NOTE(review): no cell shown here defines a global `t`, so this fails on a fresh kernel.
print(t.cssselect(".icon-arrow-right-after")[0].get("href"))

/inmueble/87916907/


In [178]:
# Scratch: keep the source of the page currently loaded in the browser in the global `p`.
p = browser.page_source


'/inmueble/87916907/"'