Housing prices prediction - Barcelona
==================

### Downloading data from website

Download listing data from idealista and store it in a MongoDB database.

In [1]:
from selenium import webdriver

def newBrowser():
    """Start a new Firefox session through Selenium and return its driver."""
    driver = webdriver.Firefox()
    return driver
         
# Module-level browser handle; it is None until a driver is assigned to it.
browser=None

In [2]:
from lxml import html
from lxml.cssselect import CSSSelector
import time
import random

#Auxiliary functions to extract information from the browser

def loadPage(page,browser=None):
    """Navigate a browser to `page` and return the resulting page source.

    Parameters:
        page: URL to open.
        browser: driver object exposing `get(url)` and `page_source`. When
            omitted, the module-level `browser` is looked up at call time.

    Returns:
        The value of `browser.page_source` after the navigation.

    Raises:
        RuntimeError: if no browser was passed and the module-level `browser`
            is missing or None.
    """
    if browser is None:
        # Look the shared driver up now rather than in the signature: a default
        # of `browser=browser` is evaluated once, when the function is defined,
        # and would freeze whatever value (initially None) the global had then.
        browser = globals().get("browser")
    if browser is None:
        raise RuntimeError(
            "No browser available: pass one explicitly or assign the global `browser` first")
    browser.get(page)
    return browser.page_source

def loadPageTree(page_source):
    """Parse an HTML source string with lxml and return the resulting document tree."""
    return html.document_fromstring(page_source)

def firstHouse(tree):
    """Return the href of the first '.item-link' element in `tree`, or None if there is none."""
    links = tree.cssselect('.item-link')
    if not links:
        return None
    return links[0].get("href")

def getListOfHouses(tree):
    """Return the href of every '.item-link' element in `tree`, in document order."""
    hrefs = []
    for link in tree.cssselect('.item-link'):
        hrefs.append(link.get("href"))
    return hrefs
    
def isLastPage(tree):
    """Return True when `tree` has no '.icon-arrow-right-after' element (no "next" arrow)."""
    next_arrows = tree.cssselect(".icon-arrow-right-after")
    return next_arrows == []
    


In [3]:
# Get the details of the house from the page

def getHouseData(tree):
    """Collect the fields of a house page into a dict.

    `tree` is a parsed page supporting `cssselect`. The returned dict has the
    keys "price", "features", "description", "details_house",
    "details_building", "energy_class" and "address". Fields whose element is
    absent from the page are left as an empty string or an empty list.
    """
    def first_text(selector):
        # Text of the first element matching `selector`, or "" when none match.
        found = tree.cssselect(selector)
        return found[0].text_content() if found else ""

    data = {"price": first_text(".info-data-price")}

    # Only every other <span> of the features box is kept (positions 0, 2, 4...).
    feature_boxes = tree.cssselect(".info-features")
    if feature_boxes:
        spans = feature_boxes[0].cssselect("span")
        data["features"] = [span.text_content() for span in spans[::2]]
    else:
        data["features"] = []

    data["description"] = first_text(".adCommentsLanguage")

    details_house = []
    details_building = []
    energy_class = ""
    sections = tree.cssselect(".details-property_features")
    if sections:
        main_section = sections[0]
        # First section feeds details_house; all later sections feed details_building.
        details_house = [item.text_content() for item in main_section.cssselect("li")]
        for section in sections[1:]:
            details_building.extend(item.text_content() for item in section.cssselect("li"))
        # The energy class is read from the title attribute of the first <span>.
        energy_spans = main_section.cssselect("span")
        if energy_spans:
            energy_class = energy_spans[0].get("title")
    data["details_house"] = details_house
    data["details_building"] = details_building
    data["energy_class"] = energy_class

    # At most the first three <li> entries of the map header form the address.
    header = tree.cssselect("#headerMap")
    if header:
        data["address"] = [item.text_content() for item in header[0].cssselect("li")][:3]
    else:
        data["address"] = []

    return data


In [4]:
import pymongo

# Connection to Mongo DB
# MongoClient() connects lazily and does not fail in the constructor when the
# server is down, so issue a cheap "ping" to verify the connection before
# reporting success.
try:
    conn=pymongo.MongoClient()
    conn.admin.command("ping")
    print("connected")
except pymongo.errors.ConnectionFailure as e:
    print ("Could not connect to MongoDB: %s" % e )
    # Stop here: the handles below cannot be used without a working connection.
    raise

# Database and collection handles used by the rest of the notebook:
# `collection` receives the scraped house documents, `urls` the listing URLs.
db = conn["idealista"]
collection = db["house_data"]
urls = db["urls"]


connected


In [5]:
# Collection.count() is deprecated (see the warning in this cell's output);
# count_documents({}) is its replacement for counting every stored URL record.
urls.count_documents({})

  """Entry point for launching an IPython kernel.


21914

In [6]:
def getHouseId(url):
    """Return the third '/'-separated piece of `url`, e.g. "/inmueble/id/" => "id"."""
    pieces = url.split("/")
    return pieces[2]
    
def saveToMongo(houseUrl,data):
    """Store `data` in `collection`, keyed by the house id taken from `houseUrl`.

    The id is written into `data["id"]` (the caller's dict is modified) and the
    document replaces any existing one with the same id, or is inserted.
    """
    data["id"] = getHouseId(houseUrl)
    collection.replace_one({"id": data["id"]}, data, upsert=True)
    
def saveUrl(source,url):
    """Insert a record with the given `source` and `url` into the `urls` collection."""
    record = {"source":source , "url":url}
    urls.insert_one(record)
    

In [7]:
# Method 1. Go to the first house page. From there go to the next indicated at "Siguiente"
#Not a good solution. Must keep browser open all the time.
#If browser closes you have to restart because you don't have a list of houses

def getNextHouseFromSource(page): #needed sometimes: for some reason getNextHouse fails
    """Extract the "next" link's href directly from the raw page source.

    Parameters:
        page: HTML source of the current page, as a string.

    Returns:
        The href value that follows the "next" button's class attribute, or
        None when the marker (or the closing quote of the href) is not found.
    """
    text_next = 'class="btn nav next icon-arrow-right-after" href="'
    pstart = page.find(text_next)
    if pstart==-1:
        return None
    pstart += len(text_next)
    # The slice end is exclusive, so stop exactly at the closing quote;
    # adding 1 here would leave the quote character at the end of the href.
    pend = page.find('"', pstart)
    if pend==-1:
        return None
    return page[pstart:pend]

def getNextHouse(page,tree):
    """Return the href of the "next" link of a house page.

    The parsed `tree` is tried first; when it has no '.icon-arrow-right-after'
    element, or that element has no href, the raw `page` source is searched
    with getNextHouseFromSource instead.
    """
    arrows = tree.cssselect(".icon-arrow-right-after")
    if arrows:
        href = arrows[0].get("href")
        if href is not None:
            return href
    return getNextHouseFromSource(page)
    
def getHouses(from_start=True):
    """Walk from house page to house page through the "next" link, saving each one.

    With `from_start` true, the Barcelona listing page is loaded and the walk
    begins at its first house. Otherwise it resumes from the page the
    module-level `browser` is currently showing. Each house's data is saved
    with saveToMongo, followed by a random pause, until no next link is found.
    """
    if not from_start:
        # Resume from whatever page the shared browser is displaying.
        page_source = browser.page_source
        page_tree = loadPageTree(page_source)
        next_house = getNextHouse(page_source, page_tree)
    else:
        page_source = loadPage("https://www.idealista.com/venta-viviendas/barcelona-barcelona/")
        page_tree = loadPageTree(page_source)
        time.sleep(3)
        next_house = firstHouse(page_tree)

    while next_house:
        print(next_house,end=" ")
        page_source = loadPage("https://www.idealista.com"+next_house)
        page_tree = loadPageTree(page_source)
        saveToMongo(next_house, getHouseData(page_tree))
        # Random wait between page loads.
        delay = random.randint(43,59)+10*random.random()
        time.sleep(delay)
        next_house = getNextHouse(page_source, page_tree)
    print("done")
       

In [22]:
#Method 2. First retrieve the list of all houses
#Then you can get houses' data one by one by number
#This way you can stop and restart the browser as needed

def pause():
    """Sleep for a random duration of at least 30 and less than 67 seconds."""
    whole_seconds = random.randint(30,59)
    fraction = 8*random.random()
    time.sleep(whole_seconds+fraction)
    
def pageFormat(start,end,page):
    """Build the listing URL for prices from `start` to `end`, at result page `page`.

    Pages greater than 1 get a "pagina-N.htm" suffix; other page values give
    the URL without a page suffix.
    """
    if page>1:
        page_suffix = "pagina-{}.htm".format(page)
    else:
        page_suffix = ""
    base = "https://www.idealista.com/venta-viviendas/barcelona-barcelona/"
    return base + "con-precio-hasta_{},precio-desde_{}/{}".format(end, start, page_suffix)

def saveUrls(tree):
    """Save every house URL found in `tree` with source "idealista"; return how many there were."""
    # The URLs go straight to Mongo via saveUrl; no in-memory list is kept.
    house_urls = getListOfHouses(tree)
    for house_url in house_urls:
        saveUrl("idealista",house_url)
    return len(house_urls)
        
def getAllHouseUrls(startPrice=50000,incPrice=30000,finalPrice=3000000,startPage=1,loopLimit=100):
    """Page through the listings one price band at a time, saving every house URL.

    Each band covers `incPrice` prices starting at the current lower bound.
    Result pages of a band are visited in order until isLastPage reports the
    last one, then the next band starts at page 1. The loop stops when the
    lower bound reaches `finalPrice`, after three consecutive pages with no
    URLs, or after `loopLimit` page loads.
    """
    low_price = startPrice
    page_no = startPage
    empty_streak = 0 # sanity check: several empty pages in a row mean something went wrong

    while low_price<finalPrice and empty_streak<3 and loopLimit>0:
        listing_url = pageFormat(low_price, low_price+incPrice-1, page_no)
        print(listing_url,end="")
        tree = loadPageTree(loadPage(listing_url))
        found = saveUrls(tree)
        print(" (",found,")",loopLimit)
        if found>0:
            empty_streak = 0
        else:
            empty_streak += 1
        pause()
        if not isLastPage(tree):
            page_no += 1
        else:
            # Band exhausted: move on to the next price band, from its first page.
            low_price += incPrice
            page_no = 1
        loopLimit -= 1
    print("done")
       

In [9]:
def getHousesData(loopLimit=100,browser=None):
    """Download and store the data of houses whose URL is saved but whose data is not.

    Parameters:
        loopLimit: maximum number of houses to download in this call.
        browser: driver used to load pages. When omitted, the module-level
            `browser` is looked up at call time; if that is missing or None
            as well, a new browser is opened with newBrowser().

    Returns:
        True when the batch ended normally (limit reached or URL list
        exhausted), False when a loaded page yielded no price.
    """
    if browser is None:
        # Resolve the shared driver at call time; a `browser=browser` default
        # would be frozen to the value the global had when this was defined.
        browser = globals().get("browser")
    if browser is None:
        browser = newBrowser()
        time.sleep(5)
    # Build a list of URLs up front: after a while the cursor is not available anymore.
    l_urls = [url["url"] for url in urls.find() if "removed" not in url]
    random.shuffle(l_urls) # useful if threading
    for url in l_urls:
        houseId = getHouseId(url)
        # find_one replaces the deprecated Cursor.count() existence check.
        if collection.find_one({"id":houseId}) is not None:
            continue
        print("(",houseId,")",end="")
        p = loadPage("https://www.idealista.com"+url,browser)
        remove_it = ("ya no está publicado en idealista" in p) or ("no hay ningún anuncio con ese código" in p)
        if remove_it:
            # Flag the URL so that later runs skip it.
            urls.update_one({"url":url},{"$set":{"removed":True}})
            print("(removed)",end="")
            pause()
            continue
        t = loadPageTree(p)
        data = getHouseData(t)
        if data.get("price"):
            saveToMongo(url,data)
        else:
            # No price was extracted from the page: report it and stop.
            print("error")
            return False
        pause()
        loopLimit -= 1
        if loopLimit<1: break
    print("done")
    return True
        

In [10]:
from threading import Thread
# List of the worker Thread objects started by the cells below.
t=[]


In [23]:
def doIt():
    """Thread target: download houses in batches of 25 with a dedicated browser.

    A new browser is opened, getHousesData is called repeatedly with a
    300-second wait between batches until it returns a false value, and the
    browser is shut down at the end. Any exception raised while downloading
    propagates to the caller after the browser has been shut down.
    """
    browser = newBrowser()
    try:
        while getHousesData(25,browser):
            time.sleep(300)
    finally:
        # Always end the driver session, even when a page load raises;
        # otherwise a failed thread leaves its browser running.
        browser.quit()

In [37]:
# Start three worker threads running doIt, one second apart, and keep each
# Thread object in `t` so its state can be inspected later.
for i in range(3):
    tr = Thread(target=doIt)
    t.append(tr)
    tr.start()
    time.sleep(1)

  # This is added back by InteractiveShellApp.init_path()


( 38139448 )( 87749244 )( 87771772 )( 87533994 )( 87323774 )( 87423263 )(removed)

# Using a proxy with Selenium

myProxy = "localhost:9050" 

profile = webdriver.FirefoxProfile() 

profile.set_preference("network.proxy.type", 1)

profile.set_preference("network.proxy.socks", "127.0.0.1")

profile.set_preference("network.proxy.socks_port", 9050)

profile.update_preferences() 

browser = webdriver.Firefox(firefox_profile=profile)


In [38]:
# Keep only the threads that are still running. The list is rebuilt in one
# step (in place, so `t` stays the same list object) because deleting items
# while enumerating the same list skips the element after each deletion.
# Thread.is_alive() is used because the isAlive() alias was removed in Python 3.9.
t[:] = [tr for tr in t if tr.is_alive()]
t

[<Thread(Thread-7, started 140666466719488)>,
 <Thread(Thread-9, started 140666449671936)>,
 <Thread(Thread-11, started 140666431313664)>,
 <Thread(Thread-15, started 140666458326784)>,
 <Thread(Thread-16, started 140666483504896)>,
 <Thread(Thread-18, started 140666441279232)>,
 <Thread(Thread-19, started 140666475112192)>,
 <Thread(Thread-20, started 140666775451392)>]

  # This is added back by InteractiveShellApp.init_path()


( 87237788 )( 87886663 )(removed)( 83633726 )( 87777802 )( 86814440 )( 86953528 )( 87190680 )( 87824606 )( 87265229 )( 83197351 )( 87234300 )( 87228090 )(removed)( 87138502 )(removed)( 87065915 )( 87302869 )( 85279453 )( 85891074 )( 87244243 )( 86784589 )( 36877463 )( 87678064 )( 87756881 )( 83037658 )( 85873552 )( 84103685 )( 87874384 )(removed)( 87359977 )( 85291352 )( 87895566 )( 87840196 )( 87068086 )( 84113230 )( 87263481 )(removed)( 27684466 )( 87039560 )( 87842029 )( 87267384 )( 87970407 )( 84938957 )( 87884335 )( 37708039 )( 87355407 )( 87717417 )( 85119865 )( 87171554 )( 87763775 )( 87885804 )( 85986381 )( 85924039 )(removed)done
( 85977769 )( 87916814 )(removed)( 87889485 )( 35664089 )( 87522461 )( 87445549 )( 85957215 )( 87819048 )( 85494199 )( 84272822 )( 86708983 )(removed)( 85875624 )( 87970624 )( 83647628 )( 86611898 )( 87752636 )( 84890831 )( 87822770 )( 87923852 )( 87633241 )(removed)( 87522474 )( 86781430 )( 87520595 )( 87922184 )(removed)( 40326252 )( 87475622 )( 849

Exception in thread Thread-11:
Traceback (most recent call last):
  File "/home/egon/anaconda3/lib/python3.7/threading.py", line 917, in _bootstrap_inner
    self.run()
  File "/home/egon/anaconda3/lib/python3.7/threading.py", line 865, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-11-054180f8e0bd>", line 3, in doIt
    while getHousesData(25,browser):
  File "<ipython-input-9-07495286645d>", line 13, in getHousesData
    p = loadPage("https://www.idealista.com"+url,browser)
  File "<ipython-input-2-958e117ab251>", line 9, in loadPage
    browser.get(page)
  File "/home/egon/anaconda3/lib/python3.7/site-packages/selenium/webdriver/remote/webdriver.py", line 333, in get
    self.execute(Command.GET, {'url': url})
  File "/home/egon/anaconda3/lib/python3.7/site-packages/selenium/webdriver/remote/webdriver.py", line 321, in execute
    self.error_handler.check_response(response)
  File "/home/egon/anaconda3/lib/python3.7/site-packages/selenium/webdriver/remot

( 87196537 )

Exception in thread Thread-9:
Traceback (most recent call last):
  File "/home/egon/anaconda3/lib/python3.7/threading.py", line 917, in _bootstrap_inner
    self.run()
  File "/home/egon/anaconda3/lib/python3.7/threading.py", line 865, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-11-054180f8e0bd>", line 3, in doIt
    while getHousesData(25,browser):
  File "<ipython-input-9-07495286645d>", line 13, in getHousesData
    p = loadPage("https://www.idealista.com"+url,browser)
  File "<ipython-input-2-958e117ab251>", line 9, in loadPage
    browser.get(page)
  File "/home/egon/anaconda3/lib/python3.7/site-packages/selenium/webdriver/remote/webdriver.py", line 333, in get
    self.execute(Command.GET, {'url': url})
  File "/home/egon/anaconda3/lib/python3.7/site-packages/selenium/webdriver/remote/webdriver.py", line 321, in execute
    self.error_handler.check_response(response)
  File "/home/egon/anaconda3/lib/python3.7/site-packages/selenium/webdriver/remote

( 87179270 )( 87125199 )( 87021871 )(removed)( 87838179 )( 87969244 )(removed)( 87671994 )( 85207398 )(removed)( 87504610 )( 87967961 )( 87836018 )(removed)( 87543989 )( 87823445 )( 86731807 )( 87058465 )( 86947101 )( 39346908 )(removed)( 82937022 )( 87437233 )( 87807766 )( 86426143 )( 87870677 )(removed)( 84445971 )( 82532727 )( 82631470 )( 86345121 )( 87100531 )(removed)( 86218787 )( 84901874 )( 87115486 )(removed)( 85525116 )done
( 84065430 )(removed)( 83598861 )( 81167957 )( 86141149 )( 35843702 )( 87511288 )(removed)( 87805835 )(removed)( 86231738 )( 87705158 )( 84942036 )( 84520925 )( 87870472 )( 85947967 )( 81965812 )( 38429513 )( 87243991 )( 87921200 )( 85495690 )( 87852629 )( 87940426 )(removed)( 85182078 )( 87410560 )( 85341775 )( 87729116 )( 86394746 )( 82273810 )( 39184627 )( 35970578 )( 87383137 )( 87753626 )(removed)( 86427726 )( 87111535 )