In [5]:
import os
import requests
import requests
import ast

import pandas as pd
import glob
import plotly.express as px

from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from time import sleep

from locations import *
from parsing import *

# LOCATIONS SCRAPING

In [66]:
# Base

base_url = 'https://www.pisos.com/'

venta_url = f'{base_url}venta/'

In [67]:
# Sacamos todas las paginas filtradas por provincias

response = requests.get(base_url)

soup = BeautifulSoup(response.text, "html.parser")

provincias_str = soup.find('div', class_ = 'home-container').find('div', class_ = 'selectBox').find_all('ul')[-1].text

provincias = [x.lower() for x in provincias_str.strip().split('\n')]

endpoints = replaceWithUnderscore(provincias)

endpoints = [venta_url + endpoint for endpoint in endpoints]

with open('locations.txt', 'w') as file:
    file.write('\n'.join(endpoints))

# SINGLE PAGE SCRAPING

In [68]:
def scrape(urls):
    browser = webdriver.Chrome()

    for idx, url in enumerate(urls):
        browser.get(url)
        # browser.maximize_window()

        if idx == 0:
            element = WebDriverWait(browser, 10).until(
                EC.presence_of_element_located((By.XPATH, '//*[@id="didomi-notice-agree-button"]'))
            )

            element.click() # Accept cookies

        # <SCROLLING>
        while True:
            is_at_bottom = browser.execute_script("return window.scrollY + window.innerHeight >= document.body.scrollHeight")
            browser.execute_script("window.scroll({ top: document.body.scrollHeight, behavior: 'smooth' });")
            if is_at_bottom:
                break
            sleep(0.2)
        # </SCROLLING>
        
        html_content = browser.page_source
        soft_url = url.replace('https://www.pisos.com/comprar/', '')
        soft_url = soft_url.replace('/', '_')
        soft_url = soft_url.replace('-', '_')

        file_path = f'{os.getcwd()}/html_content/{soft_url}.html'
        try:
            with open(file_path, 'w', encoding='utf-8') as file:
                file.write(html_content)
        except Exception as e:
            with open(file_path, 'w', encoding='utf-8') as file:
                file.write(repr(e))

    browser.quit()

    

# TESTING

In [69]:
def scanRegions(url):
    response = requests.get(url)

    print(f'URL: {url} | STATUS {response.status_code}')

    soup = BeautifulSoup(response.text, 'html.parser')

    items = soup.select('div.zoneList a.item:not(.item-subitem)')

    endpoints = {}
    for item in items:
        endpoint = item['href']
        n_results = item.find('span', class_ = 'total').text

        # <INT>
        if len(n_results) != 0:
            try:
                n_results = n_results[1:-1]
                n_results = ''.join(n_results.split('.'))
                n_results = int(n_results)
            except:
                print(f'FAIL CASTING TO INTEGER {endpoint}')
        else:
            print(f'n_results EMPTY {endpoint}')
        #</INT>

        # <RECURSSION>
        if n_results > 3000:
            endpoints[endpoint] = scanRegions(base_url[:-1] + endpoint)
        # </RECURSSION>

        else: endpoints[endpoint] = n_results

    return endpoints

In [70]:
def parseRegions(endpoints):

    def extract(endpoints):
        array = []
        for key, value in endpoints.items():
            if isinstance(value, int):
                array.append(key)
                continue

            data = extract(value)
            array.extend(data)

        return array
    
    endpoints = extract(endpoints)

    array = []
    
    for endpoint in endpoints:
        data = endpoint
        if '/venta/pisos-' in endpoint:
            data = endpoint.replace('/venta/pisos-', '/viviendas/')
        array.append(data)

    return array

In [71]:
URL = 'https://www.pisos.com/viviendas/madrid/'

urls = scanRegions(URL)
urls = parseRegions(urls)
urls = [base_url + x for x in urls]
urls = [x.replace('//viviendas/', '/venta/pisos-') for x in urls]
print(urls)

URL: https://www.pisos.com/viviendas/madrid/ | STATUS 200
URL: https://www.pisos.com/viviendas/madrid_capital/ | STATUS 200
['https://www.pisos.com/venta/pisos-corredor_del_henares/', 'https://www.pisos.com/venta/pisos-arganzuela/', 'https://www.pisos.com/venta/pisos-madrid_capital_barajas/', 'https://www.pisos.com/venta/pisos-madrid_capital_carabanchel/', 'https://www.pisos.com/venta/pisos-madrid_capital_centro/', 'https://www.pisos.com/venta/pisos-madrid_capital_chamartin/', 'https://www.pisos.com/venta/pisos-chamberi_distrito/', 'https://www.pisos.com/venta/pisos-ciudad_lineal/', 'https://www.pisos.com/venta/pisos-fuencarral_el_pardo/', 'https://www.pisos.com/venta/pisos-hortaleza/', 'https://www.pisos.com/venta/pisos-latina/', 'https://www.pisos.com/venta/pisos-moncloa_aravaca/', 'https://www.pisos.com/venta/pisos-moratalaz/', 'https://www.pisos.com/venta/pisos-puente_de_vallecas/', 'https://www.pisos.com/venta/pisos-madrid_capital_retiro/', 'https://www.pisos.com/venta/pisos-madri

In [72]:
def scrapeUrls(endpoint):
    response = requests.get(endpoint)

    soup = BeautifulSoup(response.text, 'html.parser')

    results = soup.find('div', class_ = 'grid__title').find_all('span')[-1].text

    n_results = int(''.join([x for x in results if x.isnumeric()]))
    n_pages = (n_results // 30) + 1

    urls = []
    for i in range(n_pages):
        url = f'{endpoint}{i + 1}'
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')

        ads = soup.find_all('a', class_ = 'ad-preview__title')
        urls.extend([x['href'] for x in ads])
    return urls

In [73]:
for url in urls:
    urls_ = scrapeUrls(url)
    urls_ = list(set(urls_))
    urls_ = [base_url[:-1] + x for x in urls_]
    with open('urls.csv', 'a+') as file:
        file.writelines([x + ',\n' for x in urls_])

In [75]:
with open('urls.csv') as file:
    urls = file.read()

urls = urls.split(',\n')

In [78]:
scrape(urls[8_846:])

InvalidArgumentException: Message: invalid argument
  (Session info: chrome=117.0.5938.150)
Stacktrace:
	GetHandleVerifier [0x00007FF765B47892+54818]
	(No symbol) [0x00007FF765AB6AC2]
	(No symbol) [0x00007FF76596D8ED]
	(No symbol) [0x00007FF76595C5B9]
	(No symbol) [0x00007FF76595A961]
	(No symbol) [0x00007FF76595B123]
	(No symbol) [0x00007FF76596FE4F]
	(No symbol) [0x00007FF7659E7917]
	(No symbol) [0x00007FF7659CEAAA]
	(No symbol) [0x00007FF7659E75A2]
	(No symbol) [0x00007FF7659CE883]
	(No symbol) [0x00007FF7659A3691]
	(No symbol) [0x00007FF7659A48D4]
	GetHandleVerifier [0x00007FF765EAB992+3610402]
	GetHandleVerifier [0x00007FF765F01860+3962352]
	GetHandleVerifier [0x00007FF765EF9D4F+3930847]
	GetHandleVerifier [0x00007FF765BE3646+693206]
	(No symbol) [0x00007FF765AC1628]
	(No symbol) [0x00007FF765ABD934]
	(No symbol) [0x00007FF765ABDA62]
	(No symbol) [0x00007FF765AAE113]
	BaseThreadInitThunk [0x00007FFA88CA7344+20]
	RtlUserThreadStart [0x00007FFA892C26B1+33]


In [79]:
data = {
    'price' : [],
    'title' : [],
    'location' : [],
    'lat' : [],
    'lng' : [],
    'characteristics' : [],
}

files = glob.glob('html_content/*.html')

for file in files:
    with open(file, encoding='utf-8') as f:
        source = f.read()
    soup = BeautifulSoup(source, 'html.parser')

    price = getPrice(soup)
    title = getTitle(soup)
    location = getLocation(soup)
    lat, long = getLatLong(soup)
    characteristics = getCharacteristics(soup)

    data['price'].append(price)
    data['title'].append(title)
    data['location'].append(location)
    data['lat'].append(lat)
    data['lng'].append(long)
    data['characteristics'].append(characteristics)

df = pd.DataFrame(data)

In [80]:
df.to_csv('madrid.csv')

In [None]:
####################################################################################################################

In [10]:
df = pd.read_csv('madrid.csv')
df.drop('Unnamed: 0', axis = 1, inplace = True)

In [11]:
def tryInt(n):
    try:
        return int(''.join(n[:-2].split('.')))
    except: return np.nan

In [12]:
def tryLiteralEval(row):
    try:
        return ast.literal_eval(row)
    except:
        return lambda _: np.nan


# DATA PROCESSING

In [13]:
df['price'] = df['price'].map(lambda x: tryInt(x))
df['characteristics'] = df['characteristics'].apply(tryLiteralEval)

In [14]:
df

Unnamed: 0,price,title,location,lat,lng,characteristics
0,176000.0,Apartamento en venta en Abantos-Monte Carmelo-...,Abantos-Monte Carmelo-El Rosario (San Lorenzo ...,40.593445,-4.145386,"[Superficie construida : 63 m², Superficie úti..."
1,116500.0,"Apartamento en venta en Calle de Ercilla, 20, ...","Calle de Ercilla, 20, cerca de Calle del Labra...",40.402079,-3.702151,"[Superficie construida : 25 m², Baños : 1, Pla..."
2,169000.0,Apartamento en venta en Ajalvir,Ajalvir,40.534457,-3.479415,"[Superficie construida : 70 m², Superficie úti..."
3,174000.0,Apartamento en venta en Alcorcón,Casco Antiguo (Alcorcón),40.344358,-3.825283,"[Superficie construida : 60 m², Superficie úti..."
4,130000.0,Apartamento en venta en Alcorcón,Casco Antiguo (Alcorcón),40.347096,-3.827826,"[Superficie construida : 60 m², Superficie úti..."
...,...,...,...,...,...,...
15832,135000.0,Piso en venta en Calle de la Circunvalación,Calle de la Circunvalación. Zona Suroeste (Tor...,40.454606,-3.455234,"[Superficie construida : 67 m², Superficie úti..."
15833,210000.0,Piso en venta en Calle de Urano,Calle de Urano. Zona Suroeste (Torrejón de Ardoz),40.441928,-3.473036,"[Superficie construida : 79 m², Superficie úti..."
15834,324600.0,Piso en venta en Fresnos,Zona Suroeste (Torrejón de Ardoz),40.448403,-3.470899,"[Superficie construida : 140 m², Habitaciones ..."
15835,249900.0,Piso en venta en Calle de Cibeles,Calle de Cibeles. Zona Suroeste (Torrejón de A...,40.444707,-3.473520,"[Superficie construida : 106 m², Superficie út..."


#### CHARACTERISTICS PARSING

In [15]:
# HAY QUE REFACTORIZAR ESTE CHURRO HORRIBLE!!!!!!!

columns = []
characteristics = df['characteristics'].iloc

for chars in characteristics:
    for char in chars:
        vals = char.split(':')
        if isinstance(vals, list):
            columns.append(vals[0])
        else:
            columns.append(vals)

columns = [x.strip() for x in columns]
columns = list(set(columns))
data = []

for chars in characteristics:

    dict_data = {}

    for char in chars:
        content = char.split(':')
        if len(content) == 2:
            key, value = content
            key = key.strip()
            dict_data[key] = value
        else:
            continue

    dict_columns = {}

    for column in columns:
        for key, value in dict_data.items():
            if key == column:
                dict_columns[column] = value
        
    for column in columns:
        if dict_columns.get(column) is None:
            dict_columns[column] = np.nan

    data.append(dict_columns)

# HAY QUE REFACTORIZAR ESTE CHURRO HORRIBLE!!!!!!!

In [16]:
df_characteristics = pd.DataFrame(data)

In [17]:
df_ = pd.concat([df, df_characteristics], axis = 1).drop('characteristics', axis = 1)

In [18]:
df_['Clasificación']

0           En trámite
1           En trámite
2           En trámite
3          No indicado
4          No indicado
             ...      
15832      No indicado
15833       En trámite
15834      No indicado
15835       En trámite
15836      No indicado
Name: Clasificación, Length: 15837, dtype: object

In [35]:
df_.to_csv('madrid_parsed.csv', sep = ",", index = False)

# NP.NAN

In [19]:
fig = px.bar([df_[x].dropna().count() for x in df_.columns], y = df_.columns, x = [df_[x].dropna().count() for x in df_.columns])

fig.update_layout(height = 1000)

In [20]:
def nanPercentage(col):
    orig_size = len(col)
    drop_size = len(col.dropna())
    return 1 - drop_size/orig_size

def nanReport(threshold):
    df = df_[[col for col in df_.columns if nanPercentage(df_[col]) < threshold]]
    print(f'Porcentaje de valores perdidos en total: {1- df.dropna().shape[0] / df.shape[0]}')
    print(f'Columnas conservadas: {len(df.columns)}')
    return df

# CONCLUSION: HAY MUCHOS NANS

In [21]:
df_1 = nanReport(0.3)

Porcentaje de valores perdidos en total: 0.2995516827682011
Columnas conservadas: 11


In [22]:
df_1

Unnamed: 0,price,title,location,lat,lng,Referencia,Clasificación,Baños,Conservación,Superficie construida,Habitaciones
0,176000.0,Apartamento en venta en Abantos-Monte Carmelo-...,Abantos-Monte Carmelo-El Rosario (San Lorenzo ...,40.593445,-4.145386,4994318-VF-812,En trámite,1,A estrenar,63 m²,1
1,116500.0,"Apartamento en venta en Calle de Ercilla, 20, ...","Calle de Ercilla, 20, cerca de Calle del Labra...",40.402079,-3.702151,EP805-1309,En trámite,1,,25 m²,
2,169000.0,Apartamento en venta en Ajalvir,Ajalvir,40.534457,-3.479415,IFC76153-1690TN,En trámite,1,En buen estado,70 m²,1
3,174000.0,Apartamento en venta en Alcorcón,Casco Antiguo (Alcorcón),40.344358,-3.825283,SA1398-4340/6592,No indicado,1,,60 m²,1
4,130000.0,Apartamento en venta en Alcorcón,Casco Antiguo (Alcorcón),40.347096,-3.827826,SA2135-04930/7619,No indicado,1,,60 m²,2
...,...,...,...,...,...,...,...,...,...,...,...
15832,135000.0,Piso en venta en Calle de la Circunvalación,Calle de la Circunvalación. Zona Suroeste (Tor...,40.454606,-3.455234,SA4463-BANCO-0001,No indicado,1,A reformar,67 m²,3
15833,210000.0,Piso en venta en Calle de Urano,Calle de Urano. Zona Suroeste (Torrejón de Ardoz),40.441928,-3.473036,SA3697-IL-04891,En trámite,2,En buen estado,79 m²,1
15834,324600.0,Piso en venta en Fresnos,Zona Suroeste (Torrejón de Ardoz),40.448403,-3.470899,SA555-39508999/1670,No indicado,3,,140 m²,4
15835,249900.0,Piso en venta en Calle de Cibeles,Calle de Cibeles. Zona Suroeste (Torrejón de A...,40.444707,-3.473520,4525678-000021,En trámite,2,En buen estado,106 m²,3


In [23]:
df_2 = df_1[['price', 'lat', 'lng', 'Baños', 'Habitaciones', 'Superficie construida', 'Conservación']].dropna()

In [24]:
df_2['Superficie construida'] = df_2['Superficie construida'].apply(lambda x: int(''.join(x[:-3].split('.'))))

In [25]:
df_2 = pd.concat([df_2.drop('Conservación', axis = 1), pd.get_dummies(df_2['Conservación'], drop_first = True)], axis = 1)

In [26]:
X = df_2.drop('price', axis = 1)
y = df_2[['price']]

In [27]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [28]:
model = RandomForestRegressor()

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [30]:
model.fit(X_train, y_train)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



In [31]:
yhat = model.predict(X_test)

In [32]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [33]:
print(r2_score(y_test, yhat))
print(mean_absolute_error(y_test, yhat))
print(mean_squared_error(y_test, yhat))

0.8263024859973126
115384.43226327674
107903074243.75594
