<div class="alert alert-block alert-success">

# 1. Import Data <a id='import'></a></div>

<div class="alert alert-block alert-warning">

### 1.1 Import the Required Libraries <a id='libraries'></a></div>

In [222]:
import pandas as pd
import numpy as np
from numpy import nan
import re
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
from scipy import stats

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.decomposition import PCA

import geopandas as gpd
import folium
import osmnx as ox
import networkx as nx
import geopy.distance
import warnings
import statistics
import statsmodels.api as sm
import io
from PIL import Image
# Silence every warning category for the rest of the session to keep cell output clean.
# NOTE(review): this also hides deprecation and chained-assignment warnings - consider narrowing the filter.
warnings.filterwarnings('ignore')

<div class="alert alert-block alert-warning">

### 1.2 Import Data <a id='data'></a></div>

In [227]:
# --- Tabular sources ---
# Web-scraped housing listings
houses = pd.read_csv("data/properties_data.csv")

# National Statistics Institute data - median price per parish of houses sold in 2023
house_prices_parish = pd.read_excel("data/price_per_parish_2023.xlsx")

# --- Geodata: Lisbon limits (whole city and per parish / "freguesia") ---
lisbon_limits = gpd.read_file("data/LisbonLimits/LisbonLimits.shp")
lisbon_limits_parish = gpd.read_file("data/LisbonLimitsFreguesia/LisbonLimitsFreguesia.shp")

# --- Geodata: hospitals, stations, green areas, universities and shopping centers ---
hospitals = gpd.read_file("data/Hospitals/Hospitals.shp")
metro_stations = gpd.read_file("data/MetroStations/MetroStations.shp")
train_stations = gpd.read_file("data/TrainStations/TrainStations.shp")
green_areas = gpd.read_file("data/GreenAreas/GreenAreas.shp")
universities = gpd.read_file("data/Universities/Universities.shp")
shopping_centers = gpd.read_file("data/ShoppingCenters/ShoppingCenters.shp")

<div class="alert alert-block alert-success">

# 2. Data Exploration and Preprocessing <a id='initial_explore'></a></div>

<div class="alert alert-block alert-warning">

### 2.1 Clean each Feature Individually <a id='clean_features'></a></div>

In [4]:
# preview the first 10 rows of the scraped listings as loaded from the CSV
houses.head(10)

Unnamed: 0,id,property-header,property_description,typology,price,area,locality,region,latitude,longitude
0,1000,,"Apartamento T4 convertido em T3, em bom estado...",Apartamento T3,489.500 €,115m²,Lisboa,Arroios,38.726588,-9.142284
1,1001,Apartamento T4 á venda em Arroios,"Excelente Apartamento T4, na zona de Arroios, ...",Apartamento T4 Triplex,330.000 €,74m²,Lisboa,Arroios,38.721836,-9.138774
2,1002,,Arrecadação dentro do Amoreiras Shopping Cente...,Armazém,39.000 € / 220 €,,Lisboa,Campo de Ourique,0.0,0.0
3,1003,,"Apartamento de luxo situado em Telheiras, Parq...",Apartamento T4,865.000 €,,Lisboa,Lumiar,0.0,0.0
4,1004,,Apartamento T3 remodelado no Alto do Restelo (...,Apartamento T3,2.000 €,Recuperado,Lisboa,Belém,0.0,0.0
5,1005,,"Loja com 1.688,77 m2, em 3 pisos, com localiza...",Loja,2.980.000 €,1 688m²,Lisboa,Areeiro,38.743644,-9.143469
6,1006,,Este bonito apartamento esta inserido numa rea...,Apartamento T1,430.000 €,81m²,Lisboa,Alcântara,38.70623,-9.17753
7,1007,APARTAMENTO T4 EXCELENTE COM VISTA RIO EM LISB...,PROCURA APARTAMENTO T4 NA CIDADE DE LISBOA?EST...,Apartamento T4,4.000 €,230m²,Lisboa,Campo de Ourique,38.723605,-9.158138
8,1008,APARTAMENTO T3 COM LOGRADOURO E ESTACIONAMENTO...,"PROCURA UMA CASA PERTO DO RIO, DO CENTRO DA CI...",Apartamento T3,1.150.000 €,Novo · 200m²,Lisboa,Olivais,38.76961,-9.108405
9,1009,T3 no 5ºandar remodelado na Av. Columbano Bord...,T3 no 5ºandar remodelado na Av. Columbano Bord...,Apartamento T3,2.600 €,Renovado · 133m²,Lisboa,Campolide,38.73805,-9.16185


In [5]:
# report how many listings were loaded before any filtering
print('Total number of properties: {}'.format(len(houses)))

Total number of properties: 13259


<div class="alert alert-block alert-info">

#### Remove non-residential properties</div>

In [6]:
# keep only the listings whose typology refers to a residential property

# one regex alternation covers the three residential keywords; rows without a
# typology (NaN) are treated as non-matching
residential_keywords = 'Apartamento|Moradia|Duplex'
is_residential = houses['typology'].str.contains(residential_keywords, na=False)

houses = houses[is_residential].reset_index(drop=True).copy()

In [7]:
# check number of properties after excluding non-residential properties

print('Total number of properties: {}'.format(len(houses.index)))

Total number of properties: 9884


In [8]:
# preview the first rows that remain after the residential filter (index was reset)
houses.head()

Unnamed: 0,id,property-header,property_description,typology,price,area,locality,region,latitude,longitude
0,1000,,"Apartamento T4 convertido em T3, em bom estado...",Apartamento T3,489.500 €,115m²,Lisboa,Arroios,38.726588,-9.142284
1,1001,Apartamento T4 á venda em Arroios,"Excelente Apartamento T4, na zona de Arroios, ...",Apartamento T4 Triplex,330.000 €,74m²,Lisboa,Arroios,38.721836,-9.138774
2,1003,,"Apartamento de luxo situado em Telheiras, Parq...",Apartamento T4,865.000 €,,Lisboa,Lumiar,0.0,0.0
3,1004,,Apartamento T3 remodelado no Alto do Restelo (...,Apartamento T3,2.000 €,Recuperado,Lisboa,Belém,0.0,0.0
4,1006,,Este bonito apartamento esta inserido numa rea...,Apartamento T1,430.000 €,81m²,Lisboa,Alcântara,38.70623,-9.17753


In [9]:
# column dtypes and non-null counts of the filtered listings
houses.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9884 entries, 0 to 9883
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    9884 non-null   int64  
 1   property-header       142 non-null    object 
 2   property_description  9806 non-null   object 
 3   typology              9884 non-null   object 
 4   price                 9722 non-null   object 
 5   area                  9880 non-null   object 
 6   locality              9884 non-null   object 
 7   region                9884 non-null   object 
 8   latitude              9884 non-null   float64
 9   longitude             9884 non-null   float64
dtypes: float64(2), int64(1), object(7)
memory usage: 772.3+ KB


<div class="alert alert-block alert-info">

#### Clean typology feature</div>

In [10]:
# clean typology original column by keeping only string 'Tx' where x is number of bedrooms
# (optionally 'Tx+y'). A token qualifies only when 'T' is immediately followed by digits, so
# words that merely start with 'T' (e.g. 'Triplex') are not mistaken for a bedroom code.

typology_pattern = re.compile(r'T\d+(?:\+\d+)?')
typology_total = houses['typology'].str.split()
typology_clean = []

for tokens in typology_total:
    # a missing typology comes through as NaN instead of a list of tokens
    if not isinstance(tokens, list):
        typology_clean.append(nan)
        continue
    # match() anchors at the start of each token; only the matched code is kept,
    # so trailing punctuation (e.g. 'T3,') is dropped
    codes = [found.group(0) for found in map(typology_pattern.match, tokens) if found]
    typology_clean.append(codes[0] if codes else nan)

In [11]:
# some listings use 'Tx+y'; collapse them into a single 'T<x+y>' so that every
# entry carries one value for the number of bedrooms

for idx, code in enumerate(typology_clean):
    if isinstance(code, float) or '+' not in code:
        # NaN placeholders and plain 'Tx' codes are left untouched
        continue
    parts = code.split('+')
    total_rooms = int(parts[0].split('T')[1]) + int(parts[1])
    typology_clean[idx] = 'T' + str(total_rooms)

In [12]:
# replace typology original column with computed list ('Tx' codes, NaN where none was found)
houses['typology'] = typology_clean

In [13]:
# count the listings that ended up without a typology code
print(('Unfortunately, there are {} for which no typology was found. We will attempt '
       'to fill these missing values later.').format(houses['typology'].isna().sum()))

Unfortunately, there are 178 for which no typology was found. We will attempt to fill these missing values later.


<div class="alert alert-block alert-info">

#### Clean area feature</div>

In [14]:
# Extract the numeric surface (the number written right before 'm²') from the raw 'area' text.
# The number is read from the whole string rather than from whitespace-split tokens because
# thousands are written with a space ('1 688m²'); splitting first would keep only '688'.

# either groups of three digits joined by a space/dot thousands separator, or a plain run of
# digits; the look-behind stops a match from starting in the middle of a longer number
area_pattern = re.compile(r'(?<![\d.,])(\d{1,3}(?:[\s.]\d{3})+|\d+)\s*m²')
area_clean = []

for raw_area in houses['area']:
    found = area_pattern.search(raw_area) if isinstance(raw_area, str) else None
    if found is None:
        # missing value, or a label that carries no surface (e.g. 'Recuperado')
        area_clean.append(nan)
    else:
        # drop the thousands separators before converting to int
        area_clean.append(int(re.sub(r'[\s.]', '', found.group(1))))

In [15]:
# replace area original column with computed list (integer values, NaN where no 'm²' figure was found)
houses['area'] = area_clean

In [16]:
# count the listings that ended up without an area value
print(('Unfortunately, there are {} for which no area value was found. '
       'We will drop these observations.').format(houses['area'].isna().sum()))

Unfortunately, there are 62 for which no area value was found. We will drop these observations.


In [79]:
# discard the listings without an area and renumber the rows
houses = houses.dropna(subset=['area']).reset_index(drop=True)

<div class="alert alert-block alert-info">

#### Clean price feature</div>

In [18]:
# Parse each price string into an integer: take the first whitespace-separated token and
# strip the '.' separators (e.g. '489.500 €' -> 489500). Missing prices stay as NaN.
price_total = houses['price'].str.split()
price_clean = [
    nan if isinstance(tokens, float) else int(tokens[0].replace('.', ''))
    for tokens in price_total
]

In [19]:
# NOTE(review): repeats the first statement of the price-cleaning cell; price_clean has already
# been built at this point, so this recomputation looks redundant.
price_total = houses['price'].str.split()

In [20]:
# replace price original column with computed list (integer values, NaN where the price was missing)
houses['price'] = price_clean

In [21]:
# count the listings that ended up without a price
print(('Unfortunately, there are {} for which no price value was found. '
       'We will drop these observations.').format(houses['price'].isna().sum()))

Unfortunately, there are 158 for which no price value was found. We will drop these observations.


In [22]:
# discard the listings without a price and renumber the rows
houses = houses.dropna(subset=['price']).reset_index(drop=True)

<div class="alert alert-block alert-info">

#### Clean latitude and longitude features</div>

In [42]:
# a coordinate equal to 0 is counted as missing; a row counts when either one is 0
print(('Unfortunately, there are {} for which no longitude or latitude values were found. '
       'We will drop these observations.').format(
    ((houses['latitude'] == 0) | (houses['longitude'] == 0)).sum()))

Unfortunately, there are 5 for which no longitude or latitude values were found. We will drop these observations.


In [44]:
# Drop the listings whose latitude OR longitude is 0, i.e. the same rows counted in the cell above.
# A row is kept only when both coordinates are non-zero; joining the two '!=' tests with '|'
# would discard only the rows where both coordinates are 0.
houses = houses[(houses['latitude'] != 0) & (houses['longitude'] != 0)].reset_index(drop=True)