# PREPROCESSING

In [269]:
import pandas as pd
import numpy as np
import ast

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import RadiusNeighborsRegressor
from sklearn.svm import SVR
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

from processing.parsing import *
from processing.encodings import *

In [270]:
df_ = pd.read_csv('../data/madrid_preprocessed.csv')

In [271]:
df_sample = df_.sample(round(df_.shape[0]*0.15), random_state=42)
df_ = df_.reset_index()[~df_.reset_index()['index'].isin(df_sample.reset_index()['index'])].drop(['index'], axis=1)

In [272]:
df_ = outliersFilter(df = df_, min_price = 50_000, max_price = 5_000_000, max_baths = 15, max_surface = 1_000)
df_

Unnamed: 0,price,lat,lng,updated,type,bulevar,paseo,carretera,parque,calle,...,publisher,age,garage,lift,surface,net_surface,garden,rooms,condition,bathrooms
1,116500.0,40.402079,-3.702151,1.674947e+09,Apartamento,2,2,2,2,2,...,inmobiliaria,50.0,no,yes,25.0,,no,,,1.0
2,169000.0,40.534457,-3.479415,1.673392e+09,Apartamento,0,0,0,0,0,...,inmobiliaria,,no,yes,70.0,67.0,no,1.0,En buen estado,1.0
4,130000.0,40.347096,-3.827826,1.672615e+09,Apartamento,0,0,0,0,0,...,inmobiliaria,,no,yes,60.0,54.0,no,2.0,,1.0
5,88000.0,40.031830,-3.599734,1.674947e+09,Apartamento,2,2,2,2,2,...,inmobiliaria,10.0,no,yes,37.0,30.0,no,1.0,En buen estado,1.0
6,115000.0,40.031169,-3.598471,1.672874e+09,Apartamento,2,2,2,2,2,...,inmobiliaria,10.0,no,yes,40.0,40.0,no,1.0,En buen estado,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15837,189450.0,40.441125,-3.466229,1.672960e+09,Piso,0,0,0,0,0,...,inmobiliaria,,no,no,88.0,,no,2.0,A estrenar,2.0
15840,156800.0,40.453200,-3.472508,1.672787e+09,Piso,0,0,0,0,0,...,inmobiliaria,,no,yes,90.0,,no,3.0,,1.0
15841,135000.0,40.454606,-3.455234,1.672960e+09,Piso,1,1,1,1,1,...,inmobiliaria,,no,yes,67.0,62.0,no,3.0,A reformar,1.0
15842,210000.0,40.441928,-3.473036,1.672615e+09,Piso,1,1,1,1,1,...,inmobiliaria,,yes,yes,79.0,63.0,no,1.0,En buen estado,2.0


In [273]:
df = df_.copy()

# type encoding

In [274]:
# df_type, encodings_type = targetEncoding(df[['type']], df['price'], condition = 'target_mean')
df_type, encodings_type = frequencyEncoding(df[['type']])
df['type'] = df_type['type']

# garage, lift, garden and publisher encoding

In [275]:
df = binaryEncoding(df)

# condition encoding

In [276]:
df_condition, encodings_condition = frequencyEncoding(df[['condition']])
df['condition'] = df_condition['condition']

In [277]:
df

Unnamed: 0,price,lat,lng,updated,type,bulevar,paseo,carretera,parque,calle,...,publisher,age,garage,lift,surface,net_surface,garden,rooms,condition,bathrooms
1,116500.0,40.402079,-3.702151,1.674947e+09,228,2,2,2,2,2,...,1,50.0,0,1,25.0,,0,,,1.0
2,169000.0,40.534457,-3.479415,1.673392e+09,228,0,0,0,0,0,...,1,,0,1,70.0,67.0,0,1.0,6379.0,1.0
4,130000.0,40.347096,-3.827826,1.672615e+09,228,0,0,0,0,0,...,1,,0,1,60.0,54.0,0,2.0,,1.0
5,88000.0,40.031830,-3.599734,1.674947e+09,228,2,2,2,2,2,...,1,10.0,0,1,37.0,30.0,0,1.0,6379.0,1.0
6,115000.0,40.031169,-3.598471,1.672874e+09,228,2,2,2,2,2,...,1,10.0,0,1,40.0,40.0,0,1.0,6379.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15837,189450.0,40.441125,-3.466229,1.672960e+09,8515,0,0,0,0,0,...,1,,0,0,88.0,,0,2.0,1201.0,2.0
15840,156800.0,40.453200,-3.472508,1.672787e+09,8515,0,0,0,0,0,...,1,,0,1,90.0,,0,3.0,,1.0
15841,135000.0,40.454606,-3.455234,1.672960e+09,8515,1,1,1,1,1,...,1,,0,1,67.0,62.0,0,3.0,1208.0,1.0
15842,210000.0,40.441928,-3.473036,1.672615e+09,8515,1,1,1,1,1,...,1,,1,1,79.0,63.0,0,1.0,6379.0,2.0


In [278]:
df_surface = df[['surface', 'net_surface']].dropna()
np.corrcoef(df_surface['surface'], df_surface['net_surface'])

array([[1.        , 0.97416502],
       [0.97416502, 1.        ]])

# TRAINING

In [279]:
df.drop('price', axis = 1).columns

Index(['lat', 'lng', 'updated', 'type', 'bulevar', 'paseo', 'carretera',
       'parque', 'calle', 'autovia', 'avenida', 'plaza', 'publisher', 'age',
       'garage', 'lift', 'surface', 'net_surface', 'garden', 'rooms',
       'condition', 'bathrooms'],
      dtype='object')

In [280]:
df = df[~df['price'].isna()]

X = df.drop(['price', 'net_surface'], axis = 1)
y = df[['price']]

imputer = KNNImputer(n_neighbors=3)

X_imputed = imputer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size = 0.2, random_state = 42)

model = RandomForestRegressor(n_jobs = -1, random_state = 42, n_estimators = 100)

model.fit(X_train, y_train)

yhat = model.predict(X_test)

pd.DataFrame([[r2_score(y_test, yhat),
              mean_absolute_error(y_test, yhat),
              mean_squared_error(y_test, yhat)]],
             columns = ['r2', 'mae', 'mse'])

  return fit_method(estimator, *args, **kwargs)


Unnamed: 0,r2,mae,mse
0,0.84125,112727.792288,60339550000.0


# Validation

In [281]:
df_sample

Unnamed: 0,price,lat,lng,updated,type,bulevar,paseo,carretera,parque,calle,...,publisher,age,garage,lift,surface,net_surface,garden,rooms,condition,bathrooms
8466,226000.0,40.729647,-3.582455,1.672615e+09,Piso,0,0,0,0,0,...,inmobiliaria,,yes,yes,150.0,108.0,no,3.0,,2.0
2764,428500.0,40.442969,-3.464236,1.673997e+09,Casa,1,1,1,1,1,...,inmobiliaria,,no,no,177.0,,no,4.0,A estrenar,3.0
10979,479000.0,40.510236,-3.694039,1.674861e+09,Piso,0,0,0,0,0,...,inmobiliaria,,yes,yes,120.0,84.0,no,3.0,En buen estado,2.0
2183,1350000.0,40.413838,-3.791630,1.672787e+09,Casa,0,0,0,0,0,...,inmobiliaria,,yes,no,450.0,,no,6.0,En buen estado,5.0
11882,210190.0,40.229607,-3.749955,1.675120e+09,Piso,1,1,1,1,1,...,inmobiliaria,,yes,yes,72.0,,no,2.0,A estrenar,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2683,99950.0,40.296712,-3.302587,1.672614e+09,Casa,0,0,0,0,0,...,inmobiliaria,,no,no,248.0,,no,5.0,En buen estado,3.0
4121,350000.0,40.742860,-3.511755,1.672960e+09,Chalet,0,0,0,0,0,...,inmobiliaria,30.0,yes,no,182.0,149.0,yes,4.0,En buen estado,2.0
9365,480000.0,40.674201,-4.087623,1.674256e+09,Piso,0,0,0,0,0,...,inmobiliaria,,no,yes,340.0,,no,5.0,,3.0
5955,145000.0,40.448851,-3.705134,1.672874e+09,Piso,0,0,0,0,0,...,inmobiliaria,,no,no,40.0,,no,,En buen estado,1.0


In [282]:
df_sample['type'] = df_sample['type'].replace(encodings_type['type'])

df_sample = binaryEncoding(df)

df_sample['condition'] = df_sample['condition'].replace(encodings_condition['condition'])

df_validation = df_sample.dropna()

In [283]:
X = df_validation.drop(['price', 'net_surface'], axis = 1)
y = df_validation['price']

yhat = model.predict(X)

pd.DataFrame([[r2_score(y, yhat),
              mean_absolute_error(y, yhat),
              mean_squared_error(y, yhat)]],
             columns = ['r2', 'mae', 'mse'])



Unnamed: 0,r2,mae,mse
0,0.818413,85591.647998,62972780000.0
