# PREPROCESSING

In [72]:
import pandas as pd
import numpy as np
import ast

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import RadiusNeighborsRegressor
from sklearn.svm import SVR
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

from parsing import *

In [38]:
df = pd.read_csv('madrid.csv')
df.drop('Unnamed: 0', axis = 1, inplace = True)

In [39]:
df['price'] = df['price'].map(lambda x: tryInt(x))
df['characteristics'] = df['characteristics'].apply(tryLiteralEval)

In [40]:
data = freeChurro(df)

In [41]:
df_characteristics = pd.DataFrame(data)

In [42]:
df = pd.concat([df, df_characteristics], axis = 1).drop('characteristics', axis = 1)

In [43]:
# Está guardado el DataFrame que tiene ya todas las características como columnas:
# df_ = pd.read_csv('madrid_preprocessed.csv')
df_ = pd.read_csv('madrid_outliers.csv')
df_

Unnamed: 0,price,updated,surface,lat,lng,type,paseo,carretera,bulevar,autovia,...,avenida,publisher,age,garage,lift,net_surface,garden,rooms,condition,bathrooms
0,176000.0,1.674861e+09,63.0,40.593445,-4.145386,Apartamento,0,0,0,0,...,0,inmobiliaria,10.0,yes,yes,46.0,yes,1.0,A estrenar,1.0
1,116500.0,1.674947e+09,25.0,40.402079,-3.702151,Apartamento,2,2,2,2,...,2,inmobiliaria,50.0,no,yes,,no,,,1.0
2,169000.0,1.673392e+09,70.0,40.534457,-3.479415,Apartamento,0,0,0,0,...,0,inmobiliaria,,no,yes,67.0,no,1.0,En buen estado,1.0
3,174000.0,1.672960e+09,60.0,40.344358,-3.825283,Apartamento,0,0,0,0,...,0,inmobiliaria,,no,yes,50.0,no,1.0,,1.0
4,130000.0,1.672615e+09,60.0,40.347096,-3.827826,Apartamento,0,0,0,0,...,0,inmobiliaria,,no,yes,54.0,no,2.0,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12795,135000.0,1.672960e+09,67.0,40.454606,-3.455234,Piso,1,1,1,1,...,1,inmobiliaria,,no,yes,62.0,no,3.0,A reformar,1.0
12796,210000.0,1.672615e+09,79.0,40.441928,-3.473036,Piso,1,1,1,1,...,1,inmobiliaria,,yes,yes,63.0,no,1.0,En buen estado,2.0
12797,324600.0,1.672874e+09,140.0,40.448403,-3.470899,Piso,0,0,0,0,...,0,inmobiliaria,,yes,yes,,no,4.0,,3.0
12798,249900.0,1.672874e+09,106.0,40.444707,-3.473520,Piso,1,1,1,1,...,1,inmobiliaria,20.0,yes,no,96.0,no,3.0,En buen estado,2.0


In [44]:
df_text = df[['Cocina', 'Cocina equipada', 'Superficie solar', 'Sistema de seguridad', 'Portero automático',
                      'Aire acondicionado', 'Puerta blindada', 'Amueblado', 'Trastero', 'Calefacción', 'Gas',
                      'Tipo de casa', 'Piscina', 'Vidrios dobles', 'Planta', 'Tipo suelo', 'Balcón', 'Orientación',
                      'Armarios empotrados', 'Comedor']]

df_text['concatenated'] = df_text.apply(lambda row: ' '.join([str(val) if pd.notna(val) else '' for val in row]), axis=1)
df_text['concatenated'] = df_text['concatenated'].apply(lambda x: x.strip())
df_text['concatenated'] = df_text['concatenated'].apply(lambda x: ' '.join([y.strip() for y in x.split()]))

texts = df_text['concatenated'].to_list()

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['concatenated'] = df_text.apply(lambda row: ' '.join([str(val) if pd.notna(val) else '' for val in row]), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['concatenated'] = df_text['concatenated'].apply(lambda x: x.strip())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tex

In [45]:
df_text[['concatenated']]

Unnamed: 0,concatenated
0,Amueblada a estrenar Videoportero Comunitaria ...
1,Cocina 2ª
2,
3,Puerta blindada Amueblado 3ª Norte 1
4,Puerta blindada Eléctrica 1ª Oeste
...,...
15841,Sur 1
15842,Aire acondicionado Trastero Piscina Norte 1
15843,Aire acondicionado Trastero Gas natural Comuni...
15844,Portero automático Conducto Puerta blindada Tr...


In [46]:
def get_best_k(X, n_k, model = KMeans()):
    inertias = list()
    
    for k in n_k:
        model.n_clusters = k
        model.fit(X)
        inertias.append(model.inertia_)
        
    k_i = np.array([[k, i] for k, i in enumerate(inertias, start = 1)])

    k_i_max = np.max(k_i, axis=0)
    k_i_norm = k_i / k_i_max
    
    distances = [np.linalg.norm([point, [0,0]]) for point in k_i_norm]
    
    return pd.concat([pd.DataFrame([x for x in range(1,n_k[-1] + 1)], columns = ['K']), pd.DataFrame(distances, columns = ['Distance'])], axis = 1).sort_values('Distance')

In [47]:
best_k = get_best_k(X,range(1,61)).reset_index(drop = True)['K'].iloc[0]

  super()._check_params_vs_input(X, default_n_init=10)


In [48]:
kmeans = KMeans(n_clusters = best_k)
kmeans.fit(X)

  super()._check_params_vs_input(X, default_n_init=10)


In [49]:
# df_['cluster'] = kmeans.labels_
df_ = pd.concat([df_, pd.DataFrame(kmeans.labels_)], axis = 1)
df_ = df_[~df_['price'].isna()]
df_ = df_.rename(columns = {0 : 'cluster'})
df_

Unnamed: 0,price,updated,surface,lat,lng,type,paseo,carretera,bulevar,autovia,...,publisher,age,garage,lift,net_surface,garden,rooms,condition,bathrooms,cluster
0,176000.0,1.674861e+09,63.0,40.593445,-4.145386,Apartamento,0.0,0.0,0.0,0.0,...,inmobiliaria,10.0,yes,yes,46.0,yes,1.0,A estrenar,1.0,0
1,116500.0,1.674947e+09,25.0,40.402079,-3.702151,Apartamento,2.0,2.0,2.0,2.0,...,inmobiliaria,50.0,no,yes,,no,,,1.0,17
2,169000.0,1.673392e+09,70.0,40.534457,-3.479415,Apartamento,0.0,0.0,0.0,0.0,...,inmobiliaria,,no,yes,67.0,no,1.0,En buen estado,1.0,0
3,174000.0,1.672960e+09,60.0,40.344358,-3.825283,Apartamento,0.0,0.0,0.0,0.0,...,inmobiliaria,,no,yes,50.0,no,1.0,,1.0,6
4,130000.0,1.672615e+09,60.0,40.347096,-3.827826,Apartamento,0.0,0.0,0.0,0.0,...,inmobiliaria,,no,yes,54.0,no,2.0,,1.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12795,135000.0,1.672960e+09,67.0,40.454606,-3.455234,Piso,1.0,1.0,1.0,1.0,...,inmobiliaria,,no,yes,62.0,no,3.0,A reformar,1.0,3
12796,210000.0,1.672615e+09,79.0,40.441928,-3.473036,Piso,1.0,1.0,1.0,1.0,...,inmobiliaria,,yes,yes,63.0,no,1.0,En buen estado,2.0,14
12797,324600.0,1.672874e+09,140.0,40.448403,-3.470899,Piso,0.0,0.0,0.0,0.0,...,inmobiliaria,,yes,yes,,no,4.0,,3.0,7
12798,249900.0,1.672874e+09,106.0,40.444707,-3.473520,Piso,1.0,1.0,1.0,1.0,...,inmobiliaria,20.0,yes,no,96.0,no,3.0,En buen estado,2.0,11


In [50]:
df_.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12800 entries, 0 to 12799
Data columns (total 24 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   price        12800 non-null  float64
 1   updated      12800 non-null  float64
 2   surface      12800 non-null  float64
 3   lat          12763 non-null  float64
 4   lng          12763 non-null  float64
 5   type         12800 non-null  object 
 6   paseo        12800 non-null  float64
 7   carretera    12800 non-null  float64
 8   bulevar      12800 non-null  float64
 9   autovia      12800 non-null  float64
 10  calle        12800 non-null  float64
 11  parque       12800 non-null  float64
 12  plaza        12800 non-null  float64
 13  avenida      12800 non-null  float64
 14  publisher    12800 non-null  object 
 15  age          2942 non-null   float64
 16  garage       12800 non-null  object 
 17  lift         12800 non-null  object 
 18  net_surface  6154 non-null   float64
 19  garden   

In [51]:
df = df_.copy()

# type encoding

In [52]:
types = df['type'].unique()
types_dict = {type_ : x for type_, x in zip(types, range(len(types)))}

In [53]:
df['type'] = df['type'].replace(types_dict)

# garage, lift, garden and publisher encoding

In [54]:
df['garage'] = df['garage'].apply(lambda x: 1 if x == 'yes' else 0)
df['lift'] = df['lift'].apply(lambda x: 1 if x == 'yes' else 0)
df['garden'] = df['garden'].apply(lambda x: 1 if x == 'yes' else 0)
df['publisher'] = df['publisher'].apply(lambda x: 1 if x == 'inmobiliaria' else 0)

# condition encoding

In [55]:
conditions = df['condition'].unique()
conditions_dict = {cond_ : x for cond_, x in zip(conditions, range(len(conditions)))}

In [56]:
df['condition'] = df['condition'].replace(conditions_dict)

In [57]:
df_surface = df[['surface', 'net_surface']].dropna()
np.corrcoef(df_surface['surface'], df_surface['net_surface'])

array([[1.        , 0.88291829],
       [0.88291829, 1.        ]])

In [58]:
df

Unnamed: 0,price,updated,surface,lat,lng,type,paseo,carretera,bulevar,autovia,...,publisher,age,garage,lift,net_surface,garden,rooms,condition,bathrooms,cluster
0,176000.0,1.674861e+09,63.0,40.593445,-4.145386,0,0.0,0.0,0.0,0.0,...,1,10.0,1,1,46.0,1,1.0,0,1.0,0
1,116500.0,1.674947e+09,25.0,40.402079,-3.702151,0,2.0,2.0,2.0,2.0,...,1,50.0,0,1,,0,,1,1.0,17
2,169000.0,1.673392e+09,70.0,40.534457,-3.479415,0,0.0,0.0,0.0,0.0,...,1,,0,1,67.0,0,1.0,2,1.0,0
3,174000.0,1.672960e+09,60.0,40.344358,-3.825283,0,0.0,0.0,0.0,0.0,...,1,,0,1,50.0,0,1.0,1,1.0,6
4,130000.0,1.672615e+09,60.0,40.347096,-3.827826,0,0.0,0.0,0.0,0.0,...,1,,0,1,54.0,0,2.0,1,1.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12795,135000.0,1.672960e+09,67.0,40.454606,-3.455234,8,1.0,1.0,1.0,1.0,...,1,,0,1,62.0,0,3.0,3,1.0,3
12796,210000.0,1.672615e+09,79.0,40.441928,-3.473036,8,1.0,1.0,1.0,1.0,...,1,,1,1,63.0,0,1.0,2,2.0,14
12797,324600.0,1.672874e+09,140.0,40.448403,-3.470899,8,0.0,0.0,0.0,0.0,...,1,,1,1,,0,4.0,1,3.0,7
12798,249900.0,1.672874e+09,106.0,40.444707,-3.473520,8,1.0,1.0,1.0,1.0,...,1,20.0,1,0,96.0,0,3.0,2,2.0,11


# TRAINING

In [70]:
df = df[~df['price'].isna()]

X = df.drop(['price', 'net_surface'], axis = 1)
y = df[['price']]

imputer = KNNImputer(n_neighbors=3)

X_imputed = imputer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size = 0.2, random_state = 42)

model = RandomForestRegressor(n_jobs = -1, random_state = 42, n_estimators = 100)

model.fit(X_train, y_train)

yhat = model.predict(X_test)

print(r2_score(y_test, yhat))
print(mean_absolute_error(y_test, yhat))
print(mean_squared_error(y_test, yhat))

  return fit_method(estimator, *args, **kwargs)


0.8552358569224254
57570.32250290178
9096971043.713818


In [60]:
df.drop('price', axis = 1).columns

Index(['updated', 'surface', 'lat', 'lng', 'type', 'paseo', 'carretera',
       'bulevar', 'autovia', 'calle', 'parque', 'plaza', 'avenida',
       'publisher', 'age', 'garage', 'lift', 'net_surface', 'garden', 'rooms',
       'condition', 'bathrooms', 'cluster'],
      dtype='object')

In [61]:
feature_importances = {y : x for x, y in zip(model.feature_importances_, X.columns)}

In [62]:
def extractImportances(data, percentage):
    # Sort the dictionary items by values in descending order
    sorted_items = sorted(data.items(), key=lambda x: x[1], reverse=True)

    cumulative_sum = 0
    selected_keys = []

    for key, value in sorted_items:
        print(key, value)
        cumulative_sum += value
        selected_keys.append(key)

        if cumulative_sum >= percentage:
            break

    return selected_keys

In [63]:
importances = extractImportances(feature_importances, 0.9)

lat 0.31445009167309523
bathrooms 0.2913328407150224
lng 0.19981698531710845
surface 0.10788331174721849


In [64]:
X = df.drop(['price'], axis = 1)[importances]
y = df[['price']]

imputer = KNNImputer(n_neighbors=3)

X_imputed = imputer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size = 0.2, random_state = 42)

model = RandomForestRegressor(n_jobs = -1, random_state = 42, n_estimators = 100)

model.fit(X_train, y_train)

yhat = model.predict(X_test)

print(r2_score(y_test, yhat))
print(mean_absolute_error(y_test, yhat))
print(mean_squared_error(y_test, yhat))

  return fit_method(estimator, *args, **kwargs)


0.8488073406452761
57816.677049979204
9500938664.31399


In [65]:
importances

['lat', 'bathrooms', 'lng', 'surface']

In [66]:
importances = extractImportances(feature_importances, 0.8)

lat 0.31445009167309523
bathrooms 0.2913328407150224
lng 0.19981698531710845


In [67]:
X = df.drop(['price'], axis = 1)[importances]
y = df[['price']]

imputer = KNNImputer(n_neighbors=3)

X_imputed = imputer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size = 0.2, random_state = 42)

model = RandomForestRegressor(n_jobs = -1, random_state = 42, n_estimators = 100)

model.fit(X_train, y_train)

yhat = model.predict(X_test)

print(r2_score(y_test, yhat))
print(mean_absolute_error(y_test, yhat))
print(mean_squared_error(y_test, yhat))

  return fit_method(estimator, *args, **kwargs)


0.8017513967223495
69098.76001628647
12457931675.159887


In [68]:
importances

['lat', 'bathrooms', 'lng']

# Algoritmos

In [88]:
models = {'LR' : LinearRegression(n_jobs = -1),
          'DT' : DecisionTreeRegressor(random_state = 42),
          'RF' : RandomForestRegressor(n_jobs = -1, random_state = 42, n_estimators = 100),
          'SVR' : SVR(),
          'KN' : KNeighborsRegressor(n_jobs = -1, n_neighbors = 3),}
        #   'RN' : RadiusNeighborsRegressor(n_jobs = -1,radius = 0.3)}

metrics_cols = ['model', 'r2_score', 'mean_squared_error']
metrics = pd.DataFrame(columns = metrics_cols)

df = df[~df['price'].isna()]

X = df.drop(['price', 'net_surface'], axis = 1)
y = df[['price']]

imputer = KNNImputer(n_neighbors=3)
X_imputed = imputer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size = 0.2, random_state = 42)

for enum, m in enumerate(models.keys()):
    
    model = models[m]
    model.fit(X_train, y_train)
    yhat = model.predict(X_test)
    
    updating_metrics = pd.DataFrame([[model, r2_score(y_test, yhat), mean_squared_error(y_test, yhat)]], columns = metrics_cols)
    metrics = pd.concat([metrics, updating_metrics], axis = 0, ignore_index = True)
    
    print(f'({enum+1}) - {models[m]} completed')
    print('---'*30)

(1) - LinearRegression(n_jobs=-1) completed
------------------------------------------------------------------------------------------
(2) - DecisionTreeRegressor(random_state=42) completed
------------------------------------------------------------------------------------------


  return fit_method(estimator, *args, **kwargs)


(3) - RandomForestRegressor(n_jobs=-1, random_state=42) completed
------------------------------------------------------------------------------------------


  y = column_or_1d(y, warn=True)


(4) - SVR() completed
------------------------------------------------------------------------------------------
(5) - KNeighborsRegressor(n_jobs=-1, n_neighbors=3) completed
------------------------------------------------------------------------------------------


In [89]:
metrics.sort_values('r2_score', ascending = False)

Unnamed: 0,model,r2_score,mean_squared_error
2,"(DecisionTreeRegressor(max_features=1.0, rando...",0.855236,9096971000.0
1,DecisionTreeRegressor(random_state=42),0.734132,16707140000.0
0,LinearRegression(n_jobs=-1),0.434109,35560560000.0
4,"KNeighborsRegressor(n_jobs=-1, n_neighbors=3)",0.093414,56969810000.0
3,SVR(),-0.129968,71007140000.0
