# PREPROCESSING

In [53]:
import pandas as pd
import numpy as np
import ast

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

from parsing import *

In [54]:
df = pd.read_csv('madrid.csv')
df.drop('Unnamed: 0', axis = 1, inplace = True)

In [55]:
df['price'] = df['price'].map(lambda x: tryInt(x))
df['characteristics'] = df['characteristics'].apply(tryLiteralEval)

In [56]:
data = freeChurro(df)

In [57]:
df_characteristics = pd.DataFrame(data)

In [58]:
df = pd.concat([df, df_characteristics], axis = 1).drop('characteristics', axis = 1)

In [59]:
# Está guardado el DataFrame que tiene ya todas las características como columnas:
df_ = pd.read_csv('madrid_preprocessed.csv')
df_

Unnamed: 0,price,lat,lng,updated,type,calle,bulevar,plaza,autovia,carretera,...,publisher,age,garage,lift,surface,net_surface,garden,rooms,condition,bathrooms
0,176000.0,40.593445,-4.145386,1.674861e+09,Apartamento,0,0,0,0,0,...,inmobiliaria,10.0,yes,yes,63.0,46.0,yes,1.0,A estrenar,1.0
1,116500.0,40.402079,-3.702151,1.674947e+09,Apartamento,2,2,2,2,2,...,inmobiliaria,50.0,no,yes,25.0,,no,,,1.0
2,169000.0,40.534457,-3.479415,1.673392e+09,Apartamento,0,0,0,0,0,...,inmobiliaria,,no,yes,70.0,67.0,no,1.0,En buen estado,1.0
3,174000.0,40.344358,-3.825283,1.672960e+09,Apartamento,0,0,0,0,0,...,inmobiliaria,,no,yes,60.0,50.0,no,1.0,,1.0
4,130000.0,40.347096,-3.827826,1.672615e+09,Apartamento,0,0,0,0,0,...,inmobiliaria,,no,yes,60.0,54.0,no,2.0,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15841,135000.0,40.454606,-3.455234,1.672960e+09,Piso,1,1,1,1,1,...,inmobiliaria,,no,yes,67.0,62.0,no,3.0,A reformar,1.0
15842,210000.0,40.441928,-3.473036,1.672615e+09,Piso,1,1,1,1,1,...,inmobiliaria,,yes,yes,79.0,63.0,no,1.0,En buen estado,2.0
15843,324600.0,40.448403,-3.470899,1.672874e+09,Piso,0,0,0,0,0,...,inmobiliaria,,yes,yes,140.0,,no,4.0,,3.0
15844,249900.0,40.444707,-3.473520,1.672874e+09,Piso,1,1,1,1,1,...,inmobiliaria,20.0,yes,no,106.0,96.0,no,3.0,En buen estado,2.0


In [60]:
df_text = df[['Cocina', 'Cocina equipada', 'Superficie solar', 'Sistema de seguridad', 'Portero automático',
                      'Aire acondicionado', 'Puerta blindada', 'Amueblado', 'Trastero', 'Calefacción', 'Gas',
                      'Tipo de casa', 'Piscina', 'Vidrios dobles', 'Planta', 'Tipo suelo', 'Balcón', 'Orientación',
                      'Armarios empotrados', 'Comedor']]

df_text['concatenated'] = df_text.apply(lambda row: ' '.join([str(val) if pd.notna(val) else '' for val in row]), axis=1)
df_text['concatenated'] = df_text['concatenated'].apply(lambda x: x.strip())
df_text['concatenated'] = df_text['concatenated'].apply(lambda x: ' '.join([y.strip() for y in x.split()]))

texts = df_text['concatenated'].to_list()

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['concatenated'] = df_text.apply(lambda row: ' '.join([str(val) if pd.notna(val) else '' for val in row]), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['concatenated'] = df_text['concatenated'].apply(lambda x: x.strip())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tex

In [61]:
df_text[['concatenated']]

Unnamed: 0,concatenated
0,Amueblada a estrenar Videoportero Comunitaria ...
1,Cocina 2ª
2,
3,Puerta blindada Amueblado 3ª Norte 1
4,Puerta blindada Eléctrica 1ª Oeste
...,...
15841,Sur 1
15842,Aire acondicionado Trastero Piscina Norte 1
15843,Aire acondicionado Trastero Gas natural Comuni...
15844,Portero automático Conducto Puerta blindada Tr...


In [62]:
def get_best_k(X, n_k, model = KMeans()):
    inertias = list()
    
    for k in n_k:
        model.n_clusters = k
        model.fit(X)
        inertias.append(model.inertia_)
        
    k_i = np.array([[k, i] for k, i in enumerate(inertias, start = 1)])

    k_i_max = np.max(k_i, axis=0)
    k_i_norm = k_i / k_i_max
    
    distances = [np.linalg.norm([point, [0,0]]) for point in k_i_norm]
    
    return pd.concat([pd.DataFrame([x for x in range(1,n_k[-1] + 1)], columns = ['K']), pd.DataFrame(distances, columns = ['Distance'])], axis = 1).sort_values('Distance')

best_k = get_best_k(X,range(1,61)).reset_index(drop = True)['K'].iloc[0]

  super()._check_params_vs_input(X, default_n_init=10)


In [63]:
kmeans = KMeans(n_clusters = best_k)
kmeans.fit(X)

  super()._check_params_vs_input(X, default_n_init=10)


In [64]:
df_['cluster'] = kmeans.labels_


In [138]:
df_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15846 entries, 0 to 15845
Data columns (total 24 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   price        15625 non-null  float64
 1   lat          15706 non-null  float64
 2   lng          15706 non-null  float64
 3   updated      15751 non-null  float64
 4   type         15751 non-null  object 
 5   calle        15846 non-null  int64  
 6   bulevar      15846 non-null  int64  
 7   plaza        15846 non-null  int64  
 8   autovia      15846 non-null  int64  
 9   carretera    15846 non-null  int64  
 10  parque       15846 non-null  int64  
 11  paseo        15846 non-null  int64  
 12  avenida      15846 non-null  int64  
 13  publisher    15846 non-null  object 
 14  age          3585 non-null   float64
 15  garage       15846 non-null  object 
 16  lift         15846 non-null  object 
 17  surface      15681 non-null  float64
 18  net_surface  7332 non-null   float64
 19  gard

In [183]:
df = df_.copy()

# type encoding

In [184]:
types = df['type'].unique()
types_dict = {type_ : x for type_, x in zip(types, range(len(types)))}

In [185]:
df['type'] = df['type'].replace(types_dict)

# garage, lift, garden and publisher encoding

In [186]:
df['garage'] = df['garage'].apply(lambda x: 1 if x == 'yes' else 0)
df['lift'] = df['lift'].apply(lambda x: 1 if x == 'yes' else 0)
df['garden'] = df['garden'].apply(lambda x: 1 if x == 'yes' else 0)
df['publisher'] = df['publisher'].apply(lambda x: 1 if x == 'inmobiliaria' else 0)

# condition encoding

In [187]:
conditions = df['condition'].unique()
conditions_dict = {cond_ : x for cond_, x in zip(conditions, range(len(conditions)))}

In [188]:
df['condition'] = df['condition'].replace(conditions_dict)

# TRAINING

In [230]:
from sklearn.impute import KNNImputer

df = df[~df['price'].isna()]

X = df.drop(['price', 'net_surface'], axis = 1)
y = df[['price']]

imputer = KNNImputer(n_neighbors=3)

X_imputed = imputer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size = 0.2, random_state = 42)

model = RandomForestRegressor(n_jobs = -1, random_state = 42, n_estimators = 100)

model.fit(X_train, y_train)

yhat = model.predict(X_test)

print(r2_score(y_test, yhat))
print(mean_absolute_error(y_test, yhat))
print(mean_squared_error(y_test, yhat))

  return fit_method(estimator, *args, **kwargs)


0.7998382621501169
126396.04521136002
116066005525.7284


In [231]:
feature_importances = {y : x for x, y in zip(model.feature_importances_, X.columns)}

In [232]:
def extractImportances(data, percentage):
    # Sort the dictionary items by values in descending order
    sorted_items = sorted(data.items(), key=lambda x: x[1], reverse=True)

    cumulative_sum = 0
    selected_keys = []

    for key, value in sorted_items:
        cumulative_sum += value
        selected_keys.append(key)

        if cumulative_sum >= percentage:
            break

    return selected_keys

In [233]:
importances = extractImportances(feature_importances, 0.9)

In [234]:
X = df.drop(['price'], axis = 1)[importances]
y = df[['price']]

imputer = KNNImputer(n_neighbors=3)

X_imputed = imputer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size = 0.2, random_state = 42)

model = RandomForestRegressor(n_jobs = -1, random_state = 42, n_estimators = 100)

model.fit(X_train, y_train)

yhat = model.predict(X_test)

print(r2_score(y_test, yhat))
print(mean_absolute_error(y_test, yhat))
print(mean_squared_error(y_test, yhat))

  return fit_method(estimator, *args, **kwargs)


0.8057685012122197
124145.8707503661
112627290578.78326


In [235]:
importances

['bathrooms', 'surface', 'lat', 'lng', 'lift']

In [236]:
importances = extractImportances(feature_importances, 0.8)

In [237]:
X = df.drop(['price'], axis = 1)[importances]
y = df[['price']]

imputer = KNNImputer(n_neighbors=3)

X_imputed = imputer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size = 0.2, random_state = 42)

model = RandomForestRegressor(n_jobs = -1, random_state = 42, n_estimators = 100)

model.fit(X_train, y_train)

yhat = model.predict(X_test)

print(r2_score(y_test, yhat))
print(mean_absolute_error(y_test, yhat))
print(mean_squared_error(y_test, yhat))

  return fit_method(estimator, *args, **kwargs)


0.8320141772119176
121207.5090751093
97408444018.34808


In [238]:
importances

['bathrooms', 'surface', 'lat', 'lng']