# PREPROCESSING

In [81]:
import pandas as pd
import numpy as np
import ast

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

from parsing import *

In [82]:
df = pd.read_csv('madrid.csv')
df.drop('Unnamed: 0', axis = 1, inplace = True)

In [83]:
df['price'] = df['price'].map(lambda x: tryInt(x))
df['characteristics'] = df['characteristics'].apply(tryLiteralEval)

In [84]:
data = freeChurro(df)

In [85]:
df_characteristics = pd.DataFrame(data)

In [86]:
df = pd.concat([df, df_characteristics], axis = 1).drop('characteristics', axis = 1)

In [87]:
# Está guardado el DataFrame que tiene ya todas las características como columnas:
# df = pd.read_csv('madrid_parsed.csv')
# df

In [88]:
df_text = df[['Cocina', 'Cocina equipada', 'Superficie solar', 'Sistema de seguridad', 'Portero automático',
                      'Aire acondicionado', 'Puerta blindada', 'Amueblado', 'Trastero', 'Calefacción', 'Gas',
                      'Tipo de casa', 'Piscina', 'Vidrios dobles', 'Planta', 'Tipo suelo', 'Balcón', 'Orientación',
                      'Armarios empotrados', 'Comedor']]

df_text['concatenated'] = df_text.apply(lambda row: ' '.join([str(val) if pd.notna(val) else '' for val in row]), axis=1)
df_text['concatenated'] = df_text['concatenated'].apply(lambda x: x.strip())
df_text['concatenated'] = df_text['concatenated'].apply(lambda x: ' '.join([y.strip() for y in x.split()]))

texts = df_text['concatenated'].to_list()

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['concatenated'] = df_text.apply(lambda row: ' '.join([str(val) if pd.notna(val) else '' for val in row]), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_text['concatenated'] = df_text['concatenated'].apply(lambda x: x.strip())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tex

In [89]:
df_text[['concatenated']]

Unnamed: 0,concatenated
0,Amueblada a estrenar Videoportero Comunitaria ...
1,Cocina 2ª
2,
3,Puerta blindada Amueblado 3ª Norte 1
4,Puerta blindada Eléctrica 1ª Oeste
...,...
15832,Sur 1
15833,Aire acondicionado Trastero Piscina Norte 1
15834,Aire acondicionado Trastero Gas natural Comuni...
15835,Portero automático Conducto Puerta blindada Tr...


In [90]:
def get_best_k(X, n_k, model = KMeans()):
    inertias = list()
    
    for k in n_k:
        model.n_clusters = k
        model.fit(X)
        inertias.append(model.inertia_)
        
    k_i = np.array([[k, i] for k, i in enumerate(inertias, start = 1)])

    k_i_max = np.max(k_i, axis=0)
    k_i_norm = k_i / k_i_max
    
    distances = [np.linalg.norm([point, [0,0]]) for point in k_i_norm]
    
    return pd.concat([pd.DataFrame([x for x in range(1,n_k[-1] + 1)], columns = ['K']), pd.DataFrame(distances, columns = ['Distance'])], axis = 1).sort_values('Distance')

best_k = get_best_k(X,range(1,61)).reset_index(drop = True)['K'].iloc[0]

  super()._check_params_vs_input(X, default_n_init=10)


KeyboardInterrupt: 

In [91]:
kmeans = KMeans(n_clusters = best_k)
kmeans.fit(X)

In [92]:
df['cluster'] = kmeans.labels_


In [93]:
df = df[['price', 'lat', 'lng', 'Baños', 'Habitaciones', 'Superficie construida', 'Conservación', 'cluster']].dropna()


df['Superficie construida'] = df['Superficie construida'].apply(lambda x: int(''.join(x[:-3].split('.'))))
df = pd.concat([df.drop('Conservación', axis = 1), pd.get_dummies(df['Conservación'], drop_first = True)], axis = 1)

# TRAINING

In [94]:
X = df.drop(['price'], axis = 1)
y = df[['price']]

In [95]:
model = RandomForestRegressor(n_jobs = -1, random_state = 42, n_estimators = 100)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

model.fit(X_train, y_train)

  return fit_method(estimator, *args, **kwargs)


In [96]:
yhat = model.predict(X_test)

In [97]:
print(r2_score(y_test, yhat))
print(mean_absolute_error(y_test, yhat))
print(mean_squared_error(y_test, yhat))

0.8343201565271363
114925.66048164936
102922396751.57172
