# PREPROCESSING

In [38]:
import pandas as pd
import numpy as np
import ast

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import RadiusNeighborsRegressor
from sklearn.svm import SVR
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

from processing.parsing import *
from processing.encodings import *

In [39]:
df_ = pd.read_parquet('processed_data/tatarabuela.parquet')
df_ = df_[['price', 'lat', 'lng', 'characteristics', 'agency', 'updated', 'timestamp']].dropna()
df_churro = freeChurro(df_)
df = pd.concat([df_.reset_index(drop=True), df_churro], axis = 1)
df.drop('characteristics', axis = 1, inplace = True)

In [40]:
def tryParseM2(obj):
    try:
        if 'm²' in obj:
            return float(obj[:-2])
        return float(''.join(obj.split('.')))
    except:
        return np.nan

# df['Superficie construida'].apply(lambda x : x if pd.isna(x) else tryParseM2(x))

In [41]:
df.Baños = df.Baños.astype(float)
df['Superficie construida'] = df['Superficie construida'].apply(lambda x : x if pd.isna(x) else tryParseM2(x))

df['bathrooms'] = df['Baños']
df['surface'] = df['Superficie construida']

df.drop(['Superficie construida', 'Baños'], axis=1, inplace=True)

In [42]:
df = df[['price', 'lat', 'lng', 'Habitaciones', 'Jardín', 'Gastos de comunidad', 'Antigüedad', 'Superficie útil', 'Ascensor', 'Garaje', 'Conservación', 'agency', 'updated', 'timestamp', 'surface', 'bathrooms']]

In [43]:
len(df.agency.unique())

10676

In [44]:
df['rooms'] = df['Habitaciones']
df = df.drop('Habitaciones', axis = 1)

df['garden'] = df['Jardín']
df = df.drop('Jardín', axis = 1)

df['community_expenses'] = df['Gastos de comunidad']
df = df.drop('Gastos de comunidad', axis = 1)

df['age'] = df['Antigüedad']
df = df.drop('Antigüedad', axis = 1)

df['useful_surface'] = df['Superficie útil']
df = df.drop('Superficie útil', axis = 1)

df['elevator'] = df['Ascensor']
df = df.drop('Ascensor', axis = 1)

df['garage'] = df['Garaje']
df = df.drop('Garaje', axis = 1)

df['state'] = df['Conservación']
df = df.drop('Conservación', axis = 1)

In [45]:
df = df.drop(['agency', 'updated', 'timestamp', 'community_expenses'], axis = 1)

In [46]:
df.rooms = df.rooms.astype(float)

In [47]:
df.garden = df.garden.fillna(False).apply(lambda x: True if x else False)

In [48]:
age_col = df['age'].unique()
ages = {' Menos de 5 años': 0,
                ' Entre 5 y 10 años' : 5,
                ' Entre 10 y 20 años' : 10,
                ' Entre 20 y 30 años' : 20,
                ' Entre 30 y 50 años' : 30,
                ' Más de 50 años': 50,}

In [49]:
df['age'] = df['age'].replace(ages)
df['age'] = df['age'].fillna(df['age'].mean()) # PROBAR IMPUTAR CON KNN

In [50]:
df['useful_surface'] = df['useful_surface'].apply(lambda x : x if pd.isna(x) else tryParseM2(x))

In [51]:
df.elevator = df.elevator.fillna(False).apply(lambda x: True if x else False)

In [52]:
df.garage = df.garage.fillna(False).apply(lambda x: True if x else False)

In [53]:
df.state = df.state.fillna("En buen estado")

In [54]:
df_num = df._get_numeric_data()
df_cat = df.drop(df_num.columns, axis = 1)
df_cat, encodings = targetEncoding(df_cat, df_num['price'], 'target_median')
df = pd.concat([df_num, df_cat], axis = 1)

In [55]:
df_sample = df.sample(round(df.shape[0]*0.15), random_state=42)
df_ = df.reset_index()[~df.reset_index()['index'].isin(df_sample.reset_index()['index'])].drop(['index'], axis=1)

In [56]:
df_ = outliersFilter(df = df_, min_price = 50_000, max_price = 5_000_000, max_baths = 15, max_surface = 1_000)

In [57]:
# df = df_.copy()

# type encoding

In [58]:
# # df_type, encodings_type = targetEncoding(df[['type']], df['price'], condition = 'target_mean')
# df_type, encodings_type = frequencyEncoding(df[['type']])
# df['type'] = df_type['type']

# garage, lift, garden and publisher encoding

In [59]:
# df = binaryEncoding(df)

# condition encoding

In [60]:
# df_condition, encodings_condition = frequencyEncoding(df[['condition']])
# df['condition'] = df_condition['condition']

In [61]:
# df_surface = df[['surface', 'net_surface']].dropna()
# np.corrcoef(df_surface['surface'], df_surface['net_surface'])

In [62]:
df_.dropna().shape[0]/df_.shape[0]

0.6636716520689183

# TRAINING

In [63]:
df_ = df_[~df_['price'].isna()].dropna()

X = df_.drop(['price'], axis = 1)
y = df_[['price']]

In [64]:
imputer = KNNImputer(n_neighbors=3)

# X = imputer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

model = RandomForestRegressor(n_jobs = -1, random_state = 42, n_estimators = 100)

model.fit(X_train, y_train)

yhat = model.predict(X_test)

pd.DataFrame([[r2_score(y_test, yhat),
              mean_absolute_error(y_test, yhat),
              mean_squared_error(y_test, yhat)]],
             columns = ['r2', 'mae', 'mse'])

  return fit_method(estimator, *args, **kwargs)


Unnamed: 0,r2,mae,mse
0,0.737474,81036.039059,35650860000.0


# Validation

In [65]:
# df_sample['type'] = df_sample['type'].replace(encodings_type['type'])

# df_sample = binaryEncoding(df)

# df_sample['condition'] = df_sample['condition'].replace(encodings_condition['condition'])

df_sample['state'].replace(encodings['state'])

df_validation = df_sample.dropna()

In [66]:
df_validation

Unnamed: 0,price,lat,lng,surface,bathrooms,rooms,garden,age,useful_surface,elevator,garage,state
171709,210000.0,41.681706,2.792384,153.0,2.0,3.0,False,25.1211,120.0,True,False,190000.0
274407,200000.0,41.655024,-0.888262,140.0,1.0,4.0,False,25.1211,125.0,True,False,197000.0
109220,139000.0,37.392931,-1.945951,110.0,2.0,3.0,False,25.1211,100.0,True,True,197000.0
11536,84000.0,38.357757,-0.484963,85.0,1.0,2.0,False,25.1211,80.0,False,False,197000.0
51355,350000.0,37.094700,-4.387500,140.0,2.0,8.0,True,20.0000,140.0,False,True,200000.0
...,...,...,...,...,...,...,...,...,...,...,...,...
634,280000.0,38.078408,-0.654891,80.0,2.0,2.0,True,25.1211,65.0,True,False,197000.0
119298,50000.0,43.204895,-5.791440,86.0,1.0,3.0,False,30.0000,86.0,False,False,190000.0
183262,85000.0,37.158049,-3.531664,77.0,1.0,3.0,False,25.1211,67.0,True,False,190000.0
181573,99000.0,37.219500,-3.688100,128.0,1.0,4.0,False,25.1211,105.0,False,False,190000.0


In [67]:
X_val = df_validation.drop(['price'], axis = 1)
y_val = df_validation[['price']]

yhat = model.predict(X_val)

pd.DataFrame([[r2_score(y_val, yhat),
              mean_absolute_error(y_val, yhat),
              mean_squared_error(y_val, yhat)]],
             columns = ['r2', 'mae', 'mse'])

Unnamed: 0,r2,mae,mse
0,0.61052,85603.197519,80600640000.0


In [77]:
print(type(X), X.shape)
print(type(X_val), X_val.shape)
print(type(y), y.shape)
print(type(y_val), y_val.shape)

<class 'pandas.core.frame.DataFrame'> (140711, 11)
<class 'pandas.core.frame.DataFrame'> (26089, 11)
<class 'pandas.core.frame.DataFrame'> (140711, 1)
<class 'pandas.core.frame.DataFrame'> (26089, 1)


In [69]:
import pickle as pkl

with open('./model.pkl', 'bw') as file:
    pkl.dump(model, file)

In [78]:
X = X.to_numpy().astype(np.float32)
y = y.to_numpy().astype(np.float32)
X_val = X_val.to_numpy().astype(np.float32)
y_val = y_val.to_numpy().astype(np.float32)

In [80]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Input, Dense
from keras.metrics import R2Score, MeanSquaredLogarithmicError, MeanSquaredError, MeanAbsoluteError

with tf.device("/GPU:0"):

    model = Sequential()
    
    model.add(Input(shape=(None, 11)))
    model.add(Dense(128*8, activation='relu', kernel_initializer='he_normal'))
    model.add(Dense(128*8, activation='relu', kernel_initializer='he_normal'))
    model.add(Dense(64*8, activation='relu', kernel_initializer='he_normal'))
    model.add(Dense(64*4, activation='relu', kernel_initializer='he_normal'))
    model.add(Dense(64*2, activation='relu', kernel_initializer='he_normal'))
    model.add(Dense(64, activation='relu', kernel_initializer='he_normal'))
    model.add(Dense(1, kernel_initializer='he_normal'))

    model.compile(optimizer = 'adam', loss = 'mse', metrics = [R2Score(), MeanSquaredLogarithmicError(), MeanAbsoluteError()])

    history = model.fit(X, y, epochs = 50, validation_data=(X_val, y_val), batch_size=64, validation_batch_size=64)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50

KeyboardInterrupt: 

In [None]:
import plotly.express as px
fig = px.line(pd.DataFrame(history.history))

In [None]:
fig