# Leitura

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error as mae, mean_absolute_percentage_error as mape, mean_squared_error as mse
pd.set_option('display.max_columns', None)

In [None]:
def rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions.

    Computes the ``mse`` metric imported at the top of the notebook and
    takes its square root with NumPy.
    """
    squared_error = mse(y_true, y_pred)
    return np.sqrt(squared_error)

In [None]:
# Load the listings dataset and preview its first five rows.
df = pd.read_parquet(path="../data/output/listings.parquet")
df.head(n=5)

In [None]:
# Show the row(s) whose `url` column exactly matches this one listing URL.
df.query("url == 'https://www.vivareal.com.br/imovel/aluguel-apartamento-2-quartos-tijuca-zona-norte-rio-de-janeiro-rj-64m2-id-2495611515/'")

In [None]:
# Dataset dimensions as (rows, columns).
df.shape

# Exploração

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Summary of the total_fee distribution: min, 1st percentile, quartiles,
# 99th percentile and max.
qt = df["total_fee"].quantile([0, .01, .25, .5, .75, .99, 1])
qt

In [None]:
# Histogram of total_fee on its raw scale.
df.total_fee.hist()

In [None]:
# Histogram of total_fee on a natural-log scale.
# NOTE(review): np.log maps a fee of 0 to -inf, which a histogram cannot bin;
# the modelling cells below use np.log1p / np.expm1 instead - confirm whether
# zeros can occur here and whether log1p should be used for consistency.
np.log(df.total_fee).hist()

# Treino e teste

In [None]:
from sklearn.model_selection import KFold, train_test_split
import lightgbm as lgb
import xgboost as xgb
from sklearn.linear_model import BayesianRidge, ElasticNet, LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, ExtraTreesRegressor

In [None]:
# Target vector: the total_fee column as an array.
y = df["total_fee"].values

# Feature matrix: drop the free-text, media, address-detail and contact
# columns plus the fee columns (total_fee is the target; price and condo_fee
# presumably leak it - confirm), one-hot encode what remains and mark missing
# values with -1.
X = pd.get_dummies(
    df.drop(
        columns=[
            "title",
            "description",
            "media",
            "street",
            "streetNumber",
            "complement",
            "amenities",
            "advertiserContact_phones",
            "whatsappNumber",
            "price",
            "condo_fee",
            "total_fee",
        ]
    )
).fillna(-1)

# Eyeball three random rows of the encoded features.
X.sample(3)

In [None]:
import matplotlib.pyplot as plt

class Model:
    """Cross-validated evaluation wrapper around a regressor.

    The wrapped estimator only needs ``fit(X, y)`` (returning the fitted
    estimator) and ``predict(X)``.
    """

    def __init__(self, model):
        # Stored as-is (not cloned): every fold re-fits this same instance.
        self.base_model = model

    def fit_predict(self, X, y, n_folds=5, clip=True, prep = lambda x: x, posp = lambda x: x, plot=True):
        """Fit on each K-fold training split and predict its held-out split.

        Args:
            X: Feature table; rows are selected positionally via ``.iloc``.
            y: Target values (array-like); converted to a NumPy array.
            n_folds: Number of ``KFold`` splits (created without shuffling).
            clip: When true, the training targets of each fold are clipped
                to the range ``[0, 99th percentile of that fold]`` before
                fitting. Evaluation always uses the unclipped ``y``.
            prep: Transform applied to the training targets before fitting.
            posp: Inverse transform applied to the model's predictions.
            plot: When true, draw histograms of the true values, the
                predictions and the errors side by side.

        Returns:
            NumPy array with one out-of-fold prediction per row of ``X``.

        Side effects:
            Prints a dict with rmse, mse, mae and mape of the predictions.
        """
        # Fold indices are positional. Indexing a pandas Series with them
        # would be label-based, so always work on a plain array.
        y = np.asarray(y)
        y_predict = np.zeros(y.shape[0])

        for train_index, test_index in KFold(n_splits=n_folds).split(X):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train = y[train_index]

            if clip:
                # Cap extreme training targets at the fold's 99th percentile.
                upper = np.quantile(y_train, .99)
                y_train = np.clip(y_train, 0, upper)

            model = self.base_model.fit(X_train, prep(y_train))
            y_predict[test_index] = posp(model.predict(X_test))

        print({metric.__name__: round(metric(y, y_predict), 4) for metric in [rmse, mse, mae, mape]})

        if plot:
            fig, axs = plt.subplots(ncols=3, figsize=(30,5))
            pd.Series(y).hist(ax=axs[0])
            axs[0].set_title('True')
            pd.Series(y_predict).hist(ax=axs[1])
            axs[1].set_title('Predict')
            pd.Series(y - y_predict).hist(ax=axs[2])
            axs[2].set_title('Error')

        return y_predict

In [None]:
# Benchmark every candidate regressor under all four combinations of target
# clipping and log1p/expm1 target transformation (plots disabled).
for model in (
    lgb.LGBMRegressor(),
    xgb.XGBRegressor(),
    LinearRegression(),
    GradientBoostingRegressor(),
    RandomForestRegressor(),
    ExtraTreesRegressor(),
):
    for clip, prep in [(c, p) for c in (True, False) for p in (True, False)]:
        # Label the metrics line that fit_predict prints right after.
        print(str(model), "clip: ", clip, "prep: ", prep, end=" | ")
        _ = Model(model).fit_predict(
            X,
            y,
            clip=clip,
            prep=np.log1p if prep else lambda x: x,
            posp=np.expm1 if prep else lambda x: x,
            plot=False,
        )

In [None]:
# Out-of-fold LightGBM predictions with clipped, log1p-transformed targets;
# predictions are mapped back to the original scale with expm1.
y_pred = Model(lgb.LGBMRegressor()).fit_predict(
    X,
    y,
    clip=True,
    prep=np.log1p,
    posp=np.expm1,
)

In [None]:
# Histogram of the predictions returned by fit_predict.
pd.Series(y_pred).hist()

In [None]:
# Residuals: true value minus prediction (positive means under-prediction).
error = (y - y_pred)

In [None]:
# Display the residual array.
error

In [None]:
# Boolean mask selecting the position(s) with the largest absolute residual.
maior_erro = np.abs(error) == np.max(np.abs(error))

In [None]:
# True target value(s) at the largest-error position(s).
y[maior_erro]

In [None]:
# Predicted value(s) at the largest-error position(s).
y_pred[maior_erro]

In [None]:
# Feature row(s) of the observation(s) with the largest absolute error.
X[maior_erro]