# Packages

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
import plotly.express as px
from matplotlib import pyplot as plt
import pickle


**"bairro_group_type_Manhattan", "longitude", "room_type_type_Entire home/apt" e "room_type_type_Private room"**

# Loading and preparing data for models

In [17]:
# Load the raw pricing dataset from the project's assets folder.
df = pd.read_csv(filepath_or_buffer="../../assets/teste_indicium_precificacao.csv")

In [18]:
def _review_timestamp(value):
    """Return the POSIX timestamp of a '%Y-%m-%d' date string, or None if the value is missing."""
    if pd.isnull(value):
        return None
    return datetime.strptime(value, '%Y-%m-%d').timestamp()


# Keep only listings priced in (0, 230], one-hot encode the two categorical
# columns, add a numeric timestamp for the last review, then drop every row
# that still contains a missing value in any column.
valid_price = df['price'].gt(0) & df['price'].le(230)
df_pp = pd.get_dummies(
    df[valid_price],
    columns=['bairro_group', 'room_type'],
    prefix=['bairro_group_type', 'room_type_type'],
    dtype=int,
)
df_pp['ultima_review_timestamp'] = df_pp['ultima_review'].apply(_review_timestamp)
df_pp = df_pp.dropna()


In [19]:
# Feature matrix (every column except the target) and target vector.
X = df_pp.drop("price", axis=1)
# Alternative kept for reference: train on the reduced feature subset only.
# X = df_pp[['longitude', 'room_type_type_Entire home/apt', 'room_type_type_Private room', 'bairro_group_type_Manhattan']]
y = df_pp.price

# Keep the non-object columns as floats and drop the first two of them by
# position (presumably identifier columns such as id / host_id -- TODO confirm
# against the CSV schema).
X_num = X.select_dtypes(exclude=["object"]).astype('float').iloc[:, 2:]
columns = X_num.columns

# Split BEFORE scaling: fitting the scaler on the full dataset would leak the
# min/max of the held-out rows into the training features.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_num, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply it to both splits.
# The original row index is preserved so features stay aligned with y.
mms = MinMaxScaler()
X_train = pd.DataFrame(mms.fit_transform(X_train_raw), columns=columns, index=X_train_raw.index)
X_test = pd.DataFrame(mms.transform(X_test_raw), columns=columns, index=X_test_raw.index)

# Scaled version of every row, kept under its original name for any later use.
# Values of non-training rows may fall slightly outside [0, 1].
X_norm = pd.DataFrame(mms.transform(X_num), columns=columns, index=X_num.index)

# Declaring models for gridsearchcv

Como o método de correlação usado foi o de Pearson, que mede a correlação linear entre as características do dataset, também serão usados aqui modelos lineares.

## LinearRegression

In [20]:
# LinearRegression is searched with its default settings only (empty grid).
lin_params = dict()

## Ridge

In [21]:
# Grid for Ridge: a single candidate per hyper-parameter.
ridge_params = dict(
    alpha=[0.01],      # regularization strength
    solver=['saga'],   # optimization routine
    max_iter=[1000],   # iteration cap for the solver
)

## SGDRegressor

In [22]:
# Grid for SGDRegressor.
# 'squared_error' replaces the removed 'squared_loss' name (needs scikit-learn >= 1.0),
# and the "no penalty" option is spelled None because the string 'none' was removed.
# Left as-is, those candidates fail to fit and are silently scored as NaN.
sgd_params = {
    'loss': ['squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],  # Loss function to be optimized
    'penalty': [None, 'l2', 'l1', 'elasticnet'],  # Regularization penalty (None = no regularization)
    'alpha': [0.1],  # Regularization strength
    'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],  # Learning rate schedule
    'max_iter': [1000],  # Maximum number of iterations
    'tol': [1e-3]  # Tolerance for the stopping criterion
}

# Grid search

In [23]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import SGDRegressor


In [24]:
# Parallel lists: estimators to tune and the parameter grid for each one.
# Ridge (with the 'saga' solver) and SGDRegressor shuffle the data internally,
# so they are seeded to make the cross-validation results reproducible.
# The seed matches the one used for the train/test split.
model_params = ([LinearRegression(), Ridge(random_state=42), SGDRegressor(random_state=42)],
                [lin_params, ridge_params, sgd_params])

In [None]:
# Run one cross-validated grid search per (estimator, grid) pair, report the
# winner of each search and collect a summary record for later comparison.
# Scores are negated RMSE values, so the highest (closest to zero) is best.
list_best_models_params = []
for model, params in zip(*model_params):
    gs = GridSearchCV(estimator=model, param_grid=params, scoring='neg_root_mean_squared_error')
    gs.fit(X_train, y_train)

    model_name = type(model).__name__
    print(f"Best CV results for {model_name}")
    print(f"Best Score of train set: {gs.best_score_}")
    print(f"Best estimator: {gs.best_estimator_}")
    print(f"Best parameter set: {gs.best_params_}")

    list_best_models_params.append(
        {
            'model_name': model_name,
            'best_score': gs.best_score_,
            'best_estimator': gs.best_estimator_,
            'best_params': gs.best_params_,
        }
    )

# Persist the per-model summary and display it as the cell output.
df_best_models_params = pd.DataFrame(list_best_models_params)
df_best_models_params.to_csv('../../assets/best_models_params_cv.csv', index=False)

df_best_models_params

In [26]:
# Refit the final Ridge model with hard-coded hyper-parameters (the same values
# as the single candidate in the Ridge grid), so this cell does not require
# re-running the grid search. The 'saga' solver shuffles the data, so it is
# seeded to make the saved model reproducible.
best_model = Ridge(alpha=0.01, max_iter=1000, solver='saga', random_state=42).fit(X_train, y_train)

In [27]:
# Serialize the fitted estimator to disk so it can be reloaded without retraining.
# NOTE(review): only the model is saved; the MinMaxScaler fitted earlier is not,
# so callers must reproduce the same feature scaling before predicting.
with open('../../assets/supervised_best_model.pkl', mode='wb') as model_file:
    pickle.dump(best_model, model_file)
