In [None]:
# Imports principais
import pandas as pd
import numpy as np

# Modelos
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Métricas
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Visualização
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import json

def treinar_e_avaliar(modelo, nome_modelo, parametros, x_train, y_train, x_val, y_val):
    """
    Treina e avalia um modelo com GridSearchCV e salva métricas para cada configuração testada em um arquivo JSON.
    """
    gs = GridSearchCV(modelo, parametros, cv=5, scoring='r2', n_jobs=-1, return_train_score=True)
    gs.fit(x_train, y_train)
    
    # Lista para armazenar os resultados temporariamente
    resultados = []

    # Armazena cada configuração rodada (não apenas o melhor)
    for i, params in enumerate(gs.cv_results_['params']):
        r2_train = gs.cv_results_['mean_train_score'][i]
        r2_val = gs.cv_results_['mean_test_score'][i]
        
        # Após GridSearch, avaliar com predição real na validação (para MAE e RMSE)
        modelo_temp = gs.estimator.set_params(**params)
        modelo_temp.fit(x_train, y_train)
        y_pred_val = modelo_temp.predict(x_val)

        mae_val = mean_absolute_error(y_val, y_pred_val)
        rmse_val = np.sqrt(mean_squared_error(y_val, y_pred_val))

        resultados.append({
            "Modelo": nome_modelo,
            "Configuracao": params,
            "R2_Train": r2_train,
            "R2_Val": r2_val,
            "MAE_Val": mae_val,
            "RMSE_Val": rmse_val
        })

    # Salva os resultados no arquivo JSON
    with open(f'data/models_results/{nome_modelo}.json', 'w') as f:
        json.dump(resultados, f, indent=4)

    return gs.best_estimator_, gs.best_params_


In [23]:
# chama os df de treino e validação
x_train = pd.read_csv('data/processed/x_train_encoded.csv')
x_val = pd.read_csv('data/processed/x_validation_encoded.csv')
y_train = pd.read_csv('data/processed/y_train.csv').values.ravel()
y_val = pd.read_csv('data/processed/y_validation.csv').values.ravel()

In [15]:
from sklearn.ensemble import RandomForestRegressor

parametros_rf = {
    "n_estimators": [50, 60, 70, 80, 90, 100],
    "max_depth": [None, 5, 10, 15],
    "min_samples_split": [2, 5]
}

best_model_rf, best_params_rf = treinar_e_avaliar(
    RandomForestRegressor(random_state=42), 
    "RandomForest", 
    parametros_rf, 
    x_train, y_train, x_val, y_val
)

print("Melhor Configuração RF:", best_params_rf)


Melhor Configuração RF: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}


In [18]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV, KFold
from math import sqrt

# Função genérica para avaliação
def avaliar_modelo(modelo, X_val, y_val, nome_modelo):
    y_pred = modelo.predict(X_val)
    r2 = r2_score(y_val, y_pred)
    mae = mean_absolute_error(y_val, y_pred)
    rmse = sqrt(mean_squared_error(y_val, y_pred))
    return {"Modelo": nome_modelo, "R²": r2, "MAE": mae, "RMSE": rmse}

def treinar_linear_regression():
    model = LinearRegression()
    model.fit(x_train, y_train)
    return avaliar_modelo(model, x_val, y_val, "Linear Regression")

resultados_lr = treinar_linear_regression()
print(resultados_lr)

{'Modelo': 'Linear Regression', 'R²': 0.8571124093419417, 'MAE': 3.2037454471316242, 'RMSE': 3.9197358475976176}


In [33]:
from sklearn.ensemble import GradientBoostingRegressor

parametros_gb = {
    "n_estimators": [100, 150, 200, 250, 300],
    "learning_rate": [0.05, 0.1],
    "max_depth": [3, 5]
}

best_model_gb, best_params_gb = treinar_e_avaliar(
    GradientBoostingRegressor(random_state=42),
    "Gradient Boosting",
    parametros_gb,
    x_train, y_train,
    x_val, y_val
)
print("Melhor Configuração GB:", best_params_gb)

Melhor Configuração GB: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}


In [35]:
from sklearn.neural_network import MLPRegressor

parametros_mlp = {
    "hidden_layer_sizes": [(50,), (100,), (50,50), (25,50), (25,15,10)],
    "activation": ["relu", "tanh"],
    "learning_rate_init": [0.001, 0.01]
}

best_model_mlp, best_params_mlp = treinar_e_avaliar(
    MLPRegressor(max_iter=10000, random_state=42),
    "MLP",
    parametros_mlp,
    x_train, y_train,
    x_val, y_val
)
print("Melhor Configuração MLP:", best_params_mlp)

Melhor Configuração MLP: {'activation': 'relu', 'hidden_layer_sizes': (100,), 'learning_rate_init': 0.001}


In [34]:
from sklearn.neighbors import KNeighborsRegressor

parametros_knn = {
    "n_neighbors": [3, 5, 7, 9, 10, 12, 15],
    "weights": ["uniform", "distance"]
}

best_model_knn, best_params_knn = treinar_e_avaliar(
    KNeighborsRegressor(),
    "KNN",
    parametros_knn,
    x_train, y_train,
    x_val, y_val
)
print("Melhor Configuração KNN:", best_params_knn)

Melhor Configuração KNN: {'n_neighbors': 12, 'weights': 'distance'}


In [36]:
from lightgbm import LGBMRegressor

parametros_lgbm = {
    "n_estimators": [50, 100, 150, 200, 250, 300],
    "num_leaves": [31, 50, ],
    "learning_rate": [0.001, 0.01, 0.1, 1],
    "max_depth": [-1, 5, 10]
}

best_model_lgbm, best_params_lgbm = treinar_e_avaliar(
    LGBMRegressor(random_state=42),
    "LightGBM",
    parametros_lgbm,
    x_train, y_train,
    x_val, y_val
)

print("Melhor Configuração LGBM:", best_params_lgbm)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003938 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003686 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1984
[LightGBM] [Info] Total Bins 1981
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003947 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1982
[LightGBM] [Info] Number of data points in the train set: 5200, number of used features: 24
[LightGBM] [Info] Number of data points in the train set: 5200, number of used features: 24
[LightGBM] [Info] Number of data points in the train set: 5200, number of used features: 24
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002527 seconds.
You can set `force_col_wise=true

In [37]:
from xgboost import XGBRegressor

parametros_xgb = {
    "n_estimators": [100, 150, 200, 250, 300],
    "max_depth": [3, 5, 7, 10, 20],
    "learning_rate": [0.001, 0.01, 0.1],
    "subsample": [0.8, 1.0],
    "colsample_bytree": [0.8, 1.0]
}

best_model_xgb, best_params_xgb = treinar_e_avaliar(
    XGBRegressor(random_state=42, n_jobs=-1),
    "XGBoost",
    parametros_xgb,
    x_train, y_train,
    x_val, y_val
)

print("Melhor Configuração XGB:", best_params_xgb)



Melhor Configuração XGB: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 300, 'subsample': 0.8}
