# Random Forest Regressor Model

## Imports

In [None]:
import pandas as pd
import numpy as np

from sklearn import metrics as mt
from sklearn import model_selection as ms
from sklearn.pipeline import Pipeline
from sklearn import ensemble as en

## Dataset

In [6]:
# Dados Treinamento
X_train = pd.read_csv("/home/ds_deivisson/repos/datasets/X_training_regressao.csv")
y_train = pd.read_csv("/home/ds_deivisson/repos/datasets/y_training_regressao.csv").squeeze()

# Dados Validacao
X_val = pd.read_csv("/home/ds_deivisson/repos/datasets/X_validation_regressao.csv")
y_val = pd.read_csv("/home/ds_deivisson/repos/datasets/y_validation_regressao.csv").squeeze()

# Dados Teste
X_test = pd.read_csv("/home/ds_deivisson/repos/datasets/X_test_regressao.csv")
y_test = pd.read_csv("/home/ds_deivisson/repos/datasets/y_test_regressao.csv").squeeze()

## Função de métricas

In [7]:
def regression_metrics(y_true, y_pred, prefix=""):
    return {
        f"R2{prefix}": mt.r2_score(y_true, y_pred),
        f"MSE{prefix}": mt.mean_squared_error(y_true, y_pred),
        f"RMSE{prefix}": np.sqrt(mt.mean_squared_error(y_true, y_pred)),
        f"MAE{prefix}": mt.mean_absolute_error(y_true, y_pred),
        f"MAPE{prefix}": mt.mean_absolute_percentage_error(y_true, y_pred),
    }

## Pipeline do Modelo

In [11]:
pipelines = {
    "rf_regressor": Pipeline([   
        ("model", en.RandomForestRegressor(random_state=42, n_jobs=-1))
    ])
}

## Grid de hiperparâmetros (fine tuning)

In [12]:
param_grids = {
    "rf_regressor": {
        "model__n_estimators": [100, 200, 300],
        "model__max_depth": [None, 5, 10, 20],
        "model__min_samples_leaf": [1, 5, 10, 20],
        "model__max_features": ["sqrt", "log2", 0.7]
    }
} 

## Tuning com validação (GridSearch)

In [13]:
best_models = {}

for name, pipe in pipelines.items():
    print(f"\nTuning model: {name}")

    gs = ms.GridSearchCV(
        estimator=pipe,
        param_grid=param_grids[name],
        scoring="neg_root_mean_squared_error",
        cv=5,
        n_jobs=-1,
        verbose=1
    )

    gs.fit(X_train, y_train)

    best_models[name] = gs.best_estimator_

    print("Best params:", gs.best_params_)


Tuning model: rf_regressor
Fitting 5 folds for each of 144 candidates, totalling 720 fits
Best params: {'model__max_depth': None, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 1, 'model__n_estimators': 300}


In [None]:
results = pd.DataFrame(gs.cv_results_)

results[[
    "param_model__max_depth",
    "mean_test_score",
    "std_test_score"
]].sort_values("mean_test_score", ascending=False).head(10)

Unnamed: 0,param_model__max_depth,mean_test_score,std_test_score
3,5,-21.051599,0.233111
2,4,-21.108964,0.245201
4,6,-21.156823,0.289928
5,7,-21.25971,0.286432
1,3,-21.264923,0.257457
0,2,-21.419304,0.279321
6,8,-21.563622,0.261652
7,9,-21.726489,0.359703
8,10,-22.147473,0.398848
9,11,-22.559696,0.372281


## Avaliação — Treino e Validação

In [14]:
results = {}

for name, model in best_models.items():
    print(f"\nModel: {name}")

    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)

    metrics_train = regression_metrics(y_train, y_train_pred, "_train")
    metrics_val = regression_metrics(y_val, y_val_pred, "_val")

    results[name] = {**metrics_train, **metrics_val}

    for k, v in results[name].items():
        print(f"{k}: {v:.4f}")


Model: rf_regressor
R2_train: 0.9066
MSE_train: 44.6614
RMSE_train: 6.6829
MAE_train: 4.7893
MAPE_train: 2.6225
R2_val: 0.3463
MSE_val: 312.1678
RMSE_val: 17.6683
MAE_val: 12.8822
MAPE_val: 7.0244


## Treinamento final (Train + Val)

In [15]:
X_final = pd.concat([X_train, X_val])
y_final = np.concatenate([y_train, y_val])

In [16]:
final_models = {}

for name, model in best_models.items():
    model.fit(X_final, y_final)
    final_models[name] = model

## Avaliação final — Teste

In [17]:
test_results = {}

for name, model in final_models.items():
    print(f"\nFinal Test - Model: {name}")

    y_test_pred = model.predict(X_test)
    metrics_test = regression_metrics(y_test, y_test_pred, "_test")

    test_results[name] = metrics_test

    for k, v in metrics_test.items():
        print(f"{k}: {v:.4f}")


Final Test - Model: rf_regressor
R2_test: 0.4070
MSE_test: 288.7324
RMSE_test: 16.9921
MAE_test: 12.1810
MAPE_test: 6.4365
