<a href="https://colab.research.google.com/github/daycardoso/PredictCost/blob/main/PredicCostRandomForestRegressor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive inside the Colab runtime. The dataset and the saved
# results used by the cells below are read from / written to paths under
# /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
# Read the unified dataset from the mounted Drive folder.
df_unificado = pd.read_csv(
    '/content/drive/MyDrive/Trabalho ML Mestrado 01-2025/df_unificado.csv'
)

# Column labels: every column except the last one is an input attribute,
# and the last column is the cost to be predicted.
feature_names = df_unificado.columns[:-1]
target_names = df_unificado.columns[-1]

# Positional split into the attribute matrix (all but the last column) ...
X = df_unificado.iloc[:, :-1].values
# ... and the cost vector (the last column).
y = df_unificado.iloc[:, -1].values

# Short report of what was loaded.
print(f"Dimensões de X: {X.shape}\n")
print(f"Dimensões de y: {y.shape}\n")
print(f"Nomes dos atributos: {feature_names}\n")
print(f"Nomes das classes: {target_names}")

Dimensões de X: (6336328, 8)

Dimensões de y: (6336328,)

Nomes dos atributos: Index(['x0', 'y0', 'theta0', 'beta0', 'xf', 'yf', 'thetaf', 'betaf'], dtype='object')

Nomes das classes: cost


In [3]:
from sklearn.model_selection import train_test_split, RepeatedKFold, cross_validate

# 1) Set aside a 20% hold-out before any cross-validation is run, so the CV
#    below only ever works on the remaining 80%. The fixed seed keeps the
#    shuffled split reproducible.
(X_train_full, X_test,
 y_train_full, y_test) = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor

# Single-step pipeline wrapping the random forest regressor: trees split on
# absolute error, are capped at depth 20, and are built with n_jobs=-1.
# The seed makes the forest reproducible.
pipeline = Pipeline(steps=[
    (
        'regressor',
        RandomForestRegressor(
            criterion='absolute_error',
            max_depth=20,
            n_jobs=-1,
            random_state=42,
        ),
    ),
])


In [5]:
from sklearn.model_selection import RepeatedKFold

# Repeated 5x5 cross-validation (5 folds, repeated 5 times) to balance bias
# and variance of the performance estimate; seeded for reproducible folds.
cv = RepeatedKFold(
    n_repeats=5,
    n_splits=5,
    random_state=42,
)


In [None]:
from sklearn.model_selection import cross_validate

# Scorers evaluated on every CV split; the keys are the labels under which
# each metric is reported in cv_results.
scoring = dict(
    R2='r2',
    MSE='neg_mean_squared_error',
    MAE='neg_mean_absolute_error',
    MAPE='neg_mean_absolute_percentage_error',
    MedAE='neg_median_absolute_error',
    MaxE='max_error',
    EVS='explained_variance',
)

# Cross-validate on the training split only (the hold-out is not touched),
# collecting train scores as well as test scores.
cv_results = cross_validate(
    estimator=pipeline,
    X=X_train_full,
    y=y_train_full,
    scoring=scoring,
    cv=cv,
    return_train_score=True,
    n_jobs=-1,
)

In [None]:
# Train the pipeline on the full training split (the hold-out stays unseen).
pipeline.fit(X_train_full, y_train_full)

# fit() returns the pipeline itself, so `modelo` is simply a second name for
# the same fitted object.
modelo = pipeline

In [None]:
# 5) Generate predictions on the hold-out set with the fitted pipeline
y_pred = pipeline.predict(X_test)

In [None]:

# 6) Persist everything needed to re-evaluate the model later in one dict:
#    the fitted pipeline, the hold-out data, its predictions, the
#    cross-validation results and the attribute names.
# joblib is imported in this cell because it is the first cell to use it;
# without the import a fresh "Restart & Run All" fails here with NameError.
import joblib

full_results = {
    'pipeline': pipeline,
    'X_test':   X_test,
    'y_test':   y_test,
    'y_pred':   y_pred,
    'cv_results': cv_results,
    'feature_names': feature_names
}
joblib.dump(full_results, '/content/drive/MyDrive/Trabalho ML Mestrado 01-2025/randon_forest_regressor_full_results.pkl')

In [None]:
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import (
    r2_score,
    mean_squared_error,
    mean_absolute_error,
    mean_absolute_percentage_error,
    median_absolute_error,
    max_error,
    explained_variance_score
)
from IPython.display import display

# Load the results saved by the training cells.
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code -- only load result files you produced yourself.
res = joblib.load('/content/drive/MyDrive/Trabalho ML Mestrado 01-2025/randon_forest_regressor_full_results.pkl')
model         = res['pipeline']
X_test        = res['X_test']
y_true        = res['y_test']
# Re-predict only when no predictions were stored. dict.get() would evaluate
# its default eagerly and run model.predict on the whole hold-out every time.
y_pred        = res['y_pred'] if 'y_pred' in res else model.predict(X_test)
# Fall back to the columns of X_test (if it has any) when names were not saved.
feature_names = res.get('feature_names', getattr(X_test, 'columns', None))

# 1) Show the hold-out metrics as a table.
# Each entry pairs the row label with the scikit-learn function computing it.
metric_functions = [
    ('R² Score', r2_score),
    ('MSE', mean_squared_error),
    ('MAE', mean_absolute_error),
    ('MAPE', mean_absolute_percentage_error),
    ('Median AE', median_absolute_error),
    ('Max Error', max_error),
    ('Explained Variance', explained_variance_score),
]
metrics = {
    label: score_fn(y_true, y_pred)
    for label, score_fn in metric_functions
}

# One row per metric, a single 'Valor' column, rounded to 4 decimals.
df_metrics = (
    pd.DataFrame
    .from_dict(metrics, orient='index', columns=['Valor'])
    .round(4)
)
display(df_metrics)

# 2) Scatter plot of actual vs. predicted values
fig, ax = plt.subplots()
ax.scatter(y_true, y_pred)

# Common range of both series, used to draw the y = x reference line.
min_val = np.minimum(y_true.min(), y_pred.min())
max_val = np.maximum(y_true.max(), y_pred.max())
ax.plot([min_val, max_val], [min_val, max_val])

ax.set(xlabel='Valor Real', ylabel='Valor Predito', title='Real vs Predito')
plt.show()

# 3) Histogram of the residuals (actual minus predicted), 50 bins
residuals = y_true - y_pred

fig, ax = plt.subplots()
ax.hist(residuals, bins=50)
ax.set_xlabel('Resíduo (Real - Predito)')
ax.set_title('Histograma de Resíduos')
plt.show()

# 4) Box plot of the residuals, grouped by quartile of the actual value
df_r = pd.DataFrame({'Real': y_true, 'Resíduo': residuals})
# Bin the actual values into four equal-frequency groups labelled 1..4.
df_r['Quartil'] = pd.qcut(df_r['Real'], 4, labels=[1, 2, 3, 4])

# Compute the ordered quartile labels once and reuse them for both the
# grouping and the tick labels.
quartiles = sorted(df_r['Quartil'].unique())
groups = [df_r.loc[df_r['Quartil'] == q, 'Resíduo'] for q in quartiles]

plt.figure()
plt.boxplot(groups)
# boxplot() places the boxes at positions 1..N by default. Label them via
# xticks instead of the `labels=` keyword, which Matplotlib 3.9 deprecated
# (renamed to `tick_labels`); this form works on old and new versions alike.
plt.xticks(range(1, len(groups) + 1), quartiles)
plt.xlabel('Quartil de Valor Real')
plt.ylabel('Resíduo')
plt.title('Boxplot de Resíduos por Quartil')
plt.show()

# 5) Feature importances (when the fitted estimator provides them)
# `model` is the saved Pipeline, which does not expose feature_importances_
# itself, so checking the pipeline directly would always skip this plot.
# Read the attribute from the pipeline's final step instead; a bare estimator
# (no `steps`) is used as-is.
final_estimator = model.steps[-1][1] if hasattr(model, 'steps') else model
importances = getattr(final_estimator, 'feature_importances_', None)

if importances is not None and feature_names is not None:
    # Most important attribute first.
    fi = pd.Series(importances, index=feature_names).sort_values(ascending=False)
    plt.figure(figsize=(8, 4))
    fi.plot(kind='bar')
    plt.ylabel('Importância')
    plt.title('Importância das Features')
    plt.tight_layout()
    plt.show()