
# Pipeline de Otimização de Hiperparâmetros com Diagnóstico de Overtuning

Este notebook implementa um pipeline completo para otimização de hiperparâmetros (HPO) com validação cruzada e diagnóstico de overtuning/overfitting em modelos de regressão. Utiliza `BayesSearchCV` da biblioteca `scikit-optimize` e múltiplos algoritmos de regressão.

**Objetivos:**
- Monitorar rigorosamente o processo de HPO
- Detectar overtuning/overfitting
- Comparar desempenho de múltiplos algoritmos


In [1]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical

import importlib
import json
import warnings
warnings.filterwarnings("ignore")

# Random seed passed as ``random_state`` to both the train/test split and the
# Bayesian search below.
SEED = 42
# Number of cross-validation folds handed to the search (``cv``).
CV_FOLDS = 5
# Fraction of the data held out as the test set (``test_size``).
SPLIT = 0.2
# Scoring name given to the search; also the key read from the metric
# dictionaries when the results are plotted.
METRIC_TO_OPT = "r2"


In [2]:

# Load the per-algorithm HPO configuration. The path is relative to the current
# working directory. Each entry is consumed later through the keys
# "model_class" (dotted import path), "default_params" (constructor kwargs)
# and "search_space" (hyperparameter name -> search dimension).
# The encoding is explicit so parsing does not depend on the platform default.
with open("ALGO_CONFIGS2.json", "r", encoding="utf-8") as f:
    ALGO_CONFIGS = json.load(f)


FileNotFoundError: [Errno 2] No such file or directory: 'ALGO_CONFIGS2.json'

In [None]:

def compute_metrics(y_true, y_pred):
    """Compute regression metrics for a set of predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values, aligned with ``y_true``.

    Returns:
        dict with three entries:
            "r2":   coefficient of determination.
            "rmse": root mean squared error.
            "rme":  mean absolute error.
    """
    # The ``squared=False`` keyword was deprecated in scikit-learn 1.4 and
    # removed in later releases. Taking the square root of the per-output MSE
    # and averaging reproduces its result (including the multi-output case)
    # without depending on that keyword.
    per_output_mse = mean_squared_error(y_true, y_pred, multioutput="raw_values")
    return {
        "r2": r2_score(y_true, y_pred),
        "rmse": float(np.mean(np.sqrt(per_output_mse))),
        # NOTE(review): the key "rme" holds the mean absolute error; it is kept
        # unchanged because stored histories may already rely on it.
        "rme": mean_absolute_error(y_true, y_pred),
    }


In [None]:

class OvertuningMonitor:
    """Callable that logs dev/test metrics for the current best configuration.

    Every call refits the estimator carried by the payload on the development
    data using the payload's best parameters, scores it on both the development
    and the test data with ``compute_metrics``, and appends the outcome to
    ``history``.

    The payload must expose ``best_params_`` (dict of parameters) and
    ``estimator`` (object with ``set_params``/``fit``/``predict``).
    NOTE(review): scikit-optimize hands its callbacks a scipy ``OptimizeResult``
    (fields such as ``x`` and ``fun``), which does not carry these attributes;
    verify that the caller adapts the payload accordingly.
    """

    def __init__(self, X_dev, y_dev, X_test, y_test, metric):
        """Store the development/test data and start with an empty history."""
        self.X_dev, self.y_dev = X_dev, y_dev
        self.X_test, self.y_test = X_test, y_test
        self.metric = metric  # stored only; not read by __call__
        self.history = []

    def __call__(self, optim_result):
        """Refit with the best parameters and record dev and test metrics.

        Returns None; one dict with keys "params", "dev" and "test" is
        appended to ``self.history`` per call.
        """
        params = optim_result.best_params_
        # The estimator on the payload is reconfigured and refit directly
        # (presumably ``set_params`` returns that same object, per the
        # scikit-learn convention).
        fitted = optim_result.estimator.set_params(**params)
        fitted.fit(self.X_dev, self.y_dev)

        entry = {"params": params}
        splits = (
            ("dev", self.X_dev, self.y_dev),
            ("test", self.X_test, self.y_test),
        )
        for split_name, features, target in splits:
            entry[split_name] = compute_metrics(target, fitted.predict(features))
        self.history.append(entry)


In [None]:

from sklearn.datasets import fetch_california_housing

# Load the California housing dataset through scikit-learn and wrap the
# feature matrix in a DataFrame so the columns keep their names.
data = fetch_california_housing()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target

# Hold out a SPLIT fraction as the test set; the remainder is the development
# set used for the hyperparameter search. SEED makes the split reproducible.
X_dev, X_test, y_dev, y_test = train_test_split(X, y, test_size=SPLIT, random_state=SEED)


In [None]:

def _as_monitor_callback(monitor, estimator, search_space):
    """Wrap ``monitor`` so it can be used as a ``BayesSearchCV`` callback.

    scikit-optimize calls its callbacks with a scipy ``OptimizeResult`` whose
    ``x`` field is the best point found so far, given as a plain list of
    values. The monitor instead reads ``best_params_`` and ``estimator`` from
    its argument, so this adapter builds an object with exactly those two
    attributes.

    Args:
        monitor: Callable that accepts an object exposing ``best_params_``
            and ``estimator``.
        estimator: Unfitted estimator; a fresh clone is handed to the monitor
            on every call so the instance owned by the search is never
            mutated.
        search_space: The dict passed to ``BayesSearchCV(search_spaces=...)``;
            it is needed to map the positional point back to parameter names.

    Returns:
        A callable suitable for ``BayesSearchCV.fit(..., callback=...)``.
    """
    # Imported here so the cell stays self-contained; these packages are
    # already dependencies of the notebook.
    from types import SimpleNamespace

    from sklearn.base import clone
    from skopt.utils import point_asdict

    def _callback(optim_result):
        # Convert numpy scalars to native Python values before using them as
        # estimator parameters.
        best_point = [np.array(value).item() for value in optim_result.x]
        best_params = dict(point_asdict(search_space, best_point))
        payload = SimpleNamespace(
            best_params_=best_params,
            estimator=clone(estimator),
        )
        # Propagate the monitor's return value: a truthy result asks the
        # search to stop early, None lets it continue.
        return monitor(payload)

    return _callback


# Maps algorithm name -> list of monitor records, one per search step.
results = {}

for algo_name, config in ALGO_CONFIGS.items():
    print(f"Executando HPO para: {algo_name}")

    # Resolve the dotted path (e.g. "package.module.ClassName") to a class and
    # instantiate it with the configured default parameters.
    module_name, class_name = config["model_class"].rsplit(".", 1)
    model_cls = getattr(importlib.import_module(module_name), class_name)
    model = model_cls(**config["default_params"])

    pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("model", model),
    ])

    # Prefix the hyperparameter names so they address the "model" pipeline step.
    search_space = {f"model__{k}": v for k, v in config["search_space"].items()}

    monitor = OvertuningMonitor(X_dev, y_dev, X_test, y_test, METRIC_TO_OPT)

    opt = BayesSearchCV(
        estimator=pipe,
        search_spaces=search_space,
        n_iter=20,
        cv=CV_FOLDS,
        scoring=METRIC_TO_OPT,
        random_state=SEED,
        n_jobs=-1,
        return_train_score=True,
    )

    # The monitor cannot be passed directly: the object skopt gives to
    # callbacks has no ``best_params_``/``estimator`` attributes.
    opt.fit(
        X_dev,
        y_dev,
        callback=_as_monitor_callback(monitor, pipe, search_space),
    )
    results[algo_name] = monitor.history


In [None]:

# For every algorithm, collect:
#   * the development-set metric of each recorded monitor entry (box plot), and
#   * the test-set metric of the last recorded entry (red marker).
# NOTE(review): the "dev" values are computed by the monitor on the same data
# it refits on, so they are not per-fold cross-validation scores even though
# the plot title refers to cross-validation.
test_scores = {
    algo: history[-1]["test"][METRIC_TO_OPT]
    for algo, history in results.items()
}
box_data = [
    pd.DataFrame({
        "score": [step["dev"][METRIC_TO_OPT] for step in history],
        "algo": algo,
    })
    for algo, history in results.items()
]
box_df = pd.concat(box_data)

plt.figure(figsize=(14, 6))
sns.boxplot(data=box_df, x="algo", y="score")

# Overlay the final test score of each algorithm on top of its box.
algo_labels = list(test_scores.keys())
final_values = list(test_scores.values())
sns.scatterplot(
    x=algo_labels,
    y=final_values,
    color="red",
    label="Teste Final",
    s=100,
)

plt.title("Comparação de Métricas de Validação Cruzada vs Teste Final")
plt.ylabel(METRIC_TO_OPT.upper())
plt.xticks(rotation=45)
plt.legend()
plt.tight_layout()
plt.show()
