# 🎯 Model Tuning & Optimization

**Otimização de Hiperparâmetros e Seleção de Modelos**

Competências do Bootcamp Microsoft Data Scientist Azure demonstradas:
- 🔍 **Hyperparameter tuning** com GridSearch e RandomSearch  
- 🤖 Comparações com **AutoML**
- ⚡ **Optuna** para otimização avançada
- 📊 Estratégias de **cross-validation**
- 🏆 **Model selection** baseado em métricas de negócio

---

**Foco**: Encontrar o melhor modelo para produção no Azure ML


In [None]:
# Full set of imports for the hyperparameter-tuning notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import (
    GridSearchCV, RandomizedSearchCV, cross_val_score,
    StratifiedKFold, train_test_split
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
import xgboost as xgb
import optuna
from optuna.integration import XGBoostPruningCallback
import time
import warnings
# Blanket filter: every warning category is silenced from here on, which
# also hides deprecation and convergence warnings raised by the libraries.
warnings.filterwarnings('ignore')

# Banner messages confirming that the cell ran to completion
print("✅ Bibliotecas carregadas!")
print("🎯 Foco: Hyperparameter Tuning para Bootcamp Microsoft Azure")


## 1. 📊 Carregamento e Preparação dos Dados

Vamos usar os dados com as features criadas no notebook anterior (feature engineering) e prepará-los para a otimização de hiperparâmetros.


In [None]:
# Load the credit-risk table, derive the engineered features, encode the
# categorical columns and build the train / validation / test splits.
df = pd.read_csv('../data/credit_risk.csv')

print("🎯 PREPARAÇÃO PARA HYPERPARAMETER TUNING")
print("=" * 50)
n_rows, n_cols = df.shape
print(f"📊 Dataset: {n_rows:,} amostras, {n_cols} features")
print(f"🎯 Target balance: {df['default'].value_counts().to_dict()}")

# Imported here because only this cell needs it
from sklearn.preprocessing import LabelEncoder

# --- Engineered features -------------------------------------------------
# Columns reused by several derived features
income = df['annual_income']
score = df['credit_score']
dti = df['debt_to_income']

df['loan_to_income_ratio'] = df['loan_amount'] / income
# Min-max scaling with the fixed bounds 300 and 850
df['credit_score_normalized'] = (score - 300) / (850 - 300)

# Flag: credit score below 600 together with debt-to-income above 0.5
risky_mask = (score < 600) & (dti > 0.5)
df['high_risk_profile'] = risky_mask.astype(int)

# Flag: all four favourable conditions hold at once
ideal_mask = (
    (score > 750)
    & (dti < 0.3)
    & (df['employment_length'] > 5)
    & (df['has_bankruptcy'] == 0)
)
df['perfect_customer'] = ideal_mask.astype(int)

df['age_income_interaction'] = df['age'] * np.log1p(income)

# --- Categorical encoding ------------------------------------------------
categorical_cols = ['education', 'marital_status', 'home_ownership', 'loan_purpose', 'region']
le_dict = {}

# One fitted encoder per column is kept in le_dict, keyed by column name
for col in categorical_cols:
    le_dict[col] = LabelEncoder()
    df[f'{col}_encoded'] = le_dict[col].fit_transform(df[col])

# --- Feature matrix and target ------------------------------------------
# Drop the target and the raw categorical columns; their *_encoded
# counterparts remain. Column order follows df.columns.
excluded_cols = {'default', *categorical_cols}
feature_cols = [name for name in df.columns if name not in excluded_cols]

X = df[feature_cols]
y = df['default']

# --- Splits --------------------------------------------------------------
# First hold out 20% as test, then take 25% of the remainder as validation;
# both splits are stratified on the target.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp,
    test_size=0.25,
    random_state=42,
    stratify=y_temp,
)

print("\n📊 SPLITS:")
print(f"   • Train: {X_train.shape[0]:,} amostras ({y_train.mean():.2%} default)")
print(f"   • Validation: {X_val.shape[0]:,} amostras ({y_val.mean():.2%} default)")
print(f"   • Test: {X_test.shape[0]:,} amostras ({y_test.mean():.2%} default)")
print(f"   • Features: {X_train.shape[1]}")

print("\n✅ Dados preparados para hyperparameter tuning!")


## 2. 🔍 Grid Search - Busca Exaustiva de Hiperparâmetros

Implementação de GridSearchCV para encontrar a melhor combinação de hiperparâmetros para Random Forest.


In [None]:
# Exhaustive grid search over Random Forest hyperparameters, scored by
# cross-validated ROC AUC, followed by a check on the validation split.
print("🔍 GRID SEARCH - RANDOM FOREST")
print("="*50)

# Hyperparameter search space for the Random Forest
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', 0.3]
}

# Number of candidates = product of the number of options per parameter
print(f"📊 Espaço de busca: {np.prod([len(v) for v in rf_param_grid.values()]):,} combinações")

# Cross-validation strategy: 5 shuffled, stratified folds with a fixed seed
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Grid search configuration
rf_grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=rf_param_grid,
    cv=cv_strategy,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1
)

print("🚀 Iniciando Grid Search...")
start_time = time.time()

# Fit on a subset only, for demonstration (the full training data would be
# used in production). The subset is drawn from a seeded generator so the
# same rows -- and therefore the same best parameters and scores -- are
# obtained on every run, consistent with the random_state=42 used elsewhere.
sample_size = min(2000, len(X_train))
sample_rng = np.random.default_rng(42)
sample_idx = sample_rng.choice(len(X_train), size=sample_size, replace=False)

rf_grid_search.fit(X_train.iloc[sample_idx], y_train.iloc[sample_idx])

elapsed_time = time.time() - start_time

print(f"⏱️ Grid Search concluído em {elapsed_time:.1f} segundos")
print(f"🏆 Melhor AUC: {rf_grid_search.best_score_:.4f}")
print(f"🎯 Melhores parâmetros:")

for param, value in rf_grid_search.best_params_.items():
    print(f"   • {param}: {value}")

# Evaluate on the validation set.
# NOTE: best_estimator_ was refit on the subsample passed to fit() above,
# not on the whole training split, so these numbers describe that model.
best_rf = rf_grid_search.best_estimator_
y_val_pred_rf = best_rf.predict(X_val)
y_val_pred_proba_rf = best_rf.predict_proba(X_val)[:, 1]  # column 1 of the predicted probabilities

val_auc_rf = roc_auc_score(y_val, y_val_pred_proba_rf)
print(f"\n📊 Performance no Validation Set:")
print(f"   • AUC: {val_auc_rf:.4f}")
print(f"   • Accuracy: {(y_val_pred_rf == y_val).mean():.4f}")

# Summarise the grid-search results: keep the 10 best candidates by mean
# CV score and print the first 5 of them.
results_df = pd.DataFrame(rf_grid_search.cv_results_)
top_results = results_df.nlargest(10, 'mean_test_score')[['mean_test_score', 'std_test_score', 'params']]

print(f"\n🏆 TOP 5 COMBINAÇÕES:")
print("-"*60)
for i, (_, row) in enumerate(top_results.head().iterrows(), 1):
    print(f"{i}. AUC: {row['mean_test_score']:.4f} ± {row['std_test_score']:.4f}")
    print(f"   Params: {row['params']}")
    print()

print("✅ Grid Search Random Forest concluído!")
