# 06 - Hyperparameter Tuning (V3)

- Tune top models (e.g., GradientBoosting, XGBoost) with Optuna
- Objective: maximize mean PR-AUC (average precision) across RepeatedStratifiedKFold folds (5 splits × 3 repeats)
- Save best params and CV metrics to `../v3_artifacts/`
- Keep `N_TRIALS` configurable to control runtime


In [None]:
from pathlib import Path
import json
import numpy as np
import optuna
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.metrics import average_precision_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

# --- Locations -------------------------------------------------------------
INP = Path('../v3_data/employee_promotion_features.csv')
ART = Path('../v3_artifacts')
ART.mkdir(exist_ok=True)  # no error if the artifact directory already exists

# --- Experiment settings ---------------------------------------------------
TARGET = 'Promotion_Eligible'
N_SPLITS = 5
N_REPEATS = 3
SEED = 42
N_TRIALS = 20  # adjust for depth of search

# --- Data: split the feature table into predictors and the target column ---
df = pd.read_csv(INP)
X = df.drop(columns=[TARGET])
y = df[TARGET]

# Columns are routed to a transformer by dtype: numeric vs. everything else.
num_cols = X.select_dtypes(include=np.number).columns.tolist()
cat_cols = X.select_dtypes(exclude=np.number).columns.tolist()

# Scale numeric columns; one-hot encode the rest, ignoring categories that
# were not seen during fit instead of raising on them.
pre = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ]
)

# Seeded splitter shared by every tuning objective below.
cv = RepeatedStratifiedKFold(
    n_splits=N_SPLITS,
    n_repeats=N_REPEATS,
    random_state=SEED,
)

def tune_gb():
    """Tune a GradientBoostingClassifier with Optuna and save the best result.

    Every trial samples a set of hyperparameters, fits the ``pre`` + model
    pipeline on each training fold produced by the module-level ``cv``
    splitter, and scores the held-out fold with average precision (PR-AUC).
    The value of a trial is the mean of those fold scores.

    Runs ``N_TRIALS`` trials and writes the best score and parameters to
    ``ART / 'tuning_gb.json'``.

    Returns:
        A ``(best_value, best_params)`` tuple: the highest mean PR-AUC found
        and the sampled hyperparameters that produced it (``random_state`` is
        fixed rather than sampled, so it is not part of ``best_params``).
    """
    def objective(trial: optuna.Trial) -> float:
        """Return the mean cross-validated PR-AUC for one sampled config."""
        params = {
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'n_estimators': trial.suggest_int('n_estimators', 100, 800, step=50),
            'max_depth': trial.suggest_int('max_depth', 2, 8),
            'subsample': trial.suggest_float('subsample', 0.6, 1.0),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
            'random_state': SEED,
        }
        model = GradientBoostingClassifier(**params)
        pipe = Pipeline([('pre', pre), ('model', model)])
        prauc_scores = []
        for tr, va in cv.split(X, y):
            # Refit the whole pipeline per fold so preprocessing only ever
            # sees the training rows of that fold.
            pipe.fit(X.iloc[tr], y.iloc[tr])
            proba = pipe.predict_proba(X.iloc[va])[:, 1]
            prauc_scores.append(average_precision_score(y.iloc[va], proba))
        return float(np.mean(prauc_scores))

    # Seed the sampler as well: without it the sequence of suggested
    # hyperparameters changes on every run even though the CV splitter and
    # the model are already seeded with SEED.
    sampler = optuna.samplers.TPESampler(seed=SEED)
    study = optuna.create_study(direction='maximize', sampler=sampler)
    study.optimize(objective, n_trials=N_TRIALS, show_progress_bar=False)
    with open(ART / 'tuning_gb.json', 'w', encoding='utf-8') as f:
        json.dump({'best_value': study.best_value, 'best_params': study.best_params}, f, indent=2)
    return study.best_value, study.best_params


def tune_xgb():
    """Tune an XGBClassifier with Optuna and save the best result.

    Every trial samples a set of hyperparameters, fits the ``pre`` + model
    pipeline on each training fold produced by the module-level ``cv``
    splitter, and scores the held-out fold with average precision (PR-AUC).
    The value of a trial is the mean of those fold scores.

    Runs ``N_TRIALS`` trials and writes the best score and parameters to
    ``ART / 'tuning_xgb.json'``.

    Returns:
        A ``(best_value, best_params)`` tuple: the highest mean PR-AUC found
        and the sampled hyperparameters that produced it (the fixed settings
        ``random_state``, ``eval_metric`` and ``n_jobs`` are not sampled, so
        they are not part of ``best_params``).
    """
    def objective(trial: optuna.Trial) -> float:
        """Return the mean cross-validated PR-AUC for one sampled config."""
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 200, 800, step=50),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'max_depth': trial.suggest_int('max_depth', 2, 8),
            'subsample': trial.suggest_float('subsample', 0.6, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
            'min_child_weight': trial.suggest_float('min_child_weight', 1.0, 20.0),
            'random_state': SEED,
            'eval_metric': 'logloss',
            'n_jobs': -1,
        }
        model = XGBClassifier(**params)
        pipe = Pipeline([('pre', pre), ('model', model)])
        prauc_scores = []
        for tr, va in cv.split(X, y):
            # Refit the whole pipeline per fold so preprocessing only ever
            # sees the training rows of that fold.
            pipe.fit(X.iloc[tr], y.iloc[tr])
            proba = pipe.predict_proba(X.iloc[va])[:, 1]
            prauc_scores.append(average_precision_score(y.iloc[va], proba))
        return float(np.mean(prauc_scores))

    # Seed the sampler as well: without it the sequence of suggested
    # hyperparameters changes on every run even though the CV splitter and
    # the model are already seeded with SEED.
    sampler = optuna.samplers.TPESampler(seed=SEED)
    study = optuna.create_study(direction='maximize', sampler=sampler)
    study.optimize(objective, n_trials=N_TRIALS, show_progress_bar=False)
    with open(ART / 'tuning_xgb.json', 'w', encoding='utf-8') as f:
        json.dump({'best_value': study.best_value, 'best_params': study.best_params}, f, indent=2)
    return study.best_value, study.best_params

# Run both searches to completion before reporting anything, so the two
# summary lines are printed together at the end of the cell.
best_gb = tune_gb()
best_xgb = tune_xgb()
for _label, _outcome in (('GB best:', best_gb), ('XGB best:', best_xgb)):
    print(_label, _outcome)
