# 05 - Model Selection with Cross-Validation (V3)

- RepeatedStratifiedKFold (5 splits × 3 repeats)
- Candidates: Logistic Regression, RF, GB, XGB, LGBM, CatBoost
- Primary metric: PR-AUC; also track Accuracy, F1 (macro and weighted), ROC-AUC, Brier score
- Save results, sorted by PR-AUC, to `../v3_artifacts/broad_search_results.csv`


In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, f1_score, roc_auc_score, average_precision_score, brier_score_loss,
)
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Input feature table and the output directory for this notebook's artifacts.
INP = Path('../v3_data/employee_promotion_features.csv')
ART = Path('../v3_artifacts')
ART.mkdir(exist_ok=True)

# Load the table and separate the label column from the remaining feature columns.
TARGET = 'Promotion_Eligible'
df = pd.read_csv(INP)
y = df[TARGET]
X = df.drop(columns=[TARGET])

# Route columns by dtype: numeric columns are standardized, all others one-hot encoded.
num_cols = list(X.select_dtypes(include=np.number).columns)
cat_cols = list(X.select_dtypes(exclude=np.number).columns)

# handle_unknown='ignore' keeps transform from raising on categories that were
# absent from the data the encoder was fitted on (e.g. a CV training fold).
pre = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ],
)

# Candidate classifiers for the broad search, keyed by the short name that ends up
# in the 'model' column of the results table. Every estimator that is given a seed
# here uses random_state=0.
# NOTE(review): only 'logreg' and 'rf' set class_weight='balanced'; the boosted
# models run without class weighting — confirm this asymmetry is intended.
candidates = {
    'logreg': LogisticRegression(max_iter=2000, class_weight='balanced'),
    'rf': RandomForestClassifier(
        n_estimators=300, class_weight='balanced', random_state=0, n_jobs=-1,
    ),
    'gb': GradientBoostingClassifier(random_state=0),
    'xgb': XGBClassifier(
        n_estimators=400, learning_rate=0.05, max_depth=5,
        subsample=0.8, colsample_bytree=0.8,
        eval_metric='logloss', random_state=0, n_jobs=-1,
    ),
    'lgbm': LGBMClassifier(
        n_estimators=400, learning_rate=0.05,
        # LightGBM only applies row subsampling (bagging) when subsample_freq > 0;
        # with the default of 0 the subsample=0.8 setting is silently ignored.
        subsample=0.8, subsample_freq=1, colsample_bytree=0.8,
        random_state=0, n_jobs=-1,
    ),
    'catboost': CatBoostClassifier(
        iterations=400, learning_rate=0.05, depth=6, verbose=False, random_state=0,
    ),
}

# Score every candidate with 5-fold stratified CV repeated 3 times (15 fits each).
# The integer random_state makes cv.split produce the same folds on every call,
# so all models are compared on identical splits.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)
rows = []
for name, model in candidates.items():
    pipe = Pipeline(steps=[('pre', pre), ('model', model)])
    scores = {}  # metric name -> list of per-fold values
    for fit_idx, eval_idx in cv.split(X, y):
        # Preprocessing is fitted inside the pipeline on the training fold only.
        pipe.fit(X.iloc[fit_idx], y.iloc[fit_idx])
        y_eval = y.iloc[eval_idx]
        proba = pipe.predict_proba(X.iloc[eval_idx])[:, 1]
        # Hard labels for the threshold-based metrics use a fixed 0.5 cut-off.
        pred = (proba >= 0.5).astype(int)
        fold = {
            'accuracy': accuracy_score(y_eval, pred),
            'f1_macro': f1_score(y_eval, pred, average='macro'),
            'f1_weighted': f1_score(y_eval, pred, average='weighted'),
            'rocauc': roc_auc_score(y_eval, proba),
            'prauc': average_precision_score(y_eval, proba),
            'brier': brier_score_loss(y_eval, proba),
        }
        for metric, value in fold.items():
            scores.setdefault(metric, []).append(value)
    # One summary row per model: the mean of each metric across all folds.
    summary = {'model': name}
    summary.update(
        (metric, float(np.mean(values))) for metric, values in scores.items()
    )
    rows.append(summary)

# Rank by the primary metric (PR-AUC, best first) and persist the table.
res = pd.DataFrame(rows)
res = res.sort_values('prauc', ascending=False).reset_index(drop=True)
res.to_csv(ART / 'broad_search_results.csv', index=False)
res
