# Hyperparameter Tuning for Classification and Regression

Optimize model hyperparameters to improve performance on the 2010-2021 split.

In [None]:
import sys
sys.path.append('../')

import pandas as pd
import numpy as np
from pathlib import Path
import pickle
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score, f1_score, r2_score
from scipy.stats import spearmanr
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

import warnings
warnings.filterwarnings('ignore')

## Load Data

In [None]:
# Read the pickled temporal-split artifacts, one (train, test) pair at a time:
# feature matrices first, then classification targets, then regression targets.
X_train, X_test = (
    pd.read_pickle('../data/features/X_train_temporal.pkl'),
    pd.read_pickle('../data/features/X_test_temporal.pkl'),
)
y_train_cls, y_test_cls = (
    pd.read_pickle('../data/features/y_train_cls_temporal.pkl'),
    pd.read_pickle('../data/features/y_test_cls_temporal.pkl'),
)
y_train_reg, y_test_reg = (
    pd.read_pickle('../data/features/y_train_reg_temporal.pkl'),
    pd.read_pickle('../data/features/y_test_reg_temporal.pkl'),
)

# Quick sanity check of the feature-matrix dimensions
print(
    f"Train: {X_train.shape}",
    f"Test: {X_test.shape}",
    sep="\n",
)

## 1. Classification Hyperparameter Tuning

### LightGBM Classification

In [None]:
print("Tuning LightGBM Classifier...")

# Hyperparameter search space; RandomizedSearchCV samples combinations from
# these discrete candidate lists.
lgb_params = {
    'num_leaves': [20, 31, 40, 50],
    'max_depth': [5, 7, 10, 15, -1],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [100, 200, 300, 500],
    'min_child_samples': [10, 20, 30, 50],
    'subsample': [0.7, 0.8, 0.9, 1.0],  # row bagging; needs subsample_freq > 0 (set below)
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
    'reg_alpha': [0, 0.1, 0.5, 1.0],
    'reg_lambda': [0, 0.1, 0.5, 1.0]
}

# LightGBM only applies `subsample` (bagging_fraction) when `subsample_freq`
# (bagging_freq) is positive. The sklearn wrapper defaults it to 0, which would
# silently turn the 'subsample' dimension of the search into a no-op, so
# bagging is enabled on every boosting iteration here.
lgb_clf = lgb.LGBMClassifier(random_state=42, verbose=-1, subsample_freq=1)

# Randomized search (faster than grid search)
# NOTE(review): an integer `cv` builds folds without regard to time order. If
# the training rows are chronologically ordered, a time-aware splitter may give
# a more honest CV estimate for this temporal split - confirm.
lgb_search = RandomizedSearchCV(
    lgb_clf,
    lgb_params,
    n_iter=50,  # Try 50 random combinations
    scoring='roc_auc',
    cv=5,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

lgb_search.fit(X_train, y_train_cls)

# Report the best mean cross-validated score and the parameters that produced it
print(f"\nBest CV ROC-AUC: {lgb_search.best_score_:.4f}")
print("\nBest parameters:")
for param, value in lgb_search.best_params_.items():
    print(f"  {param}: {value}")

In [None]:
# Evaluate the refit best LightGBM classifier on the held-out test split
best_lgb_clf = lgb_search.best_estimator_
y_pred_lgb = best_lgb_clf.predict(X_test)
# Column 1 of predict_proba is used as the score for ROC-AUC
y_pred_proba_lgb = best_lgb_clf.predict_proba(X_test)[:, 1]

lgb_roc_auc = roc_auc_score(y_true=y_test_cls, y_score=y_pred_proba_lgb)
lgb_f1 = f1_score(y_true=y_test_cls, y_pred=y_pred_lgb)

print(
    f"\nLightGBM Test Performance:",
    f"  ROC-AUC: {lgb_roc_auc:.4f}",
    f"  F1-Score: {lgb_f1:.4f}",
    sep="\n",
)

### XGBoost Classification

In [None]:
print("Tuning XGBoost Classifier...")

# Discrete candidate values sampled by the randomized search
xgb_params = dict(
    max_depth=[3, 5, 7, 10],
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    n_estimators=[100, 200, 300, 500],
    min_child_weight=[1, 3, 5, 7],
    subsample=[0.7, 0.8, 0.9, 1.0],
    colsample_bytree=[0.7, 0.8, 0.9, 1.0],
    gamma=[0, 0.1, 0.2, 0.5],
    reg_alpha=[0, 0.1, 0.5, 1.0],
    reg_lambda=[0, 0.1, 0.5, 1.0],
)

xgb_clf = xgb.XGBClassifier(random_state=42, eval_metric='logloss')

# 50 sampled configurations x 5 CV folds, ranked by ROC-AUC
xgb_search = RandomizedSearchCV(
    estimator=xgb_clf,
    param_distributions=xgb_params,
    n_iter=50,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)

xgb_search.fit(X_train, y_train_cls)

# Best mean CV score followed by the winning configuration
print(f"\nBest CV ROC-AUC: {xgb_search.best_score_:.4f}")
print(f"\nBest parameters:")
for name, setting in xgb_search.best_params_.items():
    print(f"  {name}: {setting}")

In [None]:
# Test performance
y_pred_xgb = xgb_search.best_estimator_.predict(X_test)
y_pred_proba_xgb = xgb_search.best_estimator_.predict_proba(X_test)[:, 1]

xgb_roc_auc = roc_auc_score(y_test_cls, y_pred_proba_xgb)
xgb_f1 = f1_score(y_test_cls, y_pred_xgb)

print(f"\nXGBoost Test Performance:")
print(f"  ROC-AUC: {xgb_roc_auc:.4f}")
print(f"  F1-Score: {xgb_f1:.4f}")

## 2. Regression Hyperparameter Tuning

### LightGBM Regression

In [None]:
print("Tuning LightGBM Regressor...")

# Hyperparameter search space; RandomizedSearchCV samples combinations from
# these discrete candidate lists.
lgb_reg_params = {
    'num_leaves': [20, 31, 40, 50],
    'max_depth': [5, 7, 10, 15, -1],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [100, 200, 300, 500],
    'min_child_samples': [10, 20, 30, 50],
    'subsample': [0.7, 0.8, 0.9, 1.0],  # row bagging; needs subsample_freq > 0 (set below)
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
    'reg_alpha': [0, 0.1, 0.5, 1.0],
    'reg_lambda': [0, 0.1, 0.5, 1.0]
}

# LightGBM only applies `subsample` (bagging_fraction) when `subsample_freq`
# (bagging_freq) is positive. The sklearn wrapper defaults it to 0, which would
# silently turn the 'subsample' dimension of the search into a no-op, so
# bagging is enabled on every boosting iteration here.
lgb_reg = lgb.LGBMRegressor(random_state=42, verbose=-1, subsample_freq=1)

# NOTE(review): an integer `cv` builds folds without regard to time order. If
# the training rows are chronologically ordered, a time-aware splitter may give
# a more honest CV estimate for this temporal split - confirm.
lgb_reg_search = RandomizedSearchCV(
    lgb_reg,
    lgb_reg_params,
    n_iter=50,  # Try 50 random combinations
    scoring='r2',
    cv=5,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

lgb_reg_search.fit(X_train, y_train_reg)

# Report the best mean cross-validated score and the parameters that produced it
print(f"\nBest CV R²: {lgb_reg_search.best_score_:.4f}")
print("\nBest parameters:")
for param, value in lgb_reg_search.best_params_.items():
    print(f"  {param}: {value}")

In [None]:
# Test performance
y_pred_lgb_reg = lgb_reg_search.best_estimator_.predict(X_test)

lgb_r2 = r2_score(y_test_reg, y_pred_lgb_reg)
lgb_spearman = spearmanr(y_test_reg, y_pred_lgb_reg)[0]

print(f"\nLightGBM Regression Test Performance:")
print(f"  R²: {lgb_r2:.4f}")
print(f"  Spearman: {lgb_spearman:.4f}")

### XGBoost Regression

In [None]:
print("Tuning XGBoost Regressor...")

# Discrete candidate values sampled by the randomized search
xgb_reg_params = dict(
    max_depth=[3, 5, 7, 10],
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    n_estimators=[100, 200, 300, 500],
    min_child_weight=[1, 3, 5, 7],
    subsample=[0.7, 0.8, 0.9, 1.0],
    colsample_bytree=[0.7, 0.8, 0.9, 1.0],
    gamma=[0, 0.1, 0.2, 0.5],
    reg_alpha=[0, 0.1, 0.5, 1.0],
    reg_lambda=[0, 0.1, 0.5, 1.0],
)

xgb_reg = xgb.XGBRegressor(random_state=42)

# 50 sampled configurations x 5 CV folds, ranked by R²
xgb_reg_search = RandomizedSearchCV(
    estimator=xgb_reg,
    param_distributions=xgb_reg_params,
    n_iter=50,
    scoring='r2',
    cv=5,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)

xgb_reg_search.fit(X_train, y_train_reg)

# Best mean CV score followed by the winning configuration
print(f"\nBest CV R²: {xgb_reg_search.best_score_:.4f}")
print(f"\nBest parameters:")
for name, setting in xgb_reg_search.best_params_.items():
    print(f"  {name}: {setting}")

In [None]:
# Test performance
y_pred_xgb_reg = xgb_reg_search.best_estimator_.predict(X_test)

xgb_r2 = r2_score(y_test_reg, y_pred_xgb_reg)
xgb_spearman = spearmanr(y_test_reg, y_pred_xgb_reg)[0]

print(f"\nXGBoost Regression Test Performance:")
print(f"  R²: {xgb_r2:.4f}")
print(f"  Spearman: {xgb_spearman:.4f}")

## 3. Save Best Models

In [None]:
# Output directory for the pickled estimators (created if it does not exist)
models_dir = Path('../models')
models_dir.mkdir(exist_ok=True)

# (file name, fitted search) pairs: classification models first, then regression
tuned_searches = (
    ('lgbm_classifier_tuned.pkl', lgb_search),
    ('xgb_classifier_tuned.pkl', xgb_search),
    ('lgbm_regressor_tuned.pkl', lgb_reg_search),
    ('xgb_regressor_tuned.pkl', xgb_reg_search),
)

# Persist only the refit best estimator from each search, not the search object
for filename, search in tuned_searches:
    with (models_dir / filename).open('wb') as model_file:
        pickle.dump(search.best_estimator_, model_file)

print("✓ All tuned models saved!")

## 4. Summary

In [None]:
# Baseline (pre-tuning) test metrics, stored as fractions so the displayed
# percentages and the improvement arithmetic share a single source of truth.
# NOTE(review): these values are hard-coded in this notebook - confirm they
# still match the baseline run being compared against.
BASELINE_ROC_AUC = 0.7874
BASELINE_F1 = 0.5655
BASELINE_R2 = 0.2425
BASELINE_SPEARMAN = 0.5670

# Test-set metrics per tuned model: (primary metric, secondary metric)
cls_results = {
    'LightGBM': (lgb_roc_auc, lgb_f1),
    'XGBoost': (xgb_roc_auc, xgb_f1),
}
reg_results = {
    'LightGBM': (lgb_r2, lgb_spearman),
    'XGBoost': (xgb_r2, xgb_spearman),
}

# Pick ONE winning model per task by its primary metric (the metric used for
# tuning) and report that model's own metric pair. Taking max() of each metric
# independently could combine numbers from two different models into a
# "best tuned" result that no single model achieves.
# NOTE: the winner is chosen using test-set scores, so its reported numbers
# are optimistically biased.
best_cls_name = max(cls_results, key=lambda name: cls_results[name][0])
best_roc_auc, best_f1 = cls_results[best_cls_name]
best_reg_name = max(reg_results, key=lambda name: reg_results[name][0])
best_r2, best_spearman = reg_results[best_reg_name]

print("="*60)
print("HYPERPARAMETER TUNING RESULTS")
print("="*60)

print("\nCLASSIFICATION (Test Set):")
print(f"  LightGBM - ROC-AUC: {lgb_roc_auc:.4f}, F1: {lgb_f1:.4f}")
print(f"  XGBoost  - ROC-AUC: {xgb_roc_auc:.4f}, F1: {xgb_f1:.4f}")

print("\nREGRESSION (Test Set):")
print(f"  LightGBM - R²: {lgb_r2:.4f}, Spearman: {lgb_spearman:.4f}")
print(f"  XGBoost  - R²: {xgb_r2:.4f}, Spearman: {xgb_spearman:.4f}")

# Compare to baseline (before tuning)
print("\n" + "="*60)
print("COMPARISON TO BASELINE (Before Tuning)")
print("="*60)
print("\nClassification:")
print(f"  Baseline: ROC-AUC={BASELINE_ROC_AUC*100:.2f}%, F1={BASELINE_F1*100:.2f}%")
print(f"  Best tuned ({best_cls_name}): ROC-AUC={best_roc_auc*100:.2f}%, F1={best_f1*100:.2f}%")
print(f"  Improvement: {(best_roc_auc - BASELINE_ROC_AUC)*100:.2f} points ROC-AUC, {(best_f1 - BASELINE_F1)*100:.2f} points F1")

print("\nRegression:")
print(f"  Baseline: R²={BASELINE_R2*100:.2f}%, Spearman={BASELINE_SPEARMAN*100:.2f}%")
print(f"  Best tuned ({best_reg_name}): R²={best_r2*100:.2f}%, Spearman={best_spearman*100:.2f}%")
print(f"  Improvement: {(best_r2 - BASELINE_R2)*100:.2f} points R², {(best_spearman - BASELINE_SPEARMAN)*100:.2f} points Spearman")