## Hyperparameter Tuning with Optuna

**Goal:** Optimize LightGBM and XGBoost hyperparameters using Optuna for improved performance.

**Base Features:** Phase 5 final features (95 features)



### Setup & Imports


In [2]:
import pandas as pd
import numpy as np
import json
import os
import warnings
from typing import Dict

import optuna
from optuna.visualization import (
    plot_optimization_history,
    plot_param_importances,
    plot_parallel_coordinate
)

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import roc_auc_score, classification_report

import mlflow
import mlflow.sklearn
from mlflow.models import infer_signature

import joblib

np.random.seed(42)
warnings.filterwarnings('ignore')

print("Optuna version:", optuna.__version__)
print("LightGBM imported successfully")
print("XGBoost imported successfully")


Optuna version: 4.5.0
LightGBM imported successfully
XGBoost imported successfully


### Load Phase 5 Final Features

Load the best feature set from Phase 5 (95 features after all feature engineering phases).


In [3]:
X_train = pd.read_csv('../../data/processed/phase5_installments/X_train.csv')
y_train = pd.read_csv('../../data/processed/phase5_installments/y_train.csv')
X_val = pd.read_csv('../../data/processed/phase5_installments/X_val.csv')
y_val = pd.read_csv('../../data/processed/phase5_installments/y_val.csv')
X_test = pd.read_csv('../../data/processed/phase5_installments/X_test.csv')

with open('../../data/processed/phase5_installments/feature_metadata.json', 'r') as f:
    phase5_metadata = json.load(f)

X_train = X_train.drop('SK_ID_CURR', axis=1)
X_val = X_val.drop('SK_ID_CURR', axis=1)
test_ids = X_test['SK_ID_CURR'].copy()
X_test = X_test.drop('SK_ID_CURR', axis=1)

y_train = y_train['TARGET'].values
y_val = y_val['TARGET'].values

print(f"Train shape: {X_train.shape}")
print(f"Val shape: {X_val.shape}")
print(f"Test shape: {X_test.shape}")
print(f"\nPhase 5 Baseline Performance:")
print(f"  LightGBM CV AUC: {phase5_metadata['cv_eval']['lgb_cv_mean']:.4f}")
print(f"  XGBoost CV AUC: {phase5_metadata['cv_eval']['xgb_cv_mean']:.4f}")


Train shape: (246008, 95)
Val shape: (61503, 95)
Test shape: (48744, 95)

Phase 5 Baseline Performance:
  LightGBM CV AUC: 0.7755
  XGBoost CV AUC: 0.7674


### LightGBM Hyperparameter Tuning

In [4]:
def lgb_objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        'class_weight': 'balanced',
        'random_state': 42,
        'verbose': -1
    }
    
    model = LGBMClassifier(**params)
    
    X_full = pd.concat([X_train, X_val], axis=0).reset_index(drop=True)
    y_full = np.concatenate([y_train, y_val])
    
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X_full, y_full, cv=cv, scoring='roc_auc', n_jobs=-1)
    
    return scores.mean()

print("LightGBM objective function defined")


LightGBM objective function defined


In [5]:
%%time

study_lgb = optuna.create_study(
    direction='maximize',
    study_name='lightgbm_tuning',
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=optuna.pruners.MedianPruner()
)

study_lgb.optimize(lgb_objective, n_trials=150, show_progress_bar=True, n_jobs=1)

print(f"\nBest trial:")
print(f"  Value (CV AUC): {study_lgb.best_trial.value:.4f}")
print(f"  Params: ")
for key, value in study_lgb.best_trial.params.items():
    print(f"    {key}: {value}")


[I 2025-11-13 15:01:54,075] A new study created in memory with name: lightgbm_tuning


  0%|          | 0/150 [00:00<?, ?it/s]

[I 2025-11-13 15:02:36,086] Trial 0 finished with value: 0.7235749168651157 and parameters: {'n_estimators': 437, 'learning_rate': 0.2536999076681772, 'max_depth': 10, 'num_leaves': 68, 'min_child_samples': 19, 'subsample': 0.5779972601681014, 'colsample_bytree': 0.5290418060840998, 'reg_alpha': 0.6245760287469893, 'reg_lambda': 0.002570603566117598}. Best is trial 0 with value: 0.7235749168651157.
[I 2025-11-13 15:04:26,405] Trial 1 finished with value: 0.7775540100480551 and parameters: {'n_estimators': 737, 'learning_rate': 0.010725209743171996, 'max_depth': 12, 'num_leaves': 87, 'min_child_samples': 25, 'subsample': 0.5909124836035503, 'colsample_bytree': 0.5917022549267169, 'reg_alpha': 5.472429642032198e-06, 'reg_lambda': 0.00052821153945323}. Best is trial 1 with value: 0.7775540100480551.
[I 2025-11-13 15:05:27,934] Trial 2 finished with value: 0.7767684702749509 and parameters: {'n_estimators': 489, 'learning_rate': 0.02692655251486473, 'max_depth': 9, 'num_leaves': 31, 'min_c

### Train Final LightGBM with Best Params


In [7]:
%%time

best_params_lgb = study_lgb.best_trial.params
best_params_lgb['class_weight'] = 'balanced'
best_params_lgb['random_state'] = 42
best_params_lgb['verbose'] = -1

lgb_tuned = LGBMClassifier(**best_params_lgb)
lgb_tuned.fit(X_train, y_train)

train_auc_lgb = roc_auc_score(y_train, lgb_tuned.predict_proba(X_train)[:, 1])
val_auc_lgb = roc_auc_score(y_val, lgb_tuned.predict_proba(X_val)[:, 1])

print("LightGBM Tuned Results:")
print(f"  Train AUC: {train_auc_lgb:.4f}")
print(f"  Val AUC: {val_auc_lgb:.4f}")
print(f"  Baseline CV AUC: {phase5_metadata['cv_eval']['lgb_cv_mean']:.4f}")
print(f"  Tuned CV AUC: {study_lgb.best_trial.value:.4f}")
print(f"  Improvement: {study_lgb.best_trial.value - phase5_metadata['cv_eval']['lgb_cv_mean']:.4f}")


LightGBM Tuned Results:
  Train AUC: 0.8223
  Val AUC: 0.7780
  Baseline CV AUC: 0.7755
  Tuned CV AUC: 0.7783
  Improvement: 0.0028
CPU times: total: 1min 33s
Wall time: 12.2 s


### XGBoost Hyperparameter Tuning


In [8]:
def xgb_objective(trial):
    scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
    
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        'scale_pos_weight': scale_pos_weight,
        'random_state': 42,
        'verbosity': 0,
        'eval_metric': 'auc'
    }
    
    model = XGBClassifier(**params)
    
    X_full = pd.concat([X_train, X_val], axis=0).reset_index(drop=True)
    y_full = np.concatenate([y_train, y_val])
    
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X_full, y_full, cv=cv, scoring='roc_auc', n_jobs=-1)
    
    return scores.mean()

print("XGBoost objective function defined")


XGBoost objective function defined


In [9]:
%%time

study_xgb = optuna.create_study(
    direction='maximize',
    study_name='xgboost_tuning',
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=optuna.pruners.MedianPruner()
)

study_xgb.optimize(xgb_objective, n_trials=50, show_progress_bar=True, n_jobs=1)

print(f"\nBest trial:")
print(f"  Value (CV AUC): {study_xgb.best_trial.value:.4f}")
print(f"  Params: ")
for key, value in study_xgb.best_trial.params.items():
    print(f"    {key}: {value}")


[I 2025-11-13 17:47:34,018] A new study created in memory with name: xgboost_tuning


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-11-13 17:49:24,974] Trial 0 finished with value: 0.7062873979233851 and parameters: {'n_estimators': 437, 'learning_rate': 0.2536999076681772, 'max_depth': 10, 'min_child_weight': 6, 'subsample': 0.5780093202212182, 'colsample_bytree': 0.5779972601681014, 'gamma': 2.9152036385288193e-08, 'reg_alpha': 0.6245760287469893, 'reg_lambda': 0.002570603566117598}. Best is trial 0 with value: 0.7062873979233851.
[I 2025-11-13 17:55:02,104] Trial 1 finished with value: 0.7634962304109589 and parameters: {'n_estimators': 737, 'learning_rate': 0.010725209743171996, 'max_depth': 12, 'min_child_weight': 9, 'subsample': 0.6061695553391381, 'colsample_bytree': 0.5909124836035503, 'gamma': 2.9324868872723725e-07, 'reg_alpha': 5.472429642032198e-06, 'reg_lambda': 0.00052821153945323}. Best is trial 1 with value: 0.7634962304109589.
[I 2025-11-13 17:56:58,171] Trial 2 finished with value: 0.7633788426562494 and parameters: {'n_estimators': 489, 'learning_rate': 0.02692655251486473, 'max_depth': 9

### Train Final XGBoost with Best Params


In [10]:
%%time

scale_pos_weight_xgb = (y_train == 0).sum() / (y_train == 1).sum()

best_params_xgb = study_xgb.best_trial.params
best_params_xgb['scale_pos_weight'] = scale_pos_weight_xgb
best_params_xgb['random_state'] = 42
best_params_xgb['verbosity'] = 0
best_params_xgb['eval_metric'] = 'auc'

xgb_tuned = XGBClassifier(**best_params_xgb)
xgb_tuned.fit(X_train, y_train)

train_auc_xgb = roc_auc_score(y_train, xgb_tuned.predict_proba(X_train)[:, 1])
val_auc_xgb = roc_auc_score(y_val, xgb_tuned.predict_proba(X_val)[:, 1])

print("XGBoost Tuned Results:")
print(f"  Train AUC: {train_auc_xgb:.4f}")
print(f"  Val AUC: {val_auc_xgb:.4f}")
print(f"  Baseline CV AUC: {phase5_metadata['cv_eval']['xgb_cv_mean']:.4f}")
print(f"  Tuned CV AUC: {study_xgb.best_trial.value:.4f}")
print(f"  Improvement: {study_xgb.best_trial.value - phase5_metadata['cv_eval']['xgb_cv_mean']:.4f}")


XGBoost Tuned Results:
  Train AUC: 0.8157
  Val AUC: 0.7781
  Baseline CV AUC: 0.7674
  Tuned CV AUC: 0.7781
  Improvement: 0.0107
CPU times: total: 2min 46s
Wall time: 12.5 s


### Model Comparison


In [11]:
comparison_df = pd.DataFrame({
    'Model': [
        'LightGBM Baseline',
        'LightGBM Tuned',
        'XGBoost Baseline',
        'XGBoost Tuned'
    ],
    'CV AUC': [
        phase5_metadata['cv_eval']['lgb_cv_mean'],
        study_lgb.best_trial.value,
        phase5_metadata['cv_eval']['xgb_cv_mean'],
        study_xgb.best_trial.value
    ],
    'Val AUC': [
        phase5_metadata['quick_eval']['lgb_val_auc'],
        val_auc_lgb,
        phase5_metadata['quick_eval']['xgb_val_auc'],
        val_auc_xgb
    ],
    'Train AUC': [
        phase5_metadata['quick_eval']['lgb_train_auc'],
        train_auc_lgb,
        phase5_metadata['quick_eval']['xgb_train_auc'],
        train_auc_xgb
    ]
})

comparison_df['CV Improvement'] = comparison_df['CV AUC'] - [
    phase5_metadata['cv_eval']['lgb_cv_mean'],
    phase5_metadata['cv_eval']['lgb_cv_mean'],
    phase5_metadata['cv_eval']['xgb_cv_mean'],
    phase5_metadata['cv_eval']['xgb_cv_mean']
]

print("\nModel Performance Comparison:")
print(comparison_df.to_string(index=False))



Model Performance Comparison:
            Model   CV AUC  Val AUC  Train AUC  CV Improvement
LightGBM Baseline 0.775505 0.776472   0.862449        0.000000
   LightGBM Tuned 0.778320 0.777957   0.822262        0.002815
 XGBoost Baseline 0.767399 0.767688   0.929613        0.000000
    XGBoost Tuned 0.778145 0.778131   0.815673        0.010746


### Kaggle Submission

In [12]:
lgb_preds = lgb_tuned.predict_proba(X_test)[:, 1]
xgb_preds = xgb_tuned.predict_proba(X_test)[:, 1]

submission_lgb = pd.DataFrame({
    'SK_ID_CURR': test_ids,
    'TARGET': lgb_preds
})

submission_xgb = pd.DataFrame({
    'SK_ID_CURR': test_ids,
    'TARGET': xgb_preds
})

os.makedirs('../../data/submissions', exist_ok=True)

submission_lgb.to_csv('../../data/submissions/tuned_lightgbm_v1.csv', index=False)
submission_xgb.to_csv('../../data/submissions/tuned_xgboost_v1.csv', index=False)

print("Submission files created:")
print("  tuned_lightgbm_v1.csv")
print("  tuned_xgboost_v1.csv")
print(f"\nLightGBM predictions - Min: {lgb_preds.min():.4f}, Max: {lgb_preds.max():.4f}, Mean: {lgb_preds.mean():.4f}")
print(f"XGBoost predictions  - Min: {xgb_preds.min():.4f}, Max: {xgb_preds.max():.4f}, Mean: {xgb_preds.mean():.4f}")


Submission files created:
  tuned_lightgbm_v1.csv
  tuned_xgboost_v1.csv

LightGBM predictions - Min: 0.0035, Max: 0.9777, Mean: 0.3703
XGBoost predictions  - Min: 0.0057, Max: 0.9758, Mean: 0.3723


### MLflow Tracking

In [13]:
mlflow_tracking_uri = os.path.join(os.getcwd(), 'mlruns')
mlflow.set_tracking_uri(f"file:///{mlflow_tracking_uri}")

mlflow.set_experiment("model_optimization")

with mlflow.start_run(run_name="lightgbm_tuned"):
    mlflow.log_params(best_params_lgb)
    mlflow.log_param("n_features", X_train.shape[1])
    mlflow.log_param("n_trials", 150)
    
    mlflow.log_metric("train_auc", train_auc_lgb)
    mlflow.log_metric("val_auc", val_auc_lgb)
    mlflow.log_metric("cv_auc", study_lgb.best_trial.value)
    mlflow.log_metric("baseline_cv_auc", phase5_metadata['cv_eval']['lgb_cv_mean'])
    mlflow.log_metric("improvement", study_lgb.best_trial.value - phase5_metadata['cv_eval']['lgb_cv_mean'])
    
    X_sample = X_train.iloc[:5]
    y_sample = y_train[:5]
    signature = infer_signature(X_sample, y_sample)
    mlflow.sklearn.log_model(lgb_tuned, "model", signature=signature, input_example=X_sample)

print("Logged LightGBM tuned model to MLflow")

with mlflow.start_run(run_name="xgboost_tuned"):
    mlflow.log_params(best_params_xgb)
    mlflow.log_param("n_features", X_train.shape[1])
    mlflow.log_param("n_trials", 150)
    
    mlflow.log_metric("train_auc", train_auc_xgb)
    mlflow.log_metric("val_auc", val_auc_xgb)
    mlflow.log_metric("cv_auc", study_xgb.best_trial.value)
    mlflow.log_metric("baseline_cv_auc", phase5_metadata['cv_eval']['xgb_cv_mean'])
    mlflow.log_metric("improvement", study_xgb.best_trial.value - phase5_metadata['cv_eval']['xgb_cv_mean'])
    
    X_sample = X_train.iloc[:5]
    y_sample = y_train[:5]
    signature = infer_signature(X_sample, y_sample)
    mlflow.sklearn.log_model(xgb_tuned, "model", signature=signature, input_example=X_sample)

print("Logged XGBoost tuned model to MLflow")


2025/11/13 19:32:50 INFO mlflow.tracking.fluent: Experiment with name 'model_optimization' does not exist. Creating a new experiment.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]



Logged LightGBM tuned model to MLflow


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Logged XGBoost tuned model to MLflow


### Save Models & Artifacts

In [14]:
os.makedirs('../../models/tuned', exist_ok=True)

joblib.dump(lgb_tuned, '../../models/tuned/lightgbm_tuned_v1.pkl')
joblib.dump(xgb_tuned, '../../models/tuned/xgboost_tuned_v1.pkl')

with open('../../models/tuned/best_params_lgb.json', 'w') as f:
    json.dump(best_params_lgb, f, indent=2)

with open('../../models/tuned/best_params_xgb.json', 'w') as f:
    json.dump(best_params_xgb, f, indent=2)

joblib.dump(study_lgb, '../../models/tuned/optuna_study_lgb.pkl')
joblib.dump(study_xgb, '../../models/tuned/optuna_study_xgb.pkl')

tuning_metadata = {
    'lightgbm': {
        'best_params': best_params_lgb,
        'cv_auc': study_lgb.best_trial.value,
        'val_auc': val_auc_lgb,
        'train_auc': train_auc_lgb,
        'baseline_cv_auc': phase5_metadata['cv_eval']['lgb_cv_mean'],
        'improvement': study_lgb.best_trial.value - phase5_metadata['cv_eval']['lgb_cv_mean'],
        'n_trials': 150
    },
    'xgboost': {
        'best_params': best_params_xgb,
        'cv_auc': study_xgb.best_trial.value,
        'val_auc': val_auc_xgb,
        'train_auc': train_auc_xgb,
        'baseline_cv_auc': phase5_metadata['cv_eval']['xgb_cv_mean'],
        'improvement': study_xgb.best_trial.value - phase5_metadata['cv_eval']['xgb_cv_mean'],
        'n_trials': 150
    }
}

with open('../../models/tuned/tuning_metadata.json', 'w') as f:
    json.dump(tuning_metadata, f, indent=2)

print("Saved artifacts:")
print("  Models: lightgbm_tuned_v1.pkl, xgboost_tuned_v1.pkl")
print("  Best params: best_params_lgb.json, best_params_xgb.json")
print("  Optuna studies: optuna_study_lgb.pkl, optuna_study_xgb.pkl")
print("  Metadata: tuning_metadata.json")


Saved artifacts:
  Models: lightgbm_tuned_v1.pkl, xgboost_tuned_v1.pkl
  Best params: best_params_lgb.json, best_params_xgb.json
  Optuna studies: optuna_study_lgb.pkl, optuna_study_xgb.pkl
  Metadata: tuning_metadata.json
