# Advanced ML Models with Optuna Tuning
Yuvraj Grover

This notebook implements and tunes advanced ML models using Optuna:
- XGBoost
- LightGBM
- CatBoost
- MLP (optional revisit)

Each model will be trained and tuned using Optuna and evaluated on the diabetic dataset.

In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb
import optuna
import warnings
warnings.filterwarnings('ignore')

In [26]:
# Load preprocessed data
df = pd.read_pickle('../../dataPreprocessing/medical_data.pkl')
df.head()


Unnamed: 0,encounter_id,patient_nbr,gender,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,...,mb_readmitted_no_ct,mb_num_lab_procedures_ct,mb_num_procedures_ct,mb_num_medications_ct,mb_number_outpatient_ct,mb_number_emergency_ct,mb_number_inpatient_ct,mb_number_diagnoses_ct,age_encoded,dummy
2,64410,86047875,Female,2,11,5,13,2,0,1,...,1,11,5,13,2,0,1,6,1.0,1
3,500364,82442376,Male,2,44,1,16,0,0,0,...,1,44,1,16,0,0,0,7,2.0,1
4,16680,42519267,Male,1,51,0,8,0,0,0,...,1,51,0,8,0,0,0,5,3.0,1
5,35754,82637451,Male,3,31,6,16,0,0,0,...,0,31,6,16,0,0,0,9,4.0,1
6,55842,84259809,Male,4,70,1,21,0,0,0,...,1,70,1,21,0,0,0,7,5.0,1


In [29]:
# Define features and target
# Drop Gurmat's excluded columns + target column
# Drop known leaky features
# List of leaky features to drop
leaky_features = [
    'mb_readmitted_gt30_ct', 'mb_readmitted_no_ct', 'mb_readmitted_lt30_ct',
    'distinct_diag_count', 'encounter_ct', 'mb_number_diagnoses_ct',
    'mb_num_lab_procedures_ct', 'mb_num_medications_ct', 'mb_time_in_hospital',
    'mb_admission_grp_1_ct', 'mb_discharge_grp_1_ct', 'mb_number_inpatient_ct'
]

X = df.drop(columns=leaky_features + ['readmitted', 'readmitted_ind'], errors='ignore')
y = df['readmitted_ind']




X_encoded = X.copy()
for col in X_encoded.select_dtypes(include='object').columns:
    X_encoded[col] = X_encoded[col].astype('category')
# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42, stratify=y
)

## 🔍 Model 1: XGBoost with Optuna

In [30]:
def objective_xgb(trial):
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'use_label_encoder': False,
        'enable_categorical': True,
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0)
    }

    model = xgb.XGBClassifier(**params)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    accuracy = accuracy_score(y_test, preds)
    return accuracy

# Run Optuna Study
study_xgb = optuna.create_study(direction='maximize')
study_xgb.optimize(objective_xgb, n_trials=30)

# Final model with best parameters
best_params_xgb = study_xgb.best_params
best_params_xgb.update({
    'use_label_encoder': False,
    'eval_metric': 'logloss',
    'enable_categorical': True
})

final_xgb = xgb.XGBClassifier(**best_params_xgb)
final_xgb.fit(X_train, y_train)
preds_xgb = final_xgb.predict(X_test)

# Evaluation
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, preds_xgb))
print("\nClassification Report:")
print(classification_report(y_test, preds_xgb))


[I 2025-07-01 16:58:35,439] A new study created in memory with name: no-name-79cb9b04-c5f3-46ed-b0bf-b2f9632c0ef3
[I 2025-07-01 16:58:36,656] Trial 0 finished with value: 0.7465086483023703 and parameters: {'max_depth': 7, 'learning_rate': 0.21336808907994123, 'n_estimators': 140, 'subsample': 0.7789044907363574, 'colsample_bytree': 0.9901488965167514}. Best is trial 0 with value: 0.7465086483023703.
[I 2025-07-01 16:58:38,198] Trial 1 finished with value: 0.7463164638052531 and parameters: {'max_depth': 3, 'learning_rate': 0.07714809170962164, 'n_estimators': 446, 'subsample': 0.685904276505674, 'colsample_bytree': 0.6579076934556752}. Best is trial 0 with value: 0.7465086483023703.
[I 2025-07-01 16:58:44,169] Trial 2 finished with value: 0.7560538116591928 and parameters: {'max_depth': 8, 'learning_rate': 0.022197446502952942, 'n_estimators': 668, 'subsample': 0.650613659898418, 'colsample_bytree': 0.6491621496699289}. Best is trial 2 with value: 0.7560538116591928.
[I 2025-07-01 16:


Confusion Matrix:
[[6978 1484]
 [2268 4880]]

Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.82      0.79      8462
           1       0.77      0.68      0.72      7148

    accuracy                           0.76     15610
   macro avg       0.76      0.75      0.76     15610
weighted avg       0.76      0.76      0.76     15610



## 🔍 Model 2: LightGBM with Optuna

In [31]:
# Code for Optuna tuning of LightGBM goes here
import lightgbm as lgb
# Objective function for Optuna
def objective_lgb(trial):
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'verbosity': -1,
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'num_leaves': trial.suggest_int('num_leaves', 31, 256),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7)
    }

    model = lgb.LGBMClassifier(**params)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    return accuracy_score(y_test, preds)

# Run the Optuna study
study_lgb = optuna.create_study(direction='maximize')
study_lgb.optimize(objective_lgb, n_trials=30)

# Train final model using best parameters
print("Best Trial:")
print(study_lgb.best_trial)

best_params_lgb = study_lgb.best_params
final_lgb = lgb.LGBMClassifier(**best_params_lgb)
final_lgb.fit(X_train, y_train)
preds_lgb = final_lgb.predict(X_test)

# Evaluation
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, preds_lgb))
print("\nClassification Report:")
print(classification_report(y_test, preds_lgb))


[I 2025-07-01 17:01:29,910] A new study created in memory with name: no-name-bf0730bd-622c-43f9-8b58-3201d394021d
[I 2025-07-01 17:01:31,701] Trial 0 finished with value: 0.7483023702754644 and parameters: {'max_depth': 4, 'learning_rate': 0.23553809196119352, 'n_estimators': 471, 'num_leaves': 242, 'feature_fraction': 0.6498809518634763, 'bagging_fraction': 0.7467472269571367, 'bagging_freq': 7}. Best is trial 0 with value: 0.7483023702754644.
[I 2025-07-01 17:01:32,133] Trial 1 finished with value: 0.7556053811659192 and parameters: {'max_depth': 3, 'learning_rate': 0.13911311615537464, 'n_estimators': 162, 'num_leaves': 100, 'feature_fraction': 0.9622771696502093, 'bagging_fraction': 0.9408174011550657, 'bagging_freq': 4}. Best is trial 1 with value: 0.7556053811659192.
[I 2025-07-01 17:01:47,592] Trial 2 finished with value: 0.7457399103139013 and parameters: {'max_depth': 9, 'learning_rate': 0.25597204014753483, 'n_estimators': 619, 'num_leaves': 119, 'feature_fraction': 0.6313683

Best Trial:
FrozenTrial(number=25, state=TrialState.COMPLETE, values=[0.7706598334401025], datetime_start=datetime.datetime(2025, 7, 1, 17, 5, 20, 636672), datetime_complete=datetime.datetime(2025, 7, 1, 17, 5, 33, 239316), params={'max_depth': 10, 'learning_rate': 0.0330233031920151, 'n_estimators': 370, 'num_leaves': 175, 'feature_fraction': 0.9268252303501577, 'bagging_fraction': 0.9610706119206426, 'bagging_freq': 5}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'max_depth': IntDistribution(high=10, log=False, low=3, step=1), 'learning_rate': FloatDistribution(high=0.3, log=False, low=0.01, step=None), 'n_estimators': IntDistribution(high=1000, log=False, low=100, step=1), 'num_leaves': IntDistribution(high=256, log=False, low=31, step=1), 'feature_fraction': FloatDistribution(high=1.0, log=False, low=0.6, step=None), 'bagging_fraction': FloatDistribution(high=1.0, log=False, low=0.6, step=None), 'bagging_freq': IntDistribution(high=7, log=False, low=1, st

## 🔍 Model 3: CatBoost with Optuna

In [35]:
from catboost import CatBoostClassifier, Pool

# Clean up categories — convert to string and fill NaN
X_train = X_train.copy()
X_test = X_test.copy()

for col in X_train.select_dtypes(include='category').columns:
    X_train[col] = X_train[col].astype(str).fillna("missing")
    X_test[col] = X_test[col].astype(str).fillna("missing")

# Define categorical feature names (not indices)
cat_features = X_train.select_dtypes(include='object').columns.tolist()

# Define Optuna objective for CatBoost
def objective_cat(trial):
    params = {
        'loss_function': 'Logloss',
        'eval_metric': 'Accuracy',
        'verbose': 0,
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1.0, 10.0),
        'random_strength': trial.suggest_float('random_strength', 1.0, 10.0),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
        'border_count': trial.suggest_int('border_count', 32, 255)
    }

    train_pool = Pool(X_train, y_train, cat_features=cat_features)
    test_pool = Pool(X_test, y_test, cat_features=cat_features)

    model = CatBoostClassifier(**params)
    model.fit(train_pool)
    preds = model.predict(test_pool)
    return accuracy_score(y_test, preds)

# Run Optuna
study_cat = optuna.create_study(direction='maximize')
study_cat.optimize(objective_cat, n_trials=30)

# Final model training
print("Best Trial:")
print(study_cat.best_trial)

best_params_cat = study_cat.best_params
final_cat = CatBoostClassifier(**best_params_cat, verbose=0)
final_cat.fit(Pool(X_train, y_train, cat_features=cat_features))
preds_cat = final_cat.predict(Pool(X_test, cat_features=cat_features))

# Evaluation
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, preds_cat))
print("\nClassification Report:")
print(classification_report(y_test, preds_cat))


[I 2025-07-01 17:14:23,310] A new study created in memory with name: no-name-14804748-d3f5-4daf-b02b-6fb983a55980
[I 2025-07-01 17:14:30,069] Trial 0 finished with value: 0.7762972453555413 and parameters: {'depth': 7, 'learning_rate': 0.27942876055489146, 'iterations': 385, 'l2_leaf_reg': 3.7666709530600704, 'random_strength': 2.869201368913176, 'bagging_temperature': 0.7133209374572772, 'border_count': 160}. Best is trial 0 with value: 0.7762972453555413.
[I 2025-07-01 17:14:33,212] Trial 1 finished with value: 0.7744394618834081 and parameters: {'depth': 6, 'learning_rate': 0.16248946100352665, 'iterations': 216, 'l2_leaf_reg': 6.4093189446122425, 'random_strength': 4.159365909459203, 'bagging_temperature': 0.8789546266081864, 'border_count': 101}. Best is trial 0 with value: 0.7762972453555413.
[I 2025-07-01 17:14:39,105] Trial 2 finished with value: 0.7750800768737989 and parameters: {'depth': 8, 'learning_rate': 0.20498379926146884, 'iterations': 287, 'l2_leaf_reg': 4.03881863561

Best Trial:
FrozenTrial(number=20, state=TrialState.COMPLETE, values=[0.7764253683536195], datetime_start=datetime.datetime(2025, 7, 1, 17, 17, 10, 253043), datetime_complete=datetime.datetime(2025, 7, 1, 17, 17, 18, 825554), params={'depth': 9, 'learning_rate': 0.23369341715704162, 'iterations': 328, 'l2_leaf_reg': 5.557767576085904, 'random_strength': 3.676883873061694, 'bagging_temperature': 0.8291315217645151, 'border_count': 110}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'depth': IntDistribution(high=10, log=False, low=4, step=1), 'learning_rate': FloatDistribution(high=0.3, log=False, low=0.01, step=None), 'iterations': IntDistribution(high=1000, log=False, low=100, step=1), 'l2_leaf_reg': FloatDistribution(high=10.0, log=False, low=1.0, step=None), 'random_strength': FloatDistribution(high=10.0, log=False, low=1.0, step=None), 'bagging_temperature': FloatDistribution(high=1.0, log=False, low=0.0, step=None), 'border_count': IntDistribution(high=255,

## Model 4: GradientBoostingClassifier with Optuna

In [39]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder


X = X.copy()  # Just to be safe

for col in X.select_dtypes(include='object').columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))  # Convert NaN and objects to string first

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Optuna objective function
def objective_gb(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 300),  # Lower
        'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.2),  # Focused range
        'max_depth': trial.suggest_int('max_depth', 3, 6),  # Shallower trees
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 5),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 3),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2'])  # No None
    }

    model = GradientBoostingClassifier(**params, random_state=42)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    return accuracy_score(y_test, preds)


# Run Optuna
study_gb = optuna.create_study(direction='maximize')
study_gb.optimize(objective_gb, n_trials=30)

# Final model
print("Best Trial:")
print(study_gb.best_trial)

best_params_gb = study_gb.best_params
final_gb = GradientBoostingClassifier(**best_params_gb, random_state=42)
final_gb.fit(X_train, y_train)
preds_gb = final_gb.predict(X_test)

# Evaluation
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, preds_gb))
print("\nClassification Report:")
print(classification_report(y_test, preds_gb))


[I 2025-07-01 17:37:08,881] A new study created in memory with name: no-name-64606dd3-e0b7-4bfc-b311-fc595bc391e8
[I 2025-07-01 17:37:11,611] Trial 0 finished with value: 0.7518898142216528 and parameters: {'n_estimators': 146, 'learning_rate': 0.15723376816196039, 'max_depth': 4, 'min_samples_split': 5, 'min_samples_leaf': 3, 'subsample': 0.9076467585015491, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.7518898142216528.
[I 2025-07-01 17:37:15,523] Trial 1 finished with value: 0.7441383728379244 and parameters: {'n_estimators': 289, 'learning_rate': 0.09910592352981151, 'max_depth': 3, 'min_samples_split': 5, 'min_samples_leaf': 2, 'subsample': 0.8128949041834826, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.7518898142216528.
[I 2025-07-01 17:37:20,057] Trial 2 finished with value: 0.7574631646380525 and parameters: {'n_estimators': 273, 'learning_rate': 0.1804533457435601, 'max_depth': 6, 'min_samples_split': 5, 'min_samples_leaf': 3, 'subsample': 0.900384375727321

Best Trial:
FrozenTrial(number=29, state=TrialState.COMPLETE, values=[0.7645739910313901], datetime_start=datetime.datetime(2025, 7, 1, 17, 39, 17, 839996), datetime_complete=datetime.datetime(2025, 7, 1, 17, 39, 24, 469548), params={'n_estimators': 286, 'learning_rate': 0.10561143960860893, 'max_depth': 6, 'min_samples_split': 5, 'min_samples_leaf': 2, 'subsample': 0.7863867061472648, 'max_features': 'sqrt'}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'n_estimators': IntDistribution(high=300, log=False, low=100, step=1), 'learning_rate': FloatDistribution(high=0.2, log=False, low=0.05, step=None), 'max_depth': IntDistribution(high=6, log=False, low=3, step=1), 'min_samples_split': IntDistribution(high=5, log=False, low=2, step=1), 'min_samples_leaf': IntDistribution(high=3, log=False, low=1, step=1), 'subsample': FloatDistribution(high=1.0, log=False, low=0.7, step=None), 'max_features': CategoricalDistribution(choices=('sqrt', 'log2'))}, trial_id=29, value

### Model Leaderboard (Based on Classification Reports)

| Model                | Accuracy | Precision | Recall | F1 Score |
|----------------------|----------|-----------|--------|----------|
| **CatBoost**         | 0.78     | 0.79      | 0.69   | 0.74     |
| **LightGBM**         | 0.77     | 0.78      | 0.69   | 0.73     |
| **XGBoost**          | 0.76     | 0.77      | 0.68   | 0.72     |
| **GradientBoosting** | 0.76     | 0.78      | 0.67   | 0.72     |

**Notes:**
- All models were tuned using Optuna.
- Input data and evaluation method were consistent across models.
- CatBoost achieved the highest F1-score and accuracy overall.
