In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import optuna

from sklearn.model_selection import StratifiedKFold

from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier, Pool
import lightgbm as lgb

from sklearn.metrics import roc_auc_score

In [None]:
# Load the training and test tables from CSV files in the relative data/ directory.
df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")

In [None]:
# Summarise the training frame: column names, non-null counts and dtypes.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700000 entries, 0 to 699999
Data columns (total 26 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   id                                  700000 non-null  int64  
 1   age                                 700000 non-null  int64  
 2   alcohol_consumption_per_week        700000 non-null  int64  
 3   physical_activity_minutes_per_week  700000 non-null  int64  
 4   diet_score                          700000 non-null  float64
 5   sleep_hours_per_day                 700000 non-null  float64
 6   screen_time_hours_per_day           700000 non-null  float64
 7   bmi                                 700000 non-null  float64
 8   waist_to_hip_ratio                  700000 non-null  float64
 9   systolic_bp                         700000 non-null  int64  
 10  diastolic_bp                        700000 non-null  int64  
 11  heart_rate                

In [None]:
# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,id,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,...,gender,ethnicity,education_level,income_level,smoking_status,employment_status,family_history_diabetes,hypertension_history,cardiovascular_history,diagnosed_diabetes
0,0,31,1,45,7.7,6.8,6.1,33.4,0.93,112,...,Female,Hispanic,Highschool,Lower-Middle,Current,Employed,0,0,0,1.0
1,1,50,2,73,5.7,6.5,5.8,23.8,0.83,120,...,Female,White,Highschool,Upper-Middle,Never,Employed,0,0,0,1.0
2,2,32,3,158,8.5,7.4,9.1,24.1,0.83,95,...,Male,Hispanic,Highschool,Lower-Middle,Never,Retired,0,0,0,0.0
3,3,54,3,77,4.6,7.0,9.2,26.6,0.83,121,...,Female,White,Highschool,Lower-Middle,Current,Employed,0,1,0,1.0
4,4,54,1,55,5.7,6.2,5.1,28.8,0.9,108,...,Male,White,Highschool,Upper-Middle,Never,Retired,0,1,0,1.0


In [None]:
# Print the (rows, columns) shape of the training and test frames.
print(df_train.shape, df_test.shape)

(700000, 26) (300000, 25)


In [None]:
# Numeric columns that receive an extra tree-binned categorical copy ("<col>_ai_bin")
# in the CatBoost pipeline.
target_cols_to_bin = [
    "age",
    "physical_activity_minutes_per_week",
    "cholesterol_total",
    "hdl_cholesterol",
    "ldl_cholesterol",
    "triglycerides",
    "diet_score",
    "screen_time_hours_per_day",
    "bmi"
] # Reduced sample based on feature importance of Catboost model with threshold bins

# Columns removed when building the feature matrices: the row id and the target.
drop_cols = ['id', 'diagnosed_diabetes']

### 1. Final models

Based on cross-validation results and feature importance analysis, the final solution uses **two gradient boosting models**:

#### a. CatBoost (with selective threshold bins)
CatBoost is trained using **threshold-binned features only for the most important predictors** (as identified from feature importance plots).  
This provides a small but consistent AUC improvement, likely because binned versions capture clinically meaningful cut-offs and reduce noise while preserving strong signals.

#### b. LightGBM (raw features only)
For LightGBM, we use **raw continuous features without binning**.  
In previous notebooks we only explored LightGBM with additional features; through trial and error we found that adding binned features reduced AUC, suggesting that LightGBM learns optimal split thresholds directly from the raw feature space and performs better without discretization.

### 2. Hyperparameter tuning with Optuna

We tune key hyperparameters using Optuna to balance **model capacity**, **generalization**, and **training stability**.

#### a. CatBoost tuned parameters
- `depth` (4–8): controls tree depth → higher depth increases interaction learning capacity but raises overfitting risk.
- `learning_rate` (0.01–0.2, log): step size for boosting → smaller values improve stability but require more iterations.
- `l2_leaf_reg` (1–20, log): L2 regularization on leaf values → reduces overfitting, especially when trees are deep.
- `one_hot_max_size` (2–10): threshold for one-hot encoding categorical features → controls categorical handling complexity and can affect generalization.

#### b. LightGBM tuned parameters
- `learning_rate` (0.01–0.2, log): controls boosting step size and convergence stability.
- `num_leaves` (30–80): determines tree complexity → main driver of model capacity and non-linearity.
- `max_depth` (5–10): depth constraint → prevents overly complex trees and helps generalization.
- `min_child_samples` (30–100): minimum observations per leaf → regularizes the tree structure to reduce overfitting.

Overall, these parameters are tuned because they directly control the **bias–variance tradeoff**, affecting how well each model captures non-linear risk patterns while maintaining robust CV performance. We did not tune additional params due to runtime constraints.

In [None]:
# 1. Helper functions
def fit_ai_binning(X, y, col, max_depth=3):
    """Learn supervised bin edges for one column from a shallow decision tree.

    A single-feature ``DecisionTreeClassifier`` is fitted on ``X[col]`` against
    ``y``; the thresholds of its split nodes become the interior bin edges.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame that contains ``col``.
    y : array-like
        Target labels aligned with the rows of ``X``.
    col : str
        Name of the column to derive bin edges for.
    max_depth : int, default 3
        Maximum depth of the tree, which bounds the number of edges produced.

    Returns
    -------
    list of float
        Strictly increasing, de-duplicated edges that start with ``-inf`` and
        end with ``+inf`` so every finite value falls into some bin.
    """
    # 1. Fill missing values of the column of interest with its mean, for stability of the tree
    X_clean = X[[col]].fillna(X[col].mean())

    # 2. Fit the tree; min_samples_leaf=0.05 keeps at least 5% of the rows in every leaf
    dt = DecisionTreeClassifier(max_depth=max_depth, min_samples_leaf=0.05, random_state=5)
    dt.fit(X_clean, y)

    # 3. Keep thresholds of split nodes only. A node is a split node when its two
    #    children differ; comparing the threshold itself with the -2 leaf sentinel
    #    would also discard a genuine split located at exactly -2.0.
    tree = dt.tree_
    is_split = np.asarray(tree.children_left) != np.asarray(tree.children_right)

    # 4. np.unique sorts and removes repeated edges (pd.cut rejects duplicates)
    thresholds = np.unique(np.asarray(tree.threshold)[is_split])

    # 5. Add open-ended safety edges
    return [-np.inf] + thresholds.tolist() + [np.inf]

def apply_binning(series, bins):
    """Replace each value by the index of its bin, returned as a string label.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to discretise.
    bins : sequence of float
        Increasing bin edges, e.g. the output of ``fit_ai_binning``.

    Returns
    -------
    pandas.Series
        String labels ``"0"``, ``"1"``, ... aligned with ``series``; values that
        are missing or fall outside ``bins`` are labelled ``"nan"``.
    """
    codes = pd.cut(series, bins=bins, labels=False, include_lowest=True)
    missing = codes.isna()

    # pd.cut yields integer codes when every value is binned but float codes as
    # soon as one value is not. Converting those directly to str would label the
    # same bin "0" in one frame and "0.0" in another, so cast to int first.
    labels = codes.fillna(-1).astype(int).astype(str)
    return labels.where(~missing, "nan")

In [None]:
def objective_lgb(trial):
    """Optuna objective: mean 5-fold validation AUC of LightGBM on the raw features.

    Reads the notebook-level ``X``, ``y`` and ``cat_features``. Each fold trains
    for up to 3000 rounds with early stopping (50 rounds) on the held-out fold
    and is scored with ROC AUC at the best iteration.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``learning_rate``, ``num_leaves``, ``max_depth``
        and ``min_child_samples``.

    Returns
    -------
    float
        Mean ROC AUC over the five folds.
    """
    # Tuned hyperparameters (sampled in this order)
    search_space = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 30, 80),
        'max_depth': trial.suggest_int('max_depth', 5, 10),
        'min_child_samples': trial.suggest_int('min_child_samples', 30, 100),
    }
    # Settings shared by every trial
    fixed_settings = {
        "device_type": "gpu",
        'objective': 'binary',
        'metric': 'auc',
        'verbosity': -1,
        'seed': 5,
        'is_unbalance': True,
    }
    params = {**search_space, **fixed_settings}

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=5)
    fold_aucs = []

    for fit_rows, holdout_rows in folds.split(X, y):
        # Slice out this fold's training and validation parts
        X_fit, y_fit = X.iloc[fit_rows].copy(), y.iloc[fit_rows].copy()
        X_holdout, y_holdout = X.iloc[holdout_rows].copy(), y.iloc[holdout_rows].copy()

        categorical = cat_features.copy()

        train_set = lgb.Dataset(X_fit, label=y_fit, categorical_feature=categorical)
        holdout_set = lgb.Dataset(
            X_holdout,
            label=y_holdout,
            categorical_feature=categorical,
            reference=train_set,
        )

        # Silent training; stops once the validation AUC has not improved for 50 rounds
        quiet_callbacks = [
            lgb.early_stopping(stopping_rounds=50, verbose=False),
            lgb.log_evaluation(period=0),
        ]
        booster = lgb.train(
            params,
            train_set,
            num_boost_round=3000,
            valid_sets=[holdout_set],
            callbacks=quiet_callbacks,
        )

        # Score the validation fold at the early-stopped iteration
        holdout_scores = booster.predict(X_holdout, num_iteration=booster.best_iteration)
        fold_aucs.append(roc_auc_score(y_holdout, holdout_scores))

    return np.mean(fold_aucs)

In [None]:
def objective_catboost(trial):
    """Optuna objective: mean 5-fold validation AUC of CatBoost with tree-binned features.

    Reads the notebook-level ``X``, ``y``, ``cat_features`` and
    ``target_cols_to_bin``. Inside every fold the bin edges are learned on the
    training part only and then applied to both parts, so the validation fold
    never influences the thresholds.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``depth``, ``learning_rate``, ``l2_leaf_reg`` and
        ``one_hot_max_size``.

    Returns
    -------
    float
        Mean ROC AUC over the five folds.
    """
    # Tuned hyperparameters (sampled in this order)
    search_space = {
        "depth": trial.suggest_int("depth", 4, 8),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
        "l2_leaf_reg": trial.suggest_int("l2_leaf_reg", 1, 20, log=True),
        'one_hot_max_size': trial.suggest_int('one_hot_max_size', 2, 10),
    }
    # Settings shared by every trial
    fixed_settings = {
        "loss_function": "Logloss",
        "eval_metric": "Logloss",
        "iterations": 3000,
        "early_stopping_rounds": 50,
        "random_seed": 5,
        "verbose": False,
        "auto_class_weights": "Balanced",
        "task_type": "GPU",
        "devices": "0",
    }
    params = {**search_space, **fixed_settings}

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=5)
    fold_aucs = []

    for fit_rows, holdout_rows in folds.split(X, y):
        # Slice out this fold's training and validation parts
        X_fit, y_fit = X.iloc[fit_rows].copy(), y.iloc[fit_rows].copy()
        X_holdout, y_holdout = X.iloc[holdout_rows].copy(), y.iloc[holdout_rows].copy()

        categorical = cat_features.copy()

        for col in target_cols_to_bin:
            # Edges come from the training part only ...
            edges = fit_ai_binning(X_fit, y_fit, col)

            # ... and are applied unchanged to both parts
            binned_name = f'{col}_ai_bin'
            for frame in (X_fit, X_holdout):
                frame[binned_name] = apply_binning(frame[col], edges)

            categorical.append(binned_name)

        fit_pool = Pool(X_fit, y_fit, cat_features=categorical)
        holdout_pool = Pool(X_holdout, y_holdout, cat_features=categorical)

        # Early stopping monitors Logloss on the validation pool; the best iteration is kept
        classifier = CatBoostClassifier(**params)
        classifier.fit(fit_pool, eval_set=holdout_pool, use_best_model=True)

        # Positive-class probabilities on the validation fold, scored with AUC
        holdout_scores = classifier.predict_proba(X_holdout)[:, 1]
        fold_aucs.append(roc_auc_score(y_holdout, holdout_scores))

    return np.mean(fold_aucs)

In [None]:
# 2. Prepare data
# Feature matrices without id/target; errors='ignore' tolerates columns a frame does not have.
X = df_train.drop(columns=drop_cols, errors='ignore')
X_test = df_test.drop(columns=drop_cols, errors='ignore')
y = df_train['diagnosed_diabetes']

# Categorical features: every non-numeric column plus three history flags added explicitly.
cat_features = list(X.select_dtypes(include=["object", "category", "bool"]).columns)
cat_features.extend(["family_history_diabetes", "hypertension_history", "cardiovascular_history"])

# Cast string columns to pandas 'category' in both the train and the test frame.
object_cols = [name for name in X.columns if X[name].dtype.name == 'object']
for col in object_cols:
    X[col] = X[col].astype('category')
    X_test[col] = X_test[col].astype('category')

In [None]:
# 3. Optuna HT
# The samplers are seeded so the sequence of suggested hyperparameters is repeatable
# across runs, in line with the fixed seed (5) used for the CV splits and the models.
study_catboost = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=5),
)
study_catboost.optimize(objective_catboost, n_trials=10)
print("Catboost complete!")
print("\n")

study_lgb = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=5),
)
study_lgb.optimize(objective_lgb, n_trials=10)
print("LGB complete!")
print("\n")

# Report the best mean CV AUC and the corresponding parameters of each study
print(f"Best CatBoost AUC: {study_catboost.best_value:.6f}")
print(f"Best CatBoost Params: {study_catboost.best_params}")
print("="*30)
print(f"Best LightGBM AUC: {study_lgb.best_value:.6f}")
print(f"Best LightGBM Params: {study_lgb.best_params}")

[I 2026-01-18 05:19:28,970] A new study created in memory with name: no-name-8a1bdcfc-7c4d-4782-b6e2-54b1df7c817d
[I 2026-01-18 05:20:54,355] Trial 0 finished with value: 0.7247611891204271 and parameters: {'depth': 6, 'learning_rate': 0.1507462563776378, 'l2_leaf_reg': 2, 'one_hot_max_size': 8}. Best is trial 0 with value: 0.7247611891204271.
[I 2026-01-18 05:24:16,717] Trial 1 finished with value: 0.7233330945620084 and parameters: {'depth': 5, 'learning_rate': 0.018379316083790474, 'l2_leaf_reg': 5, 'one_hot_max_size': 9}. Best is trial 0 with value: 0.7247611891204271.
[I 2026-01-18 05:27:36,698] Trial 2 finished with value: 0.7258609641757492 and parameters: {'depth': 5, 'learning_rate': 0.052194883186529706, 'l2_leaf_reg': 9, 'one_hot_max_size': 8}. Best is trial 2 with value: 0.7258609641757492.
[I 2026-01-18 05:36:20,459] Trial 3 finished with value: 0.7259526622387984 and parameters: {'depth': 4, 'learning_rate': 0.10409371160202006, 'l2_leaf_reg': 2, 'one_hot_max_size': 5}. B

Catboost complete!




[I 2026-01-18 07:15:53,584] Trial 0 finished with value: 0.7269563656572243 and parameters: {'learning_rate': 0.010728990647576297, 'num_leaves': 69, 'max_depth': 7, 'min_child_samples': 92}. Best is trial 0 with value: 0.7269563656572243.
[I 2026-01-18 07:22:39,140] Trial 1 finished with value: 0.7273120852674232 and parameters: {'learning_rate': 0.04050369958823009, 'num_leaves': 53, 'max_depth': 7, 'min_child_samples': 91}. Best is trial 1 with value: 0.7273120852674232.
[I 2026-01-18 07:26:13,038] Trial 2 finished with value: 0.7266457237872587 and parameters: {'learning_rate': 0.07017036076072067, 'num_leaves': 54, 'max_depth': 10, 'min_child_samples': 71}. Best is trial 1 with value: 0.7273120852674232.
[I 2026-01-18 07:40:21,579] Trial 3 finished with value: 0.7269209264290662 and parameters: {'learning_rate': 0.015293596331462664, 'num_leaves': 56, 'max_depth': 9, 'min_child_samples': 95}. Best is trial 1 with value: 0.7273120852674232.
[I 2026-01-18 07:48:26,354] Trial 4 finis

LGB complete!


Best CatBoost AUC: 0.725953
Best CatBoost Params: {'depth': 4, 'learning_rate': 0.10409371160202006, 'l2_leaf_reg': 2, 'one_hot_max_size': 5}
Best LightGBM AUC: 0.727312
Best LightGBM Params: {'learning_rate': 0.04050369958823009, 'num_leaves': 53, 'max_depth': 7, 'min_child_samples': 91}


In [None]:
# Best hyperparameters, hard-coded to the values printed by the Optuna run above
# (presumably so the final fit can be run without repeating the search).
best_params_catboost = {'depth': 4, 'learning_rate': 0.10409371160202006, 'l2_leaf_reg': 2, 'one_hot_max_size': 5}
best_params_lgb = {'learning_rate': 0.04050369958823009, 'num_leaves': 53, 'max_depth': 7, 'min_child_samples': 91}

In [None]:
# 4. Train final model
# Catboost
# Work on private copies so the shared X / X_test frames stay free of binned columns.
X_cb_train, X_cb_test = X.copy(), X_test.copy()
cb_cat_features = list(cat_features)

# Bin edges are re-learned on 100% of the training data and reused unchanged on test.
for col in target_cols_to_bin:
    feature_name = f'{col}_ai_bin'
    bins = fit_ai_binning(X_cb_train, y, col)

    for frame in (X_cb_train, X_cb_test):
        frame[feature_name] = apply_binning(frame[col], bins)

    cb_cat_features.append(feature_name)

# 1. Fixed settings first, then the tuned values layered on top
# NOTE(review): tuning scored each trial at its early-stopped best iteration, whereas
# this fit always runs all 3000 iterations - confirm that is intended.
final_cb_params = dict(
    iterations=3000,
    loss_function="Logloss",
    eval_metric="AUC",
    random_seed=5,
    thread_count=-1,
    verbose=500,
    auto_class_weights="Balanced",
)
final_cb_params.update(best_params_catboost)

# 2. Train CatBoost model on the full training set
train_pool = Pool(data=X_cb_train, label=y, cat_features=cb_cat_features)
cb_model = CatBoostClassifier(**final_cb_params)
cb_model.fit(train_pool)

0:	total: 1.72s	remaining: 1h 26m 5s
500:	total: 10m 18s	remaining: 51m 26s
1000:	total: 20m 38s	remaining: 41m 12s
1500:	total: 30m 59s	remaining: 30m 56s
2000:	total: 41m 22s	remaining: 20m 39s
2500:	total: 51m 52s	remaining: 10m 21s
2999:	total: 1h 2m 26s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7b6b888a9250>

In [None]:
# LightGBM
# 0. Prepare data for LGBM (raw features only, no binned columns)
lgb_cat_features = cat_features.copy()
X_lgb_train = X.copy()
X_lgb_test = X_test.copy()

# 1. Obtain best params for LGBM
# NOTE(review): tuning scored each trial at its early-stopped best iteration, whereas
# this fit always runs all 3000 rounds - confirm that is intended.
final_lgb_params = {
    'objective': 'binary',
    'metric': 'auc',
    # `verbosity` is a log level, not a print period: any value above 1 turns on
    # Debug output (one line per trained tree). -1 matches the tuning objective.
    'verbosity': -1,
    'seed': 5,
    'is_unbalance': True,
    **best_params_lgb
}

# 2. Train LGBM model on the full training set
dtrain_final = lgb.Dataset(X_lgb_train, label=y, categorical_feature=lgb_cat_features)

lgb_model = lgb.train(
    final_lgb_params,
    dtrain_final,
    num_boost_round=3000,
    # log_evaluation only prints when validation sets are passed; none are given here.
    callbacks=[lgb.log_evaluation(period=500)]
)

[LightGBM] [Info] Number of positive: 436307, number of negative: 263693
[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.816745
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.211965
[LightGBM] [Debug] init for col-wise cost 0.038538 seconds, init for row-wise cost 0.153631 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.089194 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 1652
[LightGBM] [Info] Number of data points in the train set: 700000, number of used features: 24
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.623296 -> initscore=0.503561
[LightGBM] [Info] Start training from score 0.503561
[LightGBM] [Debug] Trained a tree with leaves = 53 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 53 and depth = 7
[LightGBM

In [None]:
# 5. Final predictions
# CatBoost: positive-class column of predict_proba, on the test frame that carries the binned columns.
catboost_preds = cb_model.predict_proba(X_cb_test)[:, 1]
# LightGBM: Booster.predict on the raw-feature test frame.
lgb_preds = lgb_model.predict(X_lgb_test)

In [None]:
# Create Submission File
# Pair each test id with its CatBoost prediction.
submission_catboost = df_test[['id']].assign(diagnosed_diabetes=catboost_preds)

submission_catboost.to_csv('submission_catboost.csv', index=False)
print("submission_catboost.csv saved!")

submission_catboost.csv saved!


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Create Submission File
# Pair each test id with its LightGBM prediction. This cell previously wrote
# catboost_preds, so submission_lgb.csv duplicated the CatBoost submission.
submission_lgb = pd.DataFrame({
    'id': df_test['id'],
    'diagnosed_diabetes': lgb_preds
})

submission_lgb.to_csv('submission_lgb.csv', index=False)
print("submission_lgb.csv saved!")

submission_lgb.csv saved!


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>