# Imports, Data Loading, and Preprocessing

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold,
    cross_validate,
    GridSearchCV,
    RandomizedSearchCV
)
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    classification_report,
    RocCurveDisplay
)

import joblib
import scipy.stats as st

from xgboost import XGBClassifier

# Location of the raw dataset. Kept in one named constant so the notebook can
# be pointed at a different copy of the CSV (the default is a Colab-style path)
# without editing the loading call itself.
DATA_PATH = "/content/diabetes_prediction_dataset.csv"

df = pd.read_csv(DATA_PATH)

# Quick sanity check: first rows, then the relative frequency of each value of
# the "diabetes" target column.
print(df.head())
print(df["diabetes"].value_counts(normalize=True))

   gender   age  hypertension  heart_disease smoking_history    bmi  \
0  Female  80.0             0              1           never  25.19   
1  Female  54.0             0              0         No Info  27.32   
2    Male  28.0             0              0           never  27.32   
3  Female  36.0             0              0         current  23.45   
4    Male  76.0             1              1         current  20.14   

   HbA1c_level  blood_glucose_level  diabetes  
0          6.6                  140         0  
1          6.6                   80         0  
2          5.7                  158         0  
3          5.0                  155         0  
4          4.8                  155         0  
diabetes
0    0.915
1    0.085
Name: proportion, dtype: float64


In [2]:
# Columns used as model inputs; "diabetes" is the prediction target.
feature_cols = [
    "gender", "age", "hypertension", "heart_disease",
    "smoking_history", "bmi", "HbA1c_level", "blood_glucose_level",
]

X, y = df[feature_cols], df["diabetes"]

# Hold out 20% of the rows for testing. Stratifying on y keeps the target
# proportions the same in both partitions; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print(f"Train size: {X_train.shape[0]}, Test size: {X_test.shape[0]}")

Train size: 80000, Test size: 20000


In [3]:
# Column groups: string-valued columns get one-hot encoded, everything else is
# standardised.
categorical_features = ["gender", "smoking_history"]
numeric_features = [
    "age", "hypertension", "heart_disease",
    "bmi", "HbA1c_level", "blood_glucose_level",
]

# handle_unknown="ignore": categories not seen during fit do not raise at
# transform time.
categorical_transformer = OneHotEncoder(handle_unknown="ignore")
numeric_transformer = StandardScaler()

# Shared preprocessing step reused by every model pipeline in this notebook.
# The transformer order ("cat" first, then "num") fixes the output column order.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_transformer, categorical_features),
        ("num", numeric_transformer, numeric_features),
    ]
)

# Model Evaluation

In [4]:
def evaluate_model(pipeline, X_train, y_train, X_test, y_test, cv_splits=5, model_name="model"):
    """Cross-validate a pipeline on the training data, then score it on the test set.

    The function prints the mean of each cross-validation metric, fits
    ``pipeline`` on the full training data, and prints test-set metrics, a
    classification report and the confusion matrix.

    Parameters
    ----------
    pipeline : estimator
        Object exposing ``fit`` and ``predict`` and, for ROC-AUC, either
        ``predict_proba`` or ``decision_function``. It is fitted IN PLACE on
        ``(X_train, y_train)``, so the caller's object is a fitted model
        afterwards.
    X_train, y_train : training features / labels
        Used for both the stratified cross-validation and the final fit.
    X_test, y_test : held-out features / labels
        Used only for the final test-set metrics.
    cv_splits : int, default 5
        Number of stratified, shuffled folds (seeded with 42).
    model_name : str, default "model"
        Label used in the printed headers and in the returned summary.

    Returns
    -------
    dict
        Keys ``model_name``, ``cv_accuracy``, ``cv_roc_auc`` (means over the
        folds), ``test_accuracy`` and ``test_roc_auc``. ``test_roc_auc`` is NaN
        when the fitted pipeline exposes neither ``predict_proba`` nor
        ``decision_function``.
    """
    cross_validation = StratifiedKFold(n_splits=cv_splits, shuffle=True, random_state=42)

    scoring = {
        "accuracy": "accuracy",
        "precision": "precision",
        "recall": "recall",
        "f1": "f1",
        "roc_auc": "roc_auc",
    }

    cv_results = cross_validate(
        pipeline,
        X_train,
        y_train,
        cv=cross_validation,
        scoring=scoring,
        n_jobs=-1
    )

    print(f"\n----------- {model_name}: Cross-Validation -----------")
    for metric in scoring:
        scores = cv_results[f"test_{metric}"]
        print(f"{metric:9}: {scores.mean():.4f}")

    # Final fit on the whole training set; this mutates the caller's pipeline.
    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)

    # Continuous scores for ROC-AUC. Prefer probabilities; column 1 is taken as
    # the positive class (binary target assumed). Fall back to decision_function
    # for estimators that only expose margins, and to None when neither exists
    # (previously that case crashed inside roc_auc_score).
    if hasattr(pipeline, "predict_proba"):
        y_score = pipeline.predict_proba(X_test)[:, 1]
    elif hasattr(pipeline, "decision_function"):
        y_score = pipeline.decision_function(X_test)
    else:
        y_score = None

    print(f"\n----------- {model_name}: Test Set -----------")
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    roc_auc = roc_auc_score(y_test, y_score) if y_score is not None else float("nan")
    print(f"Accuracy : {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall   : {rec:.4f}")
    print(f"F1-score : {f1:.4f}")
    print(f"ROC-AUC  : {roc_auc:.3f}")

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, digits=3))

    cm = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:")
    print(cm)

    return {
        "model_name": model_name,
        "cv_accuracy": cv_results["test_accuracy"].mean(),
        "cv_roc_auc": cv_results["test_roc_auc"].mean(),
        "test_accuracy": acc,
        "test_roc_auc": roc_auc
    }


# Logistic Regression

In [5]:
# Baseline linear model. class_weight="balanced" reweights the classes to
# counter the imbalanced target.
log_reg = LogisticRegression(solver="lbfgs", class_weight="balanced", max_iter=1000)

log_reg_pipeline = Pipeline(
    steps=[("preprocess", preprocessor), ("model", log_reg)]
)

# Collects one summary dict per evaluated model; later cells append to it.
results = []

res_lr = evaluate_model(
    pipeline=log_reg_pipeline,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    cv_splits=5,
    model_name="Logistic Regression",
)
results.append(res_lr)



----------- Logistic Regression: Cross-Validation -----------
accuracy : 0.8870
precision: 0.4214
recall   : 0.8797
f1       : 0.5697
roc_auc  : 0.9620

----------- Logistic Regression: Test Set -----------
Accuracy : 0.8888
Precision: 0.4264
Recall   : 0.8935
F1-score : 0.5773
ROC-AUC  : 0.963

Classification Report:
              precision    recall  f1-score   support

           0      0.989     0.888     0.936     18300
           1      0.426     0.894     0.577      1700

    accuracy                          0.889     20000
   macro avg      0.708     0.891     0.757     20000
weighted avg      0.941     0.889     0.905     20000

Confusion Matrix:
[[16257  2043]
 [  181  1519]]


# Random Forest Classifier

In [6]:
# 200-tree random forest with balanced class weights and a fixed seed.
rf = RandomForestClassifier(class_weight="balanced", n_estimators=200, random_state=42)

rf_pipeline = Pipeline(
    steps=[("preprocess", preprocessor), ("model", rf)]
)

res_rf = evaluate_model(
    pipeline=rf_pipeline,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    cv_splits=5,
    model_name="Random Forest",
)
results.append(res_rf)


----------- Random Forest: Cross-Validation -----------
accuracy : 0.9697
precision: 0.9384
recall   : 0.6882
f1       : 0.7941
roc_auc  : 0.9649

----------- Random Forest: Test Set -----------
Accuracy : 0.9696
Precision: 0.9333
Recall   : 0.6912
F1-score : 0.7942
ROC-AUC  : 0.965

Classification Report:
              precision    recall  f1-score   support

           0      0.972     0.995     0.984     18300
           1      0.933     0.691     0.794      1700

    accuracy                          0.970     20000
   macro avg      0.953     0.843     0.889     20000
weighted avg      0.969     0.970     0.967     20000

Confusion Matrix:
[[18216    84]
 [  525  1175]]


# XGBoost

In [7]:
# Gradient-boosted trees with hand-picked starting hyperparameters.
# NOTE(review): unlike the logistic-regression and random-forest cells, no
# class weighting is passed here - confirm this is intentional.
xgb = XGBClassifier(
    max_depth=4,
    learning_rate=0.1,
    n_estimators=200,
    eval_metric="logloss",
    n_jobs=-1,
    random_state=42,
)

xgb_pipeline = Pipeline(
    steps=[("preprocess", preprocessor), ("model", xgb)]
)

res_xgb = evaluate_model(
    pipeline=xgb_pipeline,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    cv_splits=5,
    model_name="XGBoost",
)
results.append(res_xgb)


----------- XGBoost: Cross-Validation -----------
accuracy : 0.9721
precision: 0.9796
recall   : 0.6854
f1       : 0.8065
roc_auc  : 0.9794

----------- XGBoost: Test Set -----------
Accuracy : 0.9725
Precision: 0.9768
Recall   : 0.6924
F1-score : 0.8103
ROC-AUC  : 0.980

Classification Report:
              precision    recall  f1-score   support

           0      0.972     0.998     0.985     18300
           1      0.977     0.692     0.810      1700

    accuracy                          0.972     20000
   macro avg      0.974     0.845     0.898     20000
weighted avg      0.973     0.972     0.970     20000

Confusion Matrix:
[[18272    28]
 [  523  1177]]


# Neural Network

In [8]:
# Small feed-forward network: two hidden layers (32 then 16 units), ReLU
# activations, at most 300 training iterations, fixed seed.
mlp = MLPClassifier(
    activation="relu",
    hidden_layer_sizes=(32, 16),
    max_iter=300,
    random_state=42,
)

mlp_pipeline = Pipeline(
    steps=[("preprocess", preprocessor), ("model", mlp)]
)

res_mlp = evaluate_model(
    pipeline=mlp_pipeline,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    cv_splits=5,
    model_name="Neural Network",
)
results.append(res_mlp)


----------- Neural Network: Cross-Validation -----------
accuracy : 0.9712
precision: 0.9632
recall   : 0.6869
f1       : 0.8018
roc_auc  : 0.9764

----------- Neural Network: Test Set -----------
Accuracy : 0.9721
Precision: 0.9872
Recall   : 0.6806
F1-score : 0.8057
ROC-AUC  : 0.977

Classification Report:
              precision    recall  f1-score   support

           0      0.971     0.999     0.985     18300
           1      0.987     0.681     0.806      1700

    accuracy                          0.972     20000
   macro avg      0.979     0.840     0.895     20000
weighted avg      0.973     0.972     0.970     20000

Confusion Matrix:
[[18285    15]
 [  543  1157]]


# Model Selection

In [9]:
# One row per evaluated model, ranked by mean cross-validated ROC-AUC
# (best first). The ranking is computed once and reused for the top-2 slice.
results_df = pd.DataFrame(results)
ranked_df = results_df.sort_values(by="cv_roc_auc", ascending=False)

print("\n----------- Model Comparison (going by ROC Area Under Curve) -----------")
print(ranked_df)

top2 = ranked_df.head(2)
print("\n----------- Top 2 models -----------")
print(top2)


----------- Model Comparison (going by ROC Area Under Curve) -----------
            model_name  cv_accuracy  cv_roc_auc  test_accuracy  test_roc_auc
2              XGBoost     0.972050    0.979362        0.97245      0.979729
3       Neural Network     0.971150    0.976368        0.97210      0.977447
1        Random Forest     0.969663    0.964904        0.96955      0.964561
0  Logistic Regression     0.886975    0.961996        0.88880      0.962942

----------- Top 2 models -----------
       model_name  cv_accuracy  cv_roc_auc  test_accuracy  test_roc_auc
2         XGBoost      0.97205    0.979362        0.97245      0.979729
3  Neural Network      0.97115    0.976368        0.97210      0.977447


# XGBoost (with Hyperparameter Tuning) **SELECTED MODEL**

In [12]:
# Exhaustive grid over 3 x 3 x 2 = 18 hyperparameter combinations. The
# "model__" prefix routes each parameter to the pipeline's "model" step.
xgb_param_grid = dict(
    model__n_estimators=[100, 200, 300],
    model__max_depth=[3, 4, 5],
    model__learning_rate=[0.1, 0.05],
)

xgb_estimator = XGBClassifier(random_state=42, eval_metric="logloss", n_jobs=-1)
xgb_base = Pipeline(
    steps=[("preprocess", preprocessor), ("model", xgb_estimator)]
)

# 5-fold search scored by ROC-AUC, the same metric used to rank models above.
xgb_grid_search = GridSearchCV(
    xgb_base,
    xgb_param_grid,
    scoring="roc_auc",
    cv=5,
    n_jobs=-1,
    verbose=1,
)
xgb_grid_search.fit(X_train, y_train)

print("\nBest XGBoost params:", xgb_grid_search.best_params_)
print("Best XGBoost CV ROC-AUC:", xgb_grid_search.best_score_)

# Winning configuration; evaluate_model cross-validates it again and refits it
# on the training data before scoring the test set.
best_xgb_pipeline = xgb_grid_search.best_estimator_

res_xgb_tuned = evaluate_model(
    pipeline=best_xgb_pipeline,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    cv_splits=5,
    model_name="XGBoost (tuned)",
)
results.append(res_xgb_tuned)

Fitting 5 folds for each of 18 candidates, totalling 90 fits

Best XGBoost params: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__n_estimators': 200}
Best XGBoost CV ROC-AUC: 0.9794845457650274

----------- XGBoost (tuned): Cross-Validation -----------
accuracy : 0.9722
precision: 0.9839
recall   : 0.6837
f1       : 0.8067
roc_auc  : 0.9795

----------- XGBoost (tuned): Test Set -----------
Accuracy : 0.9725
Precision: 0.9816
Recall   : 0.6894
F1-score : 0.8100
ROC-AUC  : 0.980

Classification Report:
              precision    recall  f1-score   support

           0      0.972     0.999     0.985     18300
           1      0.982     0.689     0.810      1700

    accuracy                          0.973     20000
   macro avg      0.977     0.844     0.898     20000
weighted avg      0.973     0.973     0.970     20000

Confusion Matrix:
[[18278    22]
 [  528  1172]]


# Neural Network (with Hyperparameter Tuning)

In [15]:
# Search space: 2 x 2 x 2 = 8 combinations, of which the randomized search
# below samples 4 (n_iter=4).
mlp_param_dist = dict(
    model__hidden_layer_sizes=[(32, 16), (64, 32)],
    model__alpha=[1e-4, 1e-3],
    model__learning_rate_init=[0.001, 0.005],
)

# Note: this base network is capped at 100 iterations, whereas the untuned
# baseline cell used max_iter=300.
mlp_estimator = MLPClassifier(max_iter=100, activation="relu", random_state=42)
mlp_base = Pipeline(
    steps=[("preprocess", preprocessor), ("model", mlp_estimator)]
)

# 3-fold randomized search scored by ROC-AUC; seeded so the sampled
# candidates are repeatable.
mlp_random_search = RandomizedSearchCV(
    mlp_base,
    mlp_param_dist,
    n_iter=4,
    scoring="roc_auc",
    cv=3,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)
mlp_random_search.fit(X_train, y_train)

print("\nBest MLP params:", mlp_random_search.best_params_)
print("Best MLP CV ROC-AUC:", mlp_random_search.best_score_)

# Winning configuration; evaluate_model cross-validates it again and refits it
# on the training data before scoring the test set.
best_mlp_pipeline = mlp_random_search.best_estimator_

res_mlp_tuned = evaluate_model(
    pipeline=best_mlp_pipeline,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    cv_splits=5,
    model_name="Neural Network (tuned)",
)
results.append(res_mlp_tuned)

Fitting 3 folds for each of 4 candidates, totalling 12 fits

Best MLP params: {'model__learning_rate_init': 0.001, 'model__hidden_layer_sizes': (32, 16), 'model__alpha': 0.0001}
Best MLP CV ROC-AUC: 0.9756358813602288

----------- Neural Network (tuned): Cross-Validation -----------
accuracy : 0.9712
precision: 0.9632
recall   : 0.6869
f1       : 0.8018
roc_auc  : 0.9764

----------- Neural Network (tuned): Test Set -----------
Accuracy : 0.9721
Precision: 0.9872
Recall   : 0.6806
F1-score : 0.8057
ROC-AUC  : 0.977

Classification Report:
              precision    recall  f1-score   support

           0      0.971     0.999     0.985     18300
           1      0.987     0.681     0.806      1700

    accuracy                          0.972     20000
   macro avg      0.979     0.840     0.895     20000
weighted avg      0.973     0.972     0.970     20000

Confusion Matrix:
[[18285    15]
 [  543  1157]]


# Ensemble Methods

In [16]:
# Soft-voting ensemble of the two tuned pipelines: predictions come from the
# averaged class probabilities of both members.
ensemble = VotingClassifier(
    estimators=[("xgb", best_xgb_pipeline), ("mlp", best_mlp_pipeline)],
    voting="soft",
)
ensemble.fit(X_train, y_train)

# Hard labels for the threshold metrics; column 1 of predict_proba is used as
# the positive-class score for ROC-AUC.
y_pred_ens = ensemble.predict(X_test)
y_proba_ens = ensemble.predict_proba(X_test)[:, 1]

ensemble_metrics = {
    "Accuracy :": accuracy_score(y_test, y_pred_ens),
    "Precision:": precision_score(y_test, y_pred_ens, zero_division=0),
    "Recall   :": recall_score(y_test, y_pred_ens, zero_division=0),
    "F1-score :": f1_score(y_test, y_pred_ens, zero_division=0),
    "ROC-AUC  :": roc_auc_score(y_test, y_proba_ens),
}

print("\n----------- Ensemble (XGBoost + NN): Test Set -----------")
for metric_label, metric_value in ensemble_metrics.items():
    print(metric_label, metric_value)


----------- Ensemble (XGBoost + NN): Test Set -----------
Accuracy : 0.9726
Precision: 0.9889643463497453
Recall   : 0.6852941176470588
F1-score : 0.8095899930507297
ROC-AUC  : 0.9792338958534234


# Saving & Importing the Model

In [17]:
# Persist the two tuned pipelines and the voting ensemble to the current
# working directory. Each artifact bundles preprocessing and model, so it can
# be applied directly to raw feature frames after loading.
joblib.dump(best_xgb_pipeline, "best_xgb_pipeline.joblib")
# Plain string literal: the original used an f-string with no placeholders.
joblib.dump(best_mlp_pipeline, "best_mlp_pipeline.joblib")
joblib.dump(ensemble, "best_ensemble_voting.joblib")

['best_ensemble_voting.joblib']

In [18]:
# Reload the soft-voting ensemble from the file written by the previous cell.
# NOTE: joblib artifacts are pickle-based, so loading one can execute arbitrary
# code - only load files from a source you trust.
model = joblib.load("best_ensemble_voting.joblib")