# Predicting Diabetes using Gradient boosting classifier

In [1]:
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns

In [3]:
df = pd.read_csv("dataset/diabetes.csv")
df.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [5]:
from sklearn.model_selection import train_test_split

X = df.iloc[:, :-1]
y = df.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
X_train.shape, X_test.shape

((614, 8), (154, 8))

In [6]:
def replace_with_nan(X: pd.DataFrame, *, columns):
    X.loc[:, columns] = X.loc[:, columns].replace(0, np.nan)
    return X

In [7]:
def remove_columns(X: pd.DataFrame, *, columns):
    return X.drop(columns=columns)

In [8]:
from typing import List, Mapping

from sklearn.base import BaseEstimator, TransformerMixin


class FeatureInteractions(BaseEstimator, TransformerMixin):
    def __init__(self, feature_interactions: Mapping[str, List[str]]):
        self.feature_interactions = feature_interactions
        self.added_columns_ = []

    def fit(self, X, y, **fitparams):
        return self

    def transform(self, X):
        for feat1 in self.feature_interactions:
            for feat2 in self.feature_interactions[feat1]:
                self.added_columns_.append(f"{feat1}_{feat2}")
                X.loc[:, f"{feat1}_{feat2}"] = X[feat1] * X[feat2]
        return X

In [9]:
from sklearn.impute import KNNImputer


class KNNDataFrameImputer(KNNImputer):
    def transform(self, X):
        return pd.DataFrame(super().transform(X), columns=X.columns)

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler

missing_value_cols = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]

feature_interactions = {
    "Glucose": [
        "Age",
        "Pregnancies",
        "Insulin",
        "BMI",
        "BloodPressure",
        "DiabetesPedigreeFunction",
    ],
    "Insulin": ["Age", "BMI", "DiabetesPedigreeFunction"],
}

features_to_drop = ["SkinThickness", "Pregnancies", "BloodPressure"]

preprocessing_pipeline = Pipeline(
    steps=[
        (
            "nan_marker",
            FunctionTransformer(replace_with_nan, kw_args={"columns": missing_value_cols}),
        ),
        ("nan_imputer", KNNDataFrameImputer(n_neighbors=10)),
        ("feature_interactions", FeatureInteractions(feature_interactions)),
        (
            "features_remover",
            FunctionTransformer(remove_columns, kw_args={"columns": features_to_drop}),
        ),
        ("scaler", StandardScaler()),
    ]
)

In [11]:
from sklearn.ensemble import GradientBoostingClassifier

model_pipeline = Pipeline(
    steps=[
        ("data_preprocessing", preprocessing_pipeline),
        ("model", GradientBoostingClassifier(random_state=42)),
    ]
)

In [12]:
import warnings

# grid search
from sklearn.model_selection import RandomizedSearchCV

warnings.simplefilter("ignore")

gb_clf_params = {
    "model__n_estimators": range(100, 1000, 50),
    "model__learning_rate": [0.01, 0.1, 0.5, 1.0],
    "model__subsample": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    "model__max_features": [5, 7, 9, 11],
    "model__max_depth": range(2, 5),
    "model__min_samples_split": [2, 5, 10],
}

# random_cv = RandomizedSearchCV(model_pipeline, gb_clf_params, n_iter = 100, cv=10, random_state=42, n_jobs=-1, verbose=2, scoring="roc_auc")
# random_cv.fit(X_train, y_train)
# random_cv.best_params_

In [13]:
best_performing_params = {
    "model__subsample": 0.8,
    "model__n_estimators": 750,
    "model__min_samples_split": 5,
    "model__max_features": 5,
    "model__max_depth": 2,
    "model__learning_rate": 0.01,
}
model_pipeline.set_params(**best_performing_params)

Pipeline(steps=[('data_preprocessing',
                 Pipeline(steps=[('nan_marker',
                                  FunctionTransformer(func=<function replace_with_nan at 0x7f91b3d77c10>,
                                                      kw_args={'columns': ['Glucose',
                                                                           'BloodPressure',
                                                                           'SkinThickness',
                                                                           'Insulin',
                                                                           'BMI']})),
                                 ('nan_imputer',
                                  KNNDataFrameImputer(n_neighbors=10)),
                                 ('feature_interactions',
                                  FeatureInteractions(feature_interactions={'Glucose': ['Ag...
                                                                                        'Di

In [17]:
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
scores = cross_val_score(model_pipeline, X_train, y_train, cv=cv, scoring="roc_auc", n_jobs=-1)
scores.mean(), scores.std()

(0.8531962481962483, 0.039654106303011916)

In [15]:
# train tuned model
model_pipeline.fit(X_train, y_train)

Pipeline(steps=[('data_preprocessing',
                 Pipeline(steps=[('nan_marker',
                                  FunctionTransformer(func=<function replace_with_nan at 0x7f91b3d77c10>,
                                                      kw_args={'columns': ['Glucose',
                                                                           'BloodPressure',
                                                                           'SkinThickness',
                                                                           'Insulin',
                                                                           'BMI']})),
                                 ('nan_imputer',
                                  KNNDataFrameImputer(n_neighbors=10)),
                                 ('feature_interactions',
                                  FeatureInteractions(feature_interactions={'Glucose': ['Ag...
                                                                                        'Di

In [18]:
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_auc_score

y_test_pred = model_pipeline.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_test_pred)}")
print(f"ROC AUC score: {roc_auc_score(y_test, y_test_pred)}")
print(f"F1 score: {f1_score(y_test, y_test_pred)}")
print(f"Confusion Matrix: {confusion_matrix(y_test, y_test_pred)}")

Accuracy: 0.7207792207792207
ROC AUC score: 0.6785185185185186
F1 score: 0.5742574257425743
Confusion Matrix: [[82 18]
 [25 29]]
