# Diabetes classification

In [19]:
import numpy
import pandas
import random

#models
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.dummy import DummyClassifier

#utils
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

#fixing imbalance, for method 2 below
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

#warnings
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.simplefilter(action="ignore", category=UserWarning)

In [20]:
#load cleaned and rebalanced dataset: requires diabetes_eda.ipynb to be run first. For use with predictions method 1 below
df = pandas.read_csv("diabetes-dataset-overunder.csv", index_col=0)
df.head(5)

Unnamed: 0,gender,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,smoking_history_current,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current,diabetes
0,0,0.003458,0,0,0.002846,0.003201,0.003076,False,False,True,False,False,0
1,0,0.002726,0,0,0.005184,0.003258,0.002768,False,False,False,True,False,0
2,1,0.002859,0,0,0.003073,0.003651,0.003076,False,False,False,True,False,0
3,1,0.003192,0,0,0.003116,0.001966,0.003405,False,False,False,True,False,0
4,0,0.002793,0,0,0.004459,0.003258,0.001867,False,False,False,True,False,0


##### 1. Split into train and test sets

In [21]:
#split into independent variables and target
X, y = df.drop(columns="diabetes"), df.loc[:, "diabetes"]

#training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=40,
    shuffle=True
)

##### 2. Predictions

In [22]:
seed = random.randint(1, 5)

ml_models = [
    ("SVM", SVC(random_state=seed)),
    ("Random Forest", RandomForestClassifier(random_state=seed)),
    ("K Nearest Neighbours", KNeighborsClassifier()),
    ("Gradient Boosting", GradientBoostingClassifier(random_state=seed)),
    ("Logistic Regression", LogisticRegression(random_state=seed)),
    ("Dummy Classifier", DummyClassifier())
]

In [23]:
"""
-----------
PREDICTIONS: METHOD 1: USING THE OVERUNDER REBALANCED DATA GENERATED FROM diabetes_eda.ipynb AND SAVED TO CSV
-----------
"""
results = {}

for model_name, model in ml_models:
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    
    print(f"Sample of {model_name} predictions: {predictions[50:60]}\n")
    
    accuracy = accuracy_score(y_test, predictions)
    roc_auc = roc_auc_score(y_test, predictions)
    
    results[model_name] = [accuracy, roc_auc]
    
s = ""
for val in y_test[50:60]: s += str(val) + " "
print(f" -> Correct values for same sample: [{s}]")

Sample of SVM predictions: [1 0 0 1 1 0 1 0 0 0]

Sample of Random Forest predictions: [1 1 0 1 1 0 1 0 0 1]

Sample of K Nearest Neighbours predictions: [1 1 0 1 1 0 0 0 0 0]

Sample of Gradient Boosting predictions: [1 1 0 1 1 0 1 0 0 1]

Sample of Logistic Regression predictions: [1 0 0 1 1 0 1 0 0 0]

Sample of Dummy Classifier predictions: [0 0 0 0 0 0 0 0 0 0]

 -> Correct values for same sample: [0 1 0 1 1 1 1 0 0 1 ]


In [24]:
for x in results:
    print(f"{x}:\nROC AUC score: {results[x][1]}\nAccuracy score: {results[x][0]}\n\n")

SVM:
ROC AUC score: 0.6397609041809428
Accuracy score: 0.7323327990674632


Random Forest:
ROC AUC score: 0.9025086702673087
Accuracy score: 0.9181116129972315


K Nearest Neighbours:
ROC AUC score: 0.8717886718653393
Accuracy score: 0.8888241293894799


Gradient Boosting:
ROC AUC score: 0.9091378815060297
Accuracy score: 0.923648550196707


Logistic Regression:
ROC AUC score: 0.6397609041809428
Accuracy score: 0.7323327990674632


Dummy Classifier:
ROC AUC score: 0.5
Accuracy score: 0.6637039195687018




In [25]:
"""
-----------
PREDICTIONS: METHOD 2: OVER/UNDERSAMPLING DATA HERE USING PIPELINE INSTEAD OF USING THE EXPORTED DATA FROM diabetes_eda.ipynb
-----------
"""

df_2 = pandas.read_csv("diabetes-dataset-cleaned.csv", index_col=0)

X, y = df_2.drop(columns="diabetes"), df_2.loc[:, "diabetes"]

model = GradientBoostingClassifier()
oversample = SMOTE(sampling_strategy=0.1)
undersample = RandomUnderSampler(sampling_strategy=0.5)

resample_steps = [("o", oversample), ("u", undersample), ("model", model)]
pipeline = Pipeline(steps=resample_steps)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=40,
    shuffle=True
)

pipeline.fit(X_train, y_train)
predictions = pipeline.predict(X_test)

print(f"Accuracy score: {accuracy_score(y_test, predictions)}")
print(f"ROC AUC score: {roc_auc_score(y_test, predictions)}")

Accuracy score: 0.94848
ROC AUC score: 0.9003533509021234
