# Diabetes Outcome

Using:
- logistic regression
- SVM
- Decision Tree
- Random Forest
- XGBoost

In [510]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

np.random.seed(42)

In [511]:
def eval_metric(model, X_train, y_train, X_test, y_test):
    y_train_pred = model.predict(X_train)
    y_pred = model.predict(X_test)
    
    print("Test_Set")
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print()
    print("Train_Set")
    print(confusion_matrix(y_train, y_train_pred))
    print(classification_report(y_train, y_train_pred))

In [512]:
df = pd.read_csv('diabetes.csv')
df.head(5).T

Unnamed: 0,0,1,2,3,4
Pregnancies,6.0,1.0,8.0,1.0,0.0
Glucose,148.0,85.0,183.0,89.0,137.0
BloodPressure,72.0,66.0,64.0,66.0,40.0
SkinThickness,35.0,29.0,0.0,23.0,35.0
Insulin,0.0,0.0,0.0,94.0,168.0
BMI,33.6,26.6,23.3,28.1,43.1
DiabetesPedigreeFunction,0.627,0.351,0.672,0.167,2.288
Age,50.0,31.0,32.0,21.0,33.0
Outcome,1.0,0.0,1.0,0.0,1.0


In [513]:
X = df.drop('Outcome', axis=1)
y = df['Outcome']

In [514]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [515]:
imputer = SimpleImputer(strategy='mean')
scaler = StandardScaler()

X_train_imputed = imputer.fit_transform(X_train)
X_train_scaled = scaler.fit_transform(X_train_imputed)

X_test_imputed = imputer.transform(X_test)
X_test_scaled = scaler.transform(X_test_imputed)

## Logistic Regression

In [516]:
from sklearn.linear_model import LogisticRegression

In [517]:
lr_param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga'],
    'max_iter': [1000]
}

lr_grid_search = GridSearchCV(
    LogisticRegression(random_state=42),
    lr_param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

lr_grid_search.fit(X_train, y_train)
eval_metric(lr_grid_search, X_train, y_train, X_test, y_test)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Test_Set
[[82 18]
 [26 28]]
              precision    recall  f1-score   support

           0       0.76      0.82      0.79       100
           1       0.61      0.52      0.56        54

    accuracy                           0.71       154
   macro avg       0.68      0.67      0.67       154
weighted avg       0.71      0.71      0.71       154


Train_Set
[[359  41]
 [ 86 128]]
              precision    recall  f1-score   support

           0       0.81      0.90      0.85       400
           1       0.76      0.60      0.67       214

    accuracy                           0.79       614
   macro avg       0.78      0.75      0.76       614
weighted avg       0.79      0.79      0.79       614



In [518]:
lr_grid_search.fit(X_train_scaled, y_train)
lr_prediction = lr_model.predict(X_test_scaled)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


## SVM

In [519]:
from sklearn.svm import SVC

In [520]:
svm_param_grid = {
    'C': [1, 10],                    
    'kernel': ['rbf'],               
    'gamma': ['scale', 'auto']       
}

svm_grid_search = GridSearchCV(
    SVC(random_state=42),
    svm_param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

In [521]:
svm_grid_search.fit(X_train, y_train)
eval_metric(svm_grid_search, X_train, y_train, X_test, y_test)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Test_Set
[[85 15]
 [28 26]]
              precision    recall  f1-score   support

           0       0.75      0.85      0.80       100
           1       0.63      0.48      0.55        54

    accuracy                           0.72       154
   macro avg       0.69      0.67      0.67       154
weighted avg       0.71      0.72      0.71       154


Train_Set
[[365  35]
 [ 94 120]]
              precision    recall  f1-score   support

           0       0.80      0.91      0.85       400
           1       0.77      0.56      0.65       214

    accuracy                           0.79       614
   macro avg       0.78      0.74      0.75       614
weighted avg       0.79      0.79      0.78       614



## Decision Tree

In [522]:
from sklearn.tree import DecisionTreeClassifier

In [523]:
dt_model = DecisionTreeClassifier(
    max_depth=5,                
    min_samples_leaf=5,         
    random_state=42
)

In [524]:
dt_model.fit(X_train_scaled, y_train)
eval_metric(dt_model, X_train, y_train, X_test, y_test)

Test_Set
[[99  1]
 [54  0]]
              precision    recall  f1-score   support

           0       0.65      0.99      0.78       100
           1       0.00      0.00      0.00        54

    accuracy                           0.64       154
   macro avg       0.32      0.49      0.39       154
weighted avg       0.42      0.64      0.51       154


Train_Set
[[398   2]
 [212   2]]
              precision    recall  f1-score   support

           0       0.65      0.99      0.79       400
           1       0.50      0.01      0.02       214

    accuracy                           0.65       614
   macro avg       0.58      0.50      0.40       614
weighted avg       0.60      0.65      0.52       614



## Random Forest

In [525]:
from sklearn.ensemble import RandomForestClassifier

In [526]:
rf_model = RandomForestClassifier(
    n_estimators=100,
    oob_score=True,           
    random_state=42
)

In [527]:
rf_model.fit(X_train_scaled, y_train)
eval_metric(rf_model, X_train, y_train, X_test, y_test)

Test_Set
[[  0 100]
 [  0  54]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       100
           1       0.35      1.00      0.52        54

    accuracy                           0.35       154
   macro avg       0.18      0.50      0.26       154
weighted avg       0.12      0.35      0.18       154


Train_Set
[[  0 400]
 [  0 214]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       400
           1       0.35      1.00      0.52       214

    accuracy                           0.35       614
   macro avg       0.17      0.50      0.26       614
weighted avg       0.12      0.35      0.18       614



## XGBoost

In [528]:
#!pip install xgboost
from xgboost import XGBClassifier

In [529]:
X_train_val, X_val, y_train_val, y_val = train_test_split(
    X_train_scaled, y_train, 
    test_size=0.2,  
    random_state=42,
)

xgb_model = XGBClassifier(
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
    early_stopping_rounds=20
)

In [530]:
xgb_model.fit(
    X_train_val, 
    y_train_val,
    eval_set=[(X_val, y_val)],
    verbose=False
)
eval_metric(xgb_model, X_train, y_train, X_test, y_test)

Test_Set
[[  0 100]
 [  0  54]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       100
           1       0.35      1.00      0.52        54

    accuracy                           0.35       154
   macro avg       0.18      0.50      0.26       154
weighted avg       0.12      0.35      0.18       154


Train_Set
[[  0 400]
 [  0 214]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       400
           1       0.35      1.00      0.52       214

    accuracy                           0.35       614
   macro avg       0.17      0.50      0.26       614
weighted avg       0.12      0.35      0.18       614



