In [70]:
#Importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

In [None]:
#Importing imputed dataset
diabetes = pd.read_csv('/content/drive/MyDrive/data/Imputed Dataset.csv', index_col=0)
diabetes.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,148.0,72.0,35.0,218.903553,33.6,0.627,50.0,1.0
1,1.0,85.0,66.0,29.0,70.314661,26.6,0.351,31.0,0.0
2,8.0,183.0,64.0,21.542781,268.507178,23.3,0.672,32.0,1.0
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0.0
4,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0,1.0


## Logistic Regression

In [None]:
#Defining variables
X = diabetes[diabetes.columns[:-1]]
y = diabetes['Outcome']

In [None]:
#Importing logistic regression library
from sklearn.linear_model import LogisticRegression

#Importing library for train and test set
from sklearn.model_selection import train_test_split

#Defining train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=0, stratify = y)

In [None]:
#Standardizing train and test split
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
#Represent model
diabetes_logit = LogisticRegression(solver='lbfgs', random_state=0, max_iter=10000)

#Fit model
diabetes_logit.fit(X_train, y_train)

In [None]:
#Accuracy for both train and test set
print(f'Train accuracy: {diabetes_logit.score(X_train, y_train)}')
print(f'Test Accuracy: {diabetes_logit.score(X_test, y_test)}')

Train accuracy: 0.7760416666666666
Test Accuracy: 0.7552083333333334


In [None]:
#Checking predictions of model
logit_pred = diabetes_logit.predict(X_test)
logit_pred

array([0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 1., 1., 0., 1., 0.,
       0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
       1., 1., 0., 0., 0., 0., 0., 1., 1., 1., 0., 1., 0., 0., 1., 1., 0.,
       1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       1., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0.,
       1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0.,
       1., 0., 1., 0., 1., 1., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0.,
       0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 1., 1., 0., 0., 1.,
       1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 0.,
       0., 1., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 1.])

In [None]:
#Classification Report
from sklearn.metrics import classification_report
print(classification_report(logit_pred, y_test))

              precision    recall  f1-score   support

         0.0       0.87      0.78      0.82       140
         1.0       0.54      0.69      0.61        52

    accuracy                           0.76       192
   macro avg       0.70      0.74      0.71       192
weighted avg       0.78      0.76      0.76       192



## Decision Trees

In [None]:
#Import decision tree library
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

In [None]:
#Represent model
tree_model = tree.DecisionTreeClassifier(random_state=0)

#Fit model
tree_model = tree_model.fit(X_train, y_train)

In [None]:
#Defining prediction for initial model
tree_pred = tree_model.predict(X_test)
tree_pred

array([0., 0., 1., 0., 0., 1., 1., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0.,
       0., 1., 0., 0., 0., 0., 0., 1., 1., 1., 0., 1., 1., 0., 1., 1., 0.,
       0., 1., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 1.,
       1., 1., 1., 0., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0.,
       0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 1., 0., 1., 1., 0.,
       0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 1., 0., 0., 1., 1., 0., 1.,
       0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0.,
       0., 1., 1., 1., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1.,
       1., 0., 0., 0., 0.])

In [None]:
#Score model
print(f'Train accuracy: {tree_model.score(X_train, y_train)}')
print(f'Test Accuracy: {tree_model.score(X_test, y_test)}')

Train accuracy: 1.0
Test Accuracy: 0.6875


In [None]:
#Importing classification report
print(classification_report(y_test, tree_pred))

              precision    recall  f1-score   support

         0.0       0.75      0.78      0.76       125
         1.0       0.56      0.52      0.54        67

    accuracy                           0.69       192
   macro avg       0.65      0.65      0.65       192
weighted avg       0.68      0.69      0.69       192



## Random Forest

In [None]:
#Importing libraries for Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold
from sklearn.metrics import accuracy_score, classification_report

In [None]:
#Represent model
rf_model = RandomForestClassifier(random_state=0)

#Fit model
rf_model.fit(X_train, y_train)

#Prediction Accuracy
rf_pred = rf_model.predict(X_test)

test_accuracy = accuracy_score(y_test, rf_pred)
print(f"Test Accuracy: {test_accuracy:.4f}")

Test Accuracy: 0.7604
