In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import classification_report,confusion_matrix
%matplotlib inline

In [2]:
# Load the dataset from the working directory; the 'Outcome' column is used as
# the class label further down and every other column is treated as a predictor.
data = pd.read_csv('diabetes_selected.csv')

In [3]:
from sklearn.preprocessing import StandardScaler
# Standardise every predictor column (everything except the 'Outcome' label).
# NOTE(review): the scaler is fitted on all rows before cross-validation, so the
# validation folds contribute to the scaling statistics. Wrap scaler + model in a
# Pipeline if strictly leak-free estimates are required.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(data.drop('Outcome', axis=1))

In [4]:
# Feature matrix (standardised predictors) and target vector (the 'Outcome' column).
X, y = scaled_features, data['Outcome']

In [5]:
# Number of folds passed as `cv` to every cross_val_score / cross_val_predict call below.
k = 5

## SVC

In [6]:
from sklearn.svm import SVC

In [7]:
from sklearn.model_selection import cross_val_score, cross_val_predict

In [8]:
# RBF-kernel support vector classifier using the hyperparameters recorded as
# best: 'C': 10, 'gamma': 0.01, 'kernel': 'rbf'.
# NOTE(review): the search that selected these values is not part of this
# notebook — confirm where they came from.
svc_model = SVC(C=10, gamma=0.01, kernel='rbf')

In [9]:
# Out-of-fold prediction for every sample (feeds the report / confusion matrix),
# then per-fold accuracy and its mean expressed as a percentage.
predictions = cross_val_predict(estimator=svc_model, X=X, y=y, cv=k)
score = cross_val_score(estimator=svc_model, X=X, y=y, cv=k, scoring='accuracy')
accuracy = 100 * score.mean()

In [10]:
# Per-fold accuracy of the SVC (one value per cross-validation fold).
print(score)

[0.75974026 0.72727273 0.77272727 0.79084967 0.76470588]


In [11]:
# SVC summary: mean CV accuracy (%), per-class precision/recall/F1, and the
# confusion matrix, all computed from the out-of-fold predictions.
print(
    "SVC accuracy: {}".format(accuracy),
    classification_report(y, predictions),
    confusion_matrix(y, predictions),
    sep="\n",
)

SVC accuracy: 76.3059163059163
             precision    recall  f1-score   support

          0       0.78      0.88      0.83       500
          1       0.71      0.54      0.62       268

avg / total       0.76      0.76      0.75       768

[[440  60]
 [122 146]]


## KNN Classifier

In [12]:
from sklearn.neighbors import KNeighborsClassifier

In [13]:
# k-nearest-neighbours classifier that votes among the 7 closest samples.
# NOTE(review): n_neighbors=7 was presumably chosen by a tuning step not shown
# in this notebook — confirm.
knn = KNeighborsClassifier(n_neighbors=7)

In [14]:
# Out-of-fold predictions for KNN, followed by per-fold accuracy and its mean in percent.
predictions = cross_val_predict(estimator=knn, X=X, y=y, cv=k)
score = cross_val_score(estimator=knn, X=X, y=y, cv=k, scoring='accuracy')
accuracy = 100 * score.mean()

In [15]:
# Per-fold accuracy of the KNN classifier.
print(score)

[0.75324675 0.73376623 0.78571429 0.79738562 0.76470588]


In [16]:
# KNN summary: mean CV accuracy (%), classification report and confusion matrix.
print(
    "KNN accuracy: {}".format(accuracy),
    classification_report(y, predictions),
    confusion_matrix(y, predictions),
    sep="\n",
)

KNN accuracy: 76.69637551990492
             precision    recall  f1-score   support

          0       0.81      0.85      0.83       500
          1       0.68      0.62      0.65       268

avg / total       0.76      0.77      0.76       768

[[423  77]
 [102 166]]


## Decision Tree Classifier

In [17]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree with library-default hyperparameters and a fixed seed.
# Tree construction involves randomness (feature permutation / tie-breaking
# between equally good splits), and cross_val_score and cross_val_predict each
# refit the model. Without a seed the printed accuracy and the confusion matrix
# would come from different trees and would change on every run.
dtree = DecisionTreeClassifier(random_state=42)

In [18]:
# Out-of-fold predictions for the decision tree, then per-fold accuracy and its mean in percent.
predictions = cross_val_predict(estimator=dtree, X=X, y=y, cv=k)
score = cross_val_score(estimator=dtree, X=X, y=y, cv=k, scoring='accuracy')
accuracy = 100 * score.mean()

In [19]:
# Per-fold accuracy of the decision tree.
print(score)

[0.72727273 0.62337662 0.63636364 0.75816993 0.74509804]


In [20]:
# Decision tree summary: mean CV accuracy (%), classification report and confusion matrix.
print(
    "Decision Tree accuracy: {}".format(accuracy),
    classification_report(y, predictions),
    confusion_matrix(y, predictions),
    sep="\n",
)

Decision Tree accuracy: 69.80561921738392
             precision    recall  f1-score   support

          0       0.78      0.78      0.78       500
          1       0.59      0.60      0.59       268

avg / total       0.72      0.72      0.72       768

[[390 110]
 [108 160]]


## Random Forest Classifier

In [21]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 150 trees with a fixed seed.
# Bootstrap sampling and per-split feature selection are random, and
# cross_val_score and cross_val_predict each refit the model. Without a seed the
# printed accuracy and the confusion matrix would come from different forests
# and would change on every run.
rfc = RandomForestClassifier(n_estimators=150, random_state=42)

In [22]:
# Out-of-fold predictions for the random forest, then per-fold accuracy and its mean in percent.
predictions = cross_val_predict(estimator=rfc, X=X, y=y, cv=k)
score = cross_val_score(estimator=rfc, X=X, y=y, cv=k, scoring='accuracy')
accuracy = 100 * score.mean()

In [23]:
# Per-fold accuracy of the random forest.
print(score)

[0.77272727 0.75324675 0.76623377 0.84313725 0.75163399]


In [24]:
# Random forest summary: mean CV accuracy (%), classification report and confusion matrix.
print(
    "Random Forest accuracy: {}".format(accuracy),
    classification_report(y, predictions),
    confusion_matrix(y, predictions),
    sep="\n",
)

Random Forest accuracy: 77.73958068075714
             precision    recall  f1-score   support

          0       0.80      0.87      0.84       500
          1       0.72      0.60      0.66       268

avg / total       0.77      0.78      0.77       768

[[436  64]
 [106 162]]


## Logistic Regression

In [25]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with the library's default hyperparameters.
# NOTE(review): the defaults (e.g. solver) differ between scikit-learn releases,
# so scores may shift slightly across environments — pin them if exact
# reproducibility matters.
logmodel = LogisticRegression()

In [26]:
# Out-of-fold predictions for logistic regression, then per-fold accuracy and its mean in percent.
predictions = cross_val_predict(estimator=logmodel, X=X, y=y, cv=k)
score = cross_val_score(estimator=logmodel, X=X, y=y, cv=k, scoring='accuracy')
accuracy = 100 * score.mean()

In [27]:
# Per-fold accuracy of the logistic regression model.
print(score)

[0.75974026 0.74025974 0.78571429 0.79084967 0.76470588]


In [28]:
# Logistic regression summary: mean CV accuracy (%), classification report and confusion matrix.
print(
    "Logistic Regression accuracy: {}".format(accuracy),
    classification_report(y, predictions),
    confusion_matrix(y, predictions),
    sep="\n",
)

Logistic Regression accuracy: 76.82539682539682
             precision    recall  f1-score   support

          0       0.79      0.88      0.83       500
          1       0.71      0.56      0.63       268

avg / total       0.76      0.77      0.76       768

[[439  61]
 [117 151]]


## Gaussian Naive Bayes

In [29]:
from sklearn.naive_bayes import GaussianNB
# Gaussian Naive Bayes classifier with the library's default settings.
gnbmodel = GaussianNB()

In [30]:
# Out-of-fold predictions for Gaussian Naive Bayes, then per-fold accuracy and its mean in percent.
predictions = cross_val_predict(estimator=gnbmodel, X=X, y=y, cv=k)
score = cross_val_score(estimator=gnbmodel, X=X, y=y, cv=k, scoring='accuracy')
accuracy = 100 * score.mean()

In [31]:
# Per-fold accuracy of the Gaussian Naive Bayes model.
print(score)

[0.76623377 0.7012987  0.75974026 0.80392157 0.76470588]


In [32]:
# Gaussian Naive Bayes summary: mean CV accuracy (%), per-class
# precision/recall/F1, and the confusion matrix from the out-of-fold predictions.
# The heading names the model evaluated in this section (gnbmodel) so the
# output is not mistaken for the logistic regression results.
print("Gaussian Naive Bayes accuracy: {}".format(accuracy))
print(classification_report(y, predictions))
print(confusion_matrix(y, predictions))

Logistic Regression accuracy: 75.91800356506239
             precision    recall  f1-score   support

          0       0.80      0.83      0.82       500
          1       0.67      0.62      0.64       268

avg / total       0.76      0.76      0.76       768

[[416  84]
 [101 167]]
