In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn

# Import necessary modules
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn import model_selection
# from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import LeavePOut
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import StratifiedKFold

In [2]:
# Read the diabetes dataset from its CSV file (the path is relative to the
# notebook's working directory) and print a structural summary of the frame:
# column names, non-null counts, dtypes and memory usage.
df = pd.read_csv(filepath_or_buffer='datasets/diabetes.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [3]:
# Feature matrix: every column except the 'Outcome' label, as a NumPy array.
x = df.drop(columns='Outcome').to_numpy()
# Target vector: the 'Outcome' column, as a NumPy array.
y = df['Outcome'].to_numpy()

In [4]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

In [5]:
# Hold-out evaluation: one 70/30 train/test split. Each classifier is fitted on
# the training part and scored on the held-out part; `score` returns the mean
# accuracy, which is printed as a percentage.
# X_train / X_test / Y_train / Y_test are reused later by the ensemble cell.
print("Hold-Out 70-30")
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    x, y, test_size=0.30, random_state=100
)

# The tree is seeded with the same value as the split. Per the scikit-learn
# documentation, features are randomly permuted at every split and ties between
# equally good splits are broken at random, so an unseeded tree can report a
# different accuracy on each run even though the split itself is fixed.
model_dt = DecisionTreeClassifier(random_state=100)
model_dt.fit(X_train, Y_train)
result_dt = model_dt.score(X_test, Y_test)
print("Accuracy of Decision Tree: %.2f%%" % (result_dt*100.0))

# SVM classifier with scikit-learn's default settings.
model_svm = SVC()
model_svm.fit(X_train, Y_train)
result_svm = model_svm.score(X_test, Y_test)
print("Accuracy of SVM: %.2f%%" % (result_svm*100.0))

Hold-Out 70-30
Accuracy of Decision Tree: 67.10%
Accuracy of SVM: 75.76%


In [6]:
# 10-fold cross-validation: each model is scored on every fold by
# cross_val_score, and the mean of the per-fold scores is printed as a percentage.
print("10 Fold Cross Validation")

# Seeded so repeated runs give the same tree scores: an unseeded
# DecisionTreeClassifier breaks ties between equally good splits at random.
model_dt = DecisionTreeClassifier(random_state=100)
model_svm = SVC()

# `kfold` is also reused later by the ensemble cell; shuffling is left at the
# KFold default.
kfold = KFold(n_splits=10)

results_dt_kfold = model_selection.cross_val_score(model_dt, x, y, cv=kfold)
results_svm_kfold = model_selection.cross_val_score(model_svm, x, y, cv=kfold)
print("Accuracy of Decision Tree: %.2f%%" % (results_dt_kfold.mean()*100.0))
print("Accuracy of SVM %.2f%%" % (results_svm_kfold.mean()*100.0))

10 Fold Cross Validation
Accuracy of Decision Tree: 69.26%
Accuracy of SVM 76.04%


In [7]:
# Stratified 3-fold cross-validation: each model is scored on every fold by
# cross_val_score, and the mean of the per-fold scores is printed as a percentage.
print("stratified 3-Fold")

# Seeded so repeated runs give the same tree scores: an unseeded
# DecisionTreeClassifier breaks ties between equally good splits at random.
model_dt = DecisionTreeClassifier(random_state=100)
model_svm = SVC()

# `skfold` is also reused later by the ensemble cell.
skfold = StratifiedKFold(n_splits=3)

results_dt_skfold = model_selection.cross_val_score(model_dt, x, y, cv=skfold)
results_svm_skfold = model_selection.cross_val_score(model_svm, x, y, cv=skfold)
print("Accuracy of Decision Tree: %.2f%%" % (results_dt_skfold.mean()*100.0))
print("Accuracy of SVM %.2f%%" % (results_svm_skfold.mean()*100.0))

stratified 3-Fold
Accuracy of Decision Tree: 68.49%
Accuracy of SVM 75.00%


In [8]:
# Leave-one-out cross-validation: every sample is held out once as a
# single-element test set, so each model is refitted once per row of `x`.
# The mean of the per-sample scores is printed as a percentage.
print("Leave One Out Cross Validation")

# Seeded so repeated runs give the same tree scores: an unseeded
# DecisionTreeClassifier breaks ties between equally good splits at random.
model_dt = DecisionTreeClassifier(random_state=100)
model_svm = SVC()

# `loocv` is also reused later by the ensemble cell.
loocv = model_selection.LeaveOneOut()

results_dt_loocv = model_selection.cross_val_score(model_dt, x, y, cv=loocv)
results_svm_loocv = model_selection.cross_val_score(model_svm, x, y, cv=loocv)
print("Accuracy of Decision Tree: %.2f%%" % (results_dt_loocv.mean()*100.0))
print("Accuracy of SVM %.2f%%" % (results_svm_loocv.mean()*100.0))

Leave One Out Cross Validation
Accuracy of Decision Tree: 70.31%
Accuracy of SVM 76.30%


In [9]:
# Repeated random sub-sampling: ten independent random 70/30 splits. Each model
# is scored on every split by cross_val_score, and the mean of the per-split
# scores is printed as a percentage.
print("Repeated Random Sub-sampling(10 splits in 70-30) ")

# The splitter below is already seeded; the tree is seeded with the same value
# so the whole cell is repeatable. An unseeded DecisionTreeClassifier breaks
# ties between equally good splits at random.
model_dt = DecisionTreeClassifier(random_state=100)
model_svm = SVC()

# `kfold2` is also reused later by the ensemble cell.
kfold2 = ShuffleSplit(n_splits=10, test_size=0.30, random_state=100)

results_dt_shufflecv = model_selection.cross_val_score(model_dt, x, y, cv=kfold2)
results_svm_shufflecv = model_selection.cross_val_score(model_svm, x, y, cv=kfold2)
print("Accuracy of Decision Tree: %.2f%%" % (results_dt_shufflecv.mean()*100.0))
print("Accuracy of SVM %.2f%%" % (results_svm_shufflecv.mean()*100.0))

Repeated Random Sub-sampling(10 splits in 70-30) 
Accuracy of Decision Tree: 70.04%
Accuracy of SVM 74.37%


In [10]:
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Hard-voting (majority rule) ensemble of a decision tree, an SVM and a
# k-nearest-neighbours classifier, evaluated with the same protocols used for
# the individual models.
#
# The tree and SVM members are created here rather than taken from whatever
# `model_dt` / `model_svm` were last bound to by an earlier cell, so the
# ensemble's make-up does not depend on which cell happened to run most
# recently. The tree member is seeded so repeated runs report the same accuracy.
model_knn = KNeighborsClassifier()
vc = VotingClassifier(
    estimators=[
        ('dtree', DecisionTreeClassifier(random_state=100)),
        ('svm', SVC()),
        ("knn", model_knn),
    ],
    voting='hard',  # the default, spelled out for the reader
)
print("Ensemble of Decision Tree, SVM and KNN")

# Hold-out: reuses the 70/30 split (X_train, X_test, Y_train, Y_test) produced
# by the hold-out cell.
print("\nHold-Out 70-30")
vc.fit(X_train, Y_train)
predictions = vc.predict(X_test)
print("Accuracy: %.2f%%" % (accuracy_score(Y_test, predictions)*100))

# Cross-validation: reuses the splitter objects (kfold, skfold, loocv, kfold2)
# defined by the earlier cross-validation cells; each reported figure is the
# mean of the per-split scores.
print("\n10 Fold Cross Validation")
results_vc_kfold = model_selection.cross_val_score(vc, x, y, cv=kfold)
print("Accuracy: %.2f%%" % (results_vc_kfold.mean()*100.0))

print("\nstratified 3-Fold")
results_vc_skfold = model_selection.cross_val_score(vc, x, y, cv=skfold)
print("Accuracy: %.2f%%" % (results_vc_skfold.mean()*100.0))

# Leave-one-out refits the three-member ensemble once per sample.
print("\nLeave One Out Cross Validation")
results_vc_loocv = model_selection.cross_val_score(vc, x, y, cv=loocv)
print("Accuracy: %.2f%%" % (results_vc_loocv.mean()*100.0))

print("\nRepeated Random Sub-sampling(10 splits in 70-30) ")
results_vc_kfold2 = model_selection.cross_val_score(vc, x, y, cv=kfold2)
print("Accuracy: %.2f%%" % (results_vc_kfold2.mean()*100.0))

Ensemble of Decision Tree, SVM and KNN

Hold-Out 70-30
Accuracy: 76.19%

10 Fold Cross Validation
Accuracy: 75.39%

stratified 3-Fold
Accuracy: 73.57%

Leave One Out Cross Validation
Accuracy: 74.35%

Repeated Random Sub-sampling(10 splits in 70-30) 
Accuracy: 73.64%
