In [4]:
import pandas as pd
import pylab as plt
import numpy as np
from sklearn.base import clone
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn import metrics

In [5]:
# Read the diabetes dataset from the local data folder and preview the
# first five rows (head() returns five rows by default).
diabetes_csv = "data/Diabetes.csv"
df = pd.read_csv(diabetes_csv)
df.head()

Unnamed: 0,preg_count,glucose_concentration,blood_pressure,skin_thickness,serum_insulin,bmi,pedigree_function,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [6]:
# Pick the predictor columns by name instead of by position. The previewed
# header starts with an 'Unnamed: 0' column; if that is a real column, a
# positional `iloc[:, :8]` feeds that row counter to the model and leaves
# out `age`. For a frame holding only the eight predictors plus `class`,
# this selects exactly the same columns as the positional slice did.
feature_cols = [col for col in df.columns
                if col != 'class' and not str(col).startswith('Unnamed')]
X = df[feature_cols].values     # independent variables
y = df['class'].values          # dependent variable

In [7]:
# Standardise the feature matrix column by column. The fitted scaler is
# kept in `sc` so it remains available to later cells.
sc = StandardScaler()
X = sc.fit_transform(X)

In [8]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=2017,
)

In [9]:
# Bundle scaling and classification in one estimator. The feature matrix
# was standardised earlier using statistics from ALL rows, which lets
# information from the held-out rows leak into preprocessing. With the
# scaler inside the pipeline, every fit (this one and each fold refit by
# cross-validation) learns its scaling statistics only from the rows it
# is trained on. Standardising already-standardised data again is
# harmless: the result equals scaling the raw data with training-only
# statistics.
clf = make_pipeline(StandardScaler(), LogisticRegression(random_state=2017))
clf.fit(X_train, y_train)

In [11]:
# Report the classifier's mean accuracy on the training split and on the
# held-out split, in that order.
for label, features, target in (
    ("Train Score: ", X_train, y_train),
    ("Test Score: ", X_test, y_test),
):
    print(label, clf.score(features, target))

Train Score:  0.7728119180633147
Test Score:  0.7922077922077922


***Now let's try 5-fold cross-validation to check whether the accuracy holds up under a more rigorous evaluation***

In [14]:
# 5-fold cross-validated accuracy, computed separately on the training
# split and on the held-out split. Each call refits clones of `clf` on the
# folds of the data it is given, so the "test" figures come from models
# trained on folds of the held-out split, not from the model fitted above.
# The metric is accuracy (scoring='accuracy'), so the labels say
# "Accuracy" rather than "AUC".
train_scores = cross_val_score(clf, X_train, y_train, scoring='accuracy', cv=5)
test_scores = cross_val_score(clf, X_test, y_test, scoring='accuracy', cv=5)
print("Train Fold Accuracy Scores: ", train_scores)
print("Train CV Accuracy Score: ", train_scores.mean())

print("\nTest Fold Accuracy Scores: ", test_scores)
print("Test CV Accuracy Score: ", test_scores.mean())

Train Fold AUC Scores:  [0.80555556 0.73148148 0.81308411 0.76635514 0.71028037]
Train CV AUC Score:  0.7653513326410523

Test Fold AUC Scores:  [0.85106383 0.7826087  0.80434783 0.76086957 0.7826087 ]
Test CV AUC Score:  0.7962997224791859


***Stratified k-fold cross-validation***

In [15]:
from sklearn.model_selection import StratifiedKFold

In [17]:
# Five shuffled, stratified folds; the seed makes the fold assignment
# repeatable.
stratified_kfold = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=2017,
)

In [21]:
# Manual stratified 5-fold cross-validation over the training split.
# Each fold fits a fresh, unfitted copy of `clf` (same hyper-parameters),
# so the estimator trained earlier on the whole training split is not
# silently replaced by one trained on only the last fold's subset.
train_scores = []
test_scores = []
for k, (train, test) in enumerate(stratified_kfold.split(X_train, y_train)):
    fold_clf = clone(clf)
    fold_clf.fit(X_train[train], y_train[train])

    # Accuracy on the rows used for fitting vs. the rows held out in this fold.
    train_score = fold_clf.score(X_train[train], y_train[train])
    test_score = fold_clf.score(X_train[test], y_train[test])
    train_scores.append(train_score)
    test_scores.append(test_score)

    # np.bincount shows how many samples of each class the fold trained on.
    print('Fold: %s, Class dist.: %s, Train Acc: %.3f, Test Acc: %.3f'
          % (k+1, np.bincount(y_train[train]), train_score, test_score))

print('\nTrain CV accuracy: %.3f' % (np.mean(train_scores)))
print('Test CV accuracy: %.3f' % (np.mean(test_scores)))

Fold: 1, Class dist.: [277 152], Train Acc: 0.767, Test Acc: 0.778
Fold: 2, Class dist.: [277 152], Train Acc: 0.774, Test Acc: 0.796
Fold: 3, Class dist.: [278 152], Train Acc: 0.777, Test Acc: 0.729
Fold: 4, Class dist.: [278 152], Train Acc: 0.767, Test Acc: 0.794
Fold: 5, Class dist.: [278 152], Train Acc: 0.779, Test Acc: 0.729

Train CV accuracy: 0.773
Test CV accuracy: 0.765
