In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import metrics

from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
%matplotlib inline

# Classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

np.random.seed(2017)  # seed to shuffle the train set


In [2]:
# read the data in
df = pd.read_csv("data/Diabetes.csv")

X = df.iloc[:,0:8] # independent variables
y = df['class'].values     # dependent variables

In [3]:
from sklearn.model_selection import StratifiedKFold
#Normalize
X = StandardScaler().fit_transform(X)
# evaluate the model by splitting into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2017)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [6]:
num_trees = 10
verbose = True # to print the progress

clfs = [KNeighborsClassifier(),
        RandomForestClassifier(n_estimators=num_trees, random_state=2017),
        GradientBoostingClassifier(n_estimators=num_trees, random_state=2017)]

# Creating train and test sets for blending
dataset_blend_train = np.zeros((X_train.shape[0], len(clfs)))
dataset_blend_test = np.zeros((X_test.shape[0], len(clfs)))

print('5-fold cross validation:\n')
for i, clf in enumerate(clfs):   
    scores = cross_val_score(clf, X_train, y_train, cv=kfold, scoring='accuracy')
    print("##### Base Model %0.0f #####" % i)
    print("Train CV Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))
    clf.fit(X_train, y_train)   
    print("Train Accuracy: %0.2f " % (metrics.accuracy_score(clf.predict(X_train), y_train)))
    dataset_blend_train[:,i] = clf.predict_proba(X_train)[:, 1]
    dataset_blend_test[:,i] = clf.predict_proba(X_test)[:, 1]
    print("Test Accuracy: %0.2f " % (metrics.accuracy_score(clf.predict(X_test), y_test)))

5-fold cross validation:

##### Base Model 0 #####
Train CV Accuracy: 0.72 (+/- 0.02)
Train Accuracy: 0.82 
Test Accuracy: 0.78 
##### Base Model 1 #####
Train CV Accuracy: 0.71 (+/- 0.01)
Train Accuracy: 0.98 
Test Accuracy: 0.81 
##### Base Model 2 #####
Train CV Accuracy: 0.74 (+/- 0.01)
Train Accuracy: 0.79 
Test Accuracy: 0.82 


In [7]:
print ("##### Meta Model #####")
clf = LogisticRegression()
scores = cross_val_score(clf, dataset_blend_train, y_train, cv=kfold, scoring='accuracy')
clf.fit(dataset_blend_train, y_train)
print("Train CV Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))
print("Train Accuracy: %0.2f " % (metrics.accuracy_score(clf.predict(dataset_blend_train), y_train)))
print("Test Accuracy: %0.2f " % (metrics.accuracy_score(clf.predict(dataset_blend_test), y_test))) 

##### Meta Model #####
Train CV Accuracy: 0.98 (+/- 0.00)
Train Accuracy: 0.98 
Test Accuracy: 0.81 
