In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from sklearn.utils import resample
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn import model_selection
from sklearn import metrics
from sklearn.decomposition import PCA
from scipy.stats import zscore
import matplotlib.pyplot as plt 

In [2]:
colnames = ['preg', 'glu', 'bp', 'sft', 'ins', 'bmi', 'dpf', 'age', 'outcome']
prima_df = pd.read_csv("prima-indians-diabetes.data",names=colnames)

In [3]:
X=prima_df[['preg', 'glu', 'bp', 'sft', 'ins', 'bmi', 'dpf', 'age']]
Y=prima_df['outcome']

In [16]:
sc=StandardScaler()
X=sc.fit_transform(X)

In [6]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier

In [17]:
LR=LogisticRegression()
NB=GaussianNB()
bag_LR=BaggingClassifier(base_estimator=LR,n_estimators=10,random_state=0)
boost_LR=AdaBoostClassifier(base_estimator=LR,n_estimators=300,random_state=0)
#boost_bag_LR=AdaBoostClassifier(base_estimator=bag_LR,n_estimators=100,random_state=0)
boost_NB=AdaBoostClassifier(base_estimator=NB,n_estimators=100,random_state=0)
gboost=GradientBoostingClassifier(n_estimators=120,random_state=0)

In [8]:
boost_DT = AdaBoostClassifier(n_estimators=100,random_state=0)

In [9]:
RF=RandomForestClassifier(n_estimators=100,criterion='entropy',random_state=0)

In [10]:
boost_RF=AdaBoostClassifier(base_estimator=RF,n_estimators=100,random_state=0)

In [18]:
from sklearn.model_selection import KFold
from sklearn.metrics import roc_curve, auc
kf=KFold(n_splits=5,shuffle=True,random_state=2)
for model, name in zip([LR,bag_LR,boost_LR,NB,boost_NB,boost_DT,RF,boost_RF,gboost],['Logistic','BaggedLR','BoostedLR','NB','BoostedNB','BoostedDT','RF','BoostedRF','GradientBoost']):
    roc_auc=[]
    for train,test in kf.split(X,Y):
        Xtrain,Xtest=X[train,:],X[test,:]
        Ytrain,Ytest=Y[train],Y[test]
        model.fit(Xtrain,Ytrain)
        Y_predict=model.predict(Xtest)
        cm=metrics.confusion_matrix(Ytest,Y_predict)
        fpr,tpr, _ = roc_curve(Ytest,Y_predict)
        roc_auc.append(auc(fpr, tpr))
    print("AUC scores: %0.02f (+/- %0.5f) [%s]" % (np.mean(roc_auc), np.var(roc_auc,ddof=1), name ))   
    

AUC scores: 0.72 (+/- 0.00093) [Logistic]
AUC scores: 0.72 (+/- 0.00048) [BaggedLR]
AUC scores: 0.72 (+/- 0.00086) [BoostedLR]
AUC scores: 0.72 (+/- 0.00219) [NB]
AUC scores: 0.55 (+/- 0.00342) [BoostedNB]
AUC scores: 0.70 (+/- 0.00325) [BoostedDT]
AUC scores: 0.72 (+/- 0.00032) [RF]
AUC scores: 0.72 (+/- 0.00135) [BoostedRF]
AUC scores: 0.72 (+/- 0.00128) [GradientBoost]


In [11]:
stacked = VotingClassifier(estimators = [('Boosted_LR',boost_LR),('RF', base_rf), ('Boosted_DT', boost_dt)],voting='soft')

In [12]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [13]:
from sklearn.model_selection import KFold
kf=KFold(n_splits=5,shuffle=True,random_state=2)
for model, name in zip([boost_LR,base_rf,boost_dt,gb_model,stacked], ['BoostLR','RF','BoostedDT','GradientBoost','stacked']):
    k=0
    recall=np.zeros((2,5))
    prec=np.zeros((2,5))
    fscore=np.zeros((2,5))
    for train,test in kf.split(X,Y):
        Xtrain,Xtest=X[train,:],X[test,:]
        Ytrain,Ytest=Y[train],Y[test]
        model.fit(Xtrain,Ytrain)
        Y_predict=model.predict(Xtest)
        cm=metrics.confusion_matrix(Ytest,Y_predict)
        for i in np.arange(0,2):
            recall[i,k]=cm[i,i]/cm[i,:].sum()
        for i in np.arange(0,2):
            prec[i,k]=cm[i,i]/cm[:,i].sum()
        k=k+1
    for row in np.arange(0,2):
        for col in np.arange(0,5):
            fscore[row,col]=2*(recall[row,col]*prec[row,col])/(recall[row,col]+prec[row,col])
    print("f1_weighted for Healthy: %0.02f (+/- %0.5f) [%s]" % (np.mean(fscore[0,:]), np.var(fscore[0,:],ddof=1), name ))   
    print("f1_weighted for Diabetic: %0.02f (+/- %0.5f) [%s]" % (np.mean(fscore[1,:]), np.var(fscore[1,:],ddof=1), name ))   
    

f1_weighted for Healthy: 0.83 (+/- 0.00048) [BoostLR]
f1_weighted for Diabetic: 0.62 (+/- 0.00222) [BoostLR]
f1_weighted for Healthy: 0.81 (+/- 0.00029) [RF]
f1_weighted for Diabetic: 0.62 (+/- 0.00171) [RF]
f1_weighted for Healthy: 0.81 (+/- 0.00028) [BoostedDT]
f1_weighted for Diabetic: 0.60 (+/- 0.00683) [BoostedDT]
f1_weighted for Healthy: 0.82 (+/- 0.00037) [GradientBoost]
f1_weighted for Diabetic: 0.62 (+/- 0.00248) [GradientBoost]
f1_weighted for Healthy: 0.81 (+/- 0.00020) [stacked]
f1_weighted for Diabetic: 0.62 (+/- 0.00166) [stacked]


In [14]:
from sklearn.model_selection import KFold
from sklearn.metrics import roc_curve, auc
kf=KFold(n_splits=5,shuffle=True,random_state=2)
for model, name in zip([boost_LR,base_rf,boost_dt,gb_model,stacked], ['BoostLR','RF','BoostedDT','GradientBoost','stacked']):
    roc_auc=[]
    for train,test in kf.split(X,Y):
        Xtrain,Xtest=X[train,:],X[test,:]
        Ytrain,Ytest=Y[train],Y[test]
        model.fit(Xtrain,Ytrain)
        Y_predict=model.predict(Xtest)
        cm=metrics.confusion_matrix(Ytest,Y_predict)
        fpr,tpr, _ = roc_curve(Ytest,Y_predict)
        roc_auc.append(auc(fpr, tpr))
    print("AUC scores: %0.02f (+/- %0.5f) [%s]" % (np.mean(roc_auc), np.var(roc_auc,ddof=1), name ))   
    

AUC scores: 0.72 (+/- 0.00086) [BoostLR]
AUC scores: 0.71 (+/- 0.00047) [RF]
AUC scores: 0.70 (+/- 0.00229) [BoostedDT]
AUC scores: 0.71 (+/- 0.00093) [GradientBoost]
AUC scores: 0.71 (+/- 0.00042) [stacked]
