In [7]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score
from sklearn.tree import plot_tree
from sklearn.model_selection import cross_val_score, KFold
from timeit import default_timer as timer
import time
from statistics import *
from sklearn.metrics import matthews_corrcoef
import warnings
import math
warnings.filterwarnings('ignore')
import numpy as np

In [8]:
# Load the health-indicator survey data from a CSV located in the working
# directory, then preview the first five rows as the cell output.
path = "diabetes_health_indicators.csv"
df = pd.read_csv(filepath_or_buffer=path)
df.head(n=5)

Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [9]:
# Positional split: every column except the last becomes the feature matrix X,
# and the last column becomes the target vector y.
# NOTE(review): in the preview above the last column is `Income`, while
# `Diabetes_012` stays among the features -- confirm that Income is really the
# intended prediction target.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [10]:
# Helper: arithmetic mean of a sequence of numbers
def average_list(lst):
    """Return the arithmetic mean of the values in ``lst``.

    Thin wrapper around ``statistics.mean``; like that function it raises
    ``StatisticsError`` when ``lst`` is empty.
    """
    result = mean(lst)
    return result

In [11]:
# K-fold cross validation of a decision tree (10 folds unless n_splits is given)
def create_dt(criter='gini', n_splits=10):
    """Cross-validate a decision tree on the notebook-level ``X`` and ``y``.

    One ``DecisionTreeClassifier`` is created and re-fitted on every fold
    produced by ``KFold(n_splits=n_splits)``. The splitter is created without
    ``shuffle``, so repeated calls iterate over the same folds.

    Parameters
    ----------
    criter : str
        Split criterion passed to ``DecisionTreeClassifier`` ('gini' for the
        Gini index, 'entropy' for information gain).
    n_splits : int
        Number of cross-validation folds. Defaults to 10, the value that was
        previously hard-coded.

    Returns
    -------
    tuple
        ``(accuracy, generation_time, mcc_avg)`` where ``accuracy`` is the mean
        fold accuracy as a string, ``generation_time`` is the elapsed
        wall-clock time in seconds as a string, and ``mcc_avg`` is the mean
        Matthews correlation coefficient over the folds as returned by
        ``average_list``. The first two are strings because existing callers
        convert them with ``float()``.

    Notes
    -----
    The timed region covers the whole cross-validation loop (fitting,
    prediction and metric computation), not only tree construction.
    """
    start = timer()
    kf = KFold(n_splits=n_splits)
    scores = []
    mcc_scores = []

    dt = DecisionTreeClassifier(criterion=criter)

    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        dt.fit(X_train, y_train)
        y_pred = dt.predict(X_test)

        mcc_scores.append(matthews_corrcoef(y_test, y_pred))
        # Reuse y_pred for accuracy: dt.score() would run predict() on the
        # same fold a second time inside the timed region.
        scores.append(accuracy_score(y_test, y_pred))

    elapsed = timer() - start

    accuracy = str(np.mean(scores))
    generation_time = str(elapsed)
    mcc_avg = average_list(mcc_scores)
    return accuracy, generation_time, mcc_avg

In [12]:
# Run the cross-validated decision tree 30 times with the Gini index criterion
# and report the averaged metrics. (Pass criter="entropy" to create_dt to
# evaluate information gain instead.)
accuracies, times, mcc_total = [], [], []
for _ in range(30):
    acc, gtime, mcc_av = create_dt(criter="gini")
    accuracies += [acc]
    times += [gtime]
    mcc_total += [mcc_av]

# create_dt hands back accuracy and time as strings, so coerce every series
# to float before averaging.
conveted_accuracies = list(map(float, accuracies))
converted_times = list(map(float, times))
converted_mcc = list(map(float, mcc_total))

avg_accuracy, avg_time, avg_mcc = (
    average_list(series)
    for series in (conveted_accuracies, converted_times, converted_mcc)
)

print('*'*50)
print("Evaluating for Gini Index")
print('Accuracy: {} %'.format(avg_accuracy * 100))
print('Mcc: {}'.format(avg_mcc))
print('Average generation time : {} sec'.format(avg_time))
print('*'*50)

**************************************************
Evaluating for Gini Index
Accuracy: 28.386366025438875 %
Mcc: 0.10741120926447141
Average generation time : 24.062977612833336 sec
**************************************************


In [13]:
# Run the cross-validated decision tree 30 times with the entropy criterion
# (information gain) and report the averaged metrics. (Pass criter="gini" to
# create_dt to evaluate the Gini index instead.)
accuracies, times, mcc_total = [], [], []
for _ in range(30):
    acc, gtime, mcc_av = create_dt(criter="entropy")
    accuracies += [acc]
    times += [gtime]
    mcc_total += [mcc_av]

# create_dt hands back accuracy and time as strings, so coerce every series
# to float before averaging.
conveted_accuracies = list(map(float, accuracies))
converted_times = list(map(float, times))
converted_mcc = list(map(float, mcc_total))

avg_accuracy, avg_time, avg_mcc = (
    average_list(series)
    for series in (conveted_accuracies, converted_times, converted_mcc)
)

print('*'*50)
print("Evaluating for Information Gain")
print('Accuracy: {} %'.format(avg_accuracy * 100))
print('Mcc: {}'.format(avg_mcc))
print('Average generation time : {} sec'.format(avg_time))
print('*'*50)


**************************************************
Evaluating for Information Gain
Accuracy: 28.417822979081258 %
Mcc: 0.10767734378187956
Average generation time : 21.784416107900007 sec
**************************************************
