In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score
from sklearn.tree import plot_tree
from sklearn.model_selection import cross_val_score, KFold
from timeit import default_timer as timer
import time
from statistics import *
from sklearn.metrics import matthews_corrcoef
import warnings
import math
warnings.filterwarnings('ignore')
import numpy as np

In [2]:
# Absolute location of the heart-failure CSV (machine-specific; adjust when
# running elsewhere).
path = r"/Users/nasim/Desktop/data/HeartFailurePrediction.csv"

# Read the whole file into a DataFrame, then preview its first five rows.
df = pd.read_csv(filepath_or_buffer=path)
df.head(n=5)

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [3]:
# Feature matrix: the twelve predictor columns, in the order listed.
X = df.loc[:, [
    'age',
    'anaemia',
    'creatinine_phosphokinase',
    'diabetes',
    'ejection_fraction',
    'high_blood_pressure',
    'platelets',
    'serum_creatinine',
    'serum_sodium',
    'sex',
    'smoking',
    'time',
]]

# Target vector: the DEATH_EVENT column.
y = df.loc[:, 'DEATH_EVENT']

In [4]:
# Helper: arithmetic mean of a collection of numbers
def average_list(lst):
    """Return the arithmetic mean of ``lst``, computed with ``statistics.mean``."""
    average = mean(lst)
    return average

In [5]:
# K-fold cross-validation of a decision tree (defaults: 20 folds, Gini criterion)
def create_dt_gini(criter='gini', n_splits=20, shuffle=False, random_state=None):
    """Cross-validate a ``DecisionTreeClassifier`` on the module-level ``X`` and ``y``.

    Despite the name, the split criterion is selected through ``criter``.

    Parameters
    ----------
    criter : str, default 'gini'
        Forwarded to ``DecisionTreeClassifier(criterion=...)``.
    n_splits : int, default 20
        Number of folds passed to ``KFold``.
    shuffle : bool, default False
        Forwarded to ``KFold``. The default keeps the original behaviour of
        splitting the rows in their existing order.
        NOTE(review): if the CSV rows are ordered (e.g. by ``time``), small
        unshuffled folds may be class-imbalanced -- consider ``shuffle=True``.
    random_state : int or None, default None
        Seed for the tree, and for ``KFold`` when ``shuffle`` is True. With a
        fixed seed, repeated calls use the same seeded configuration, so
        leave it as None when averaging over repeated runs.

    Returns
    -------
    tuple
        ``(accuracy, generation_time, mcc_avg)`` where ``accuracy`` is the
        mean fold accuracy as a ``str``, ``generation_time`` is the elapsed
        wall-clock seconds as a ``str``, and ``mcc_avg`` is the mean
        Matthews correlation coefficient over the folds (computed with
        ``average_list``).
    """
    start = timer()

    # KFold only accepts a seed when it is actually shuffling.
    kf = KFold(n_splits=n_splits, shuffle=shuffle,
               random_state=random_state if shuffle else None)
    dt = DecisionTreeClassifier(criterion=criter, random_state=random_state)

    scores = []
    mcc_scores = []

    # The same estimator object is re-fitted on each fold's training split.
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        dt.fit(X_train, y_train)
        y_pred = dt.predict(X_test)

        mcc_scores.append(matthews_corrcoef(y_test, y_pred))
        # Reuse the predictions instead of predicting again through dt.score().
        scores.append(accuracy_score(y_test, y_pred))

    # The measured interval covers splitting, fitting, predicting and scoring
    # for all folds, not tree construction alone.
    delta = timer() - start

    # Accuracy and time are returned as strings (callers convert with float()).
    accuracy = str(np.mean(scores))
    generation_time = str(delta)
    mcc_avg = average_list(mcc_scores)
    return accuracy, generation_time, mcc_avg

In [6]:
# Repeat the cross-validation 30 times with the Gini criterion and report the
# metrics averaged over the repetitions.
accuracies = []
times = []
mccs = []
for _ in range(30):
    # Criterion for this cell is 'gini'; keep the printed label below in sync
    # with it if the criterion is ever changed.
    acc, gtime, mcc_av = create_dt_gini(criter = "gini")
    accuracies.append(acc)
    times.append(gtime)
    mccs.append(mcc_av)

# create_dt_gini returns accuracy and time as strings, so convert to float
# before averaging.
conveted_accuracies = [float(x) for x in accuracies]
converted_times =  [float(x) for x in times]
converted_mccs = [float(x) for x in mccs]

avg_accuracy = average_list(conveted_accuracies)
avg_time = average_list(converted_times)
avg_mcc = average_list(converted_mccs)

print('*'*50)
# Label fixed: this cell runs the Gini criterion, not information gain.
print('Evaluating for gini')
print('Accuracy: {}'.format(avg_accuracy*100))
print('Mcc: {}'.format(avg_mcc))
print('Average generation time : {} sec'.format(avg_time))
print('*'*50)

**************************************************
Evaluating for information gain
Accuracy: 78.39682539682539
Mcc: 0.16295365677292056
Average generation time : 0.19536671266666705 sec
**************************************************


In [7]:
# Repeat the cross-validation 30 times with the entropy (information gain)
# criterion and report the metrics averaged over the repetitions.
accuracies = []
times = []
mccs = []
for _ in range(30):
    # Criterion for this cell is 'entropy' (information gain); keep the
    # printed label below in sync with it if the criterion is ever changed.
    acc, gtime, mcc_av = create_dt_gini(criter = "entropy")
    accuracies.append(acc)
    times.append(gtime)
    mccs.append(mcc_av)

# create_dt_gini returns accuracy and time as strings, so convert to float
# before averaging.
conveted_accuracies = [float(x) for x in accuracies]
converted_times =  [float(x) for x in times]
converted_mccs = [float(x) for x in mccs]

avg_accuracy = average_list(conveted_accuracies)
avg_time = average_list(converted_times)
avg_mcc = average_list(converted_mccs)

print('*'*50)
# Label fixed: this cell runs the entropy criterion, i.e. information gain.
print('Evaluating for information gain')
print('Accuracy: {}'.format(avg_accuracy * 100))
print('Mcc: {}'.format(avg_mcc))
print('Average generation time : {} sec'.format(avg_time))
print('*'*50)

**************************************************
Evaluating for gini
Accuracy: 76.4984126984127
Mcc: 0.10966634018559643
Average generation time : 0.18223392336666605 sec
**************************************************


In [10]:
# Hand-computed Matthews correlation coefficient from hard-coded counts:
# (sum of three counts) * 38 minus a sum of pairwise products, divided by the
# product of two square roots of 38**2 minus sums of squared counts.
numerator = sum((13, 10, 9)) * 38 - sum(a * b for a, b in ((13, 13), (16, 10), (15, 9)))
denominator = (
    math.sqrt(38 ** 2 - sum(v ** 2 for v in (13, 10, 15)))
    * math.sqrt(38 ** 2 - sum(v ** 2 for v in (9, 16, 13)))
)
mcc_test = numerator / denominator
print(mcc_test)

0.7966262606577601


In [12]:
# Binary sanity check for matthews_corrcoef: eight positives followed by four
# negatives, with two positives and one negative misclassified.
y_true = [1] * 8 + [0] * 4
y_pred = [0, 0] + [1] * 6 + [0, 0, 0, 1]
matthews_corrcoef(y_true, y_pred)

0.47809144373375745

In [18]:
# Second check: three classes, each true class predicted once as every class.
y_true = [label for label in (1, 2, 3) for _ in range(3)]
y_pred = [1, 2, 3] * 3
matthews_corrcoef(y_true, y_pred)

0.0

In [19]:
# Confusion matrix for whatever y_true / y_pred are currently bound at module level.
confusion_matrix(y_true=y_true, y_pred=y_pred)

array([[1, 1, 1],
       [1, 1, 1],
       [1, 1, 1]])