In [49]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score
from sklearn.tree import plot_tree
from sklearn.model_selection import cross_val_score, KFold
from timeit import default_timer as timer
import time
from statistics import *
from sklearn.metrics import matthews_corrcoef
import warnings
import math
warnings.filterwarnings('ignore')
import numpy as np
import statistics

In [50]:
path = r"/Users/nasim/Desktop/data/Flu_Classification.csv"
df = pd.read_csv(path)
df.head()

Unnamed: 0,Age,Temperature,Sex,Diarrhea,Fever,Coughing,ShortnessOfBreath,SoreThroat,NauseaVomitting,Fatigue,Cancer,Diagnosis
0,67.0,38.11,F,unknown,Yes,Yes,unknown,No,unknown,No,unknown,H1N1
1,29.0,0.0,M,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,H1N1
2,22.0,0.0,F,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,H1N1
3,20.0,36.56,F,unknown,Yes,Yes,unknown,No,unknown,Yes,unknown,H1N1
4,21.0,0.0,M,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,H1N1


In [51]:
df = df.astype(str)

In [52]:
df.Diagnosis = df.Diagnosis.astype("category").cat.codes
df.Sex = df.Sex.astype("category").cat.codes
df.Diarrhea = df.Diarrhea.astype("category").cat.codes
df.Fever = df.Fever.astype("category").cat.codes
df.Coughing = df.Coughing.astype("category").cat.codes
df.ShortnessOfBreath = df.ShortnessOfBreath.astype("category").cat.codes
df.SoreThroat = df.SoreThroat.astype("category").cat.codes
df.NauseaVomitting = df.NauseaVomitting.astype("category").cat.codes
df.Fatigue = df.Fatigue.astype("category").cat.codes
df.Cancer = df.Cancer.astype("category").cat.codes

In [53]:
df.dtypes

Age                  object
Temperature          object
Sex                    int8
Diarrhea               int8
Fever                  int8
Coughing               int8
ShortnessOfBreath      int8
SoreThroat             int8
NauseaVomitting        int8
Fatigue                int8
Cancer                 int8
Diagnosis              int8
dtype: object

In [54]:
X = df[["Age", "Temperature", "Sex", "Diarrhea", "Fever", "Coughing", "ShortnessOfBreath", "SoreThroat", "NauseaVomitting", "Fatigue", "Cancer"]]
y = df['Diagnosis']

In [55]:
# Python program to get average of a list
def average_list(lst):
    return mean(lst)

In [56]:
def create_dt_gini(criter='gini'):
    a = timer()
    
    kf = KFold(n_splits=20)
    scores = []
    mcc_scores = []
    
    dt = DecisionTreeClassifier(criterion=criter)
    
    for train_index, test_index in kf.split(X):
        #print("Train index: {0}, \nTest index: {1}".format(train_index, test_index))
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        dt.fit(X_train, y_train)
        
        y_pred = dt.predict(X_test)
        
        mcc_scores.append(matthews_corrcoef(y_test, y_pred))
        
        scores.append(dt.score(X_test, y_test))
        
    b = timer()
    

    delta = b - a
    
    accuracy = str(np.mean(scores))
    generation_time = str(delta)
    mcc_avg = average_list(mcc_scores)
    return accuracy,generation_time, mcc_avg

In [57]:
accuracies = []
times = []
mccs = []
for _ in range(30):
    acc, gtime, mcc_avg = create_dt_gini(criter='gini')
    accuracies.append(acc)
    times.append(gtime)
    mccs.append(mcc_avg)

    
conveted_accuracies = [float(x) for x in accuracies]
converted_times =  [float(x) for x in times]
converted_mccs = [float(x) for x in mccs]

avg_accuracy = average_list(conveted_accuracies)
avg_time = average_list(converted_times)
avg_mcc = average_list(converted_mccs)

print('*'*50)
print('Evaluating for Gini Index')
print('Accuracy: {}'.format(avg_accuracy*100))
print('Mcc: {}'.format(avg_mcc))
print('Average generation time : {} sec'.format(avg_time))
print('*'*50)

**************************************************
Evaluating for Gini Index
Accuracy: 86.27438438438439
Mcc: 0.0318246069611921
Average generation time : 0.21288455336667386 sec
**************************************************


In [58]:
accuracies = []
times = []
mccs = []
for _ in range(30):
    acc, gtime, mcc_avg = create_dt_gini(criter='entropy')
    accuracies.append(acc)
    times.append(gtime)
    mccs.append(mcc_avg)

    
conveted_accuracies = [float(x) for x in accuracies]
converted_times =  [float(x) for x in times]
converted_mccs = [float(x) for x in mccs]

avg_accuracy = average_list(conveted_accuracies)
avg_time = average_list(converted_times)
avg_mcc = average_list(converted_mccs)

print('*'*50)
print('Evaluating for information gain')
print('Accuracy: {}'.format(avg_accuracy*100))
print('Mcc: {}'.format(avg_mcc))
print('Average generation time : {} sec'.format(avg_time))
print('*'*50)

**************************************************
Evaluating for information gain
Accuracy: 86.40984984984983
Mcc: 0.031040048245806594
Average generation time : 0.2417791695999919 sec
**************************************************
