In [8]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score
from sklearn.tree import plot_tree
from sklearn.model_selection import cross_val_score, KFold
from timeit import default_timer as timer
import time
from statistics import *
from sklearn.metrics import matthews_corrcoef
import warnings
import math
warnings.filterwarnings('ignore')
import numpy as np

In [9]:
# Load the metaprotein count table from a local CSV file.
# NOTE(review): this is an absolute, machine-specific path — adjust it before
# running the notebook on another machine.
path = r"/Users/nasim/Desktop/data/Metaprotein_50.csv"
df = pd.read_csv(filepath_or_buffer=path)
# Preview the first five rows as the cell output.
df.head(5)

Unnamed: 0,25,36,81,128,171,237,1251,1254,2499,13,...,370,698,702,827,838,1227,1645,21,23,Patient_Type
0,151,6,28,21,15,15,8,6,30,38,...,2,379,167,26,50,6,42,82,0,C
1,41,10,52,37,21,16,11,4,99,23,...,5,306,136,8,7,3,26,30,4,C
2,23,19,37,13,7,8,5,1,108,1001,...,0,128,64,2,10,2,0,33,0,C
3,286,8,87,21,14,12,2,2,79,749,...,6,281,136,4,27,1,24,35,0,C
4,34,29,59,65,39,6,3,2,37,215,...,13,159,0,12,23,0,18,57,0,C


In [10]:
# Features are every column except the last one; the target is the last column
# (shown as Patient_Type in the df.head() preview).
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [11]:
# Arithmetic mean of a list of numbers
def average_list(lst):
    """Return the arithmetic mean of the values in ``lst``.

    Thin wrapper around ``mean`` (in scope through the notebook's
    ``from statistics import *``).
    """
    result = mean(lst)
    return result

In [12]:
# K-fold cross-validation of a decision tree (10 folds by default)
def create_dt_gini(criter='gini', n_splits=10, shuffle=False, random_state=None):
    """Cross-validate a ``DecisionTreeClassifier`` on the notebook-level ``X`` / ``y``.

    Despite the name, the split criterion is whatever ``criter`` says, so the
    same function serves both the Gini and the entropy experiments.

    Parameters
    ----------
    criter : str, default 'gini'
        Passed to ``DecisionTreeClassifier(criterion=...)``.
    n_splits : int, default 10
        Number of folds handed to ``KFold``.
    shuffle : bool, default False
        Whether ``KFold`` shuffles the rows before splitting. The default keeps
        the original behaviour (no shuffling), which means every call sees the
        same folds.
        NOTE(review): if the rows of ``X`` are ordered by class, unshuffled
        folds can be dominated by a single class — confirm and consider
        ``shuffle=True``.
    random_state : int or None, default None
        Seed for the tree and, when ``shuffle`` is True, for the fold
        shuffling. ``None`` keeps the original unseeded behaviour.

    Returns
    -------
    tuple
        ``(accuracy, generation_time, mcc_avg)`` where ``accuracy`` is the mean
        fold accuracy as a ``str``, ``generation_time`` is the elapsed time in
        seconds (timer difference) as a ``str``, and ``mcc_avg`` is the
        ``average_list`` of the per-fold Matthews correlation coefficients.
    """
    start = timer()

    # KFold rejects a random_state when shuffling is off, so only forward the
    # seed when it can actually take effect.
    kf = KFold(
        n_splits=n_splits,
        shuffle=shuffle,
        random_state=random_state if shuffle else None,
    )
    scores = []
    mcc_scores = []

    dt = DecisionTreeClassifier(criterion=criter, random_state=random_state)

    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # fit() re-trains the same estimator object on each fold's training rows
        dt.fit(X_train, y_train)

        y_pred = dt.predict(X_test)

        mcc_scores.append(matthews_corrcoef(y_test, y_pred))
        scores.append(dt.score(X_test, y_test))

    # Elapsed time covers fold construction, fitting, prediction and scoring
    delta = timer() - start

    # Accuracy and time are returned as strings, as existing callers expect
    accuracy = str(np.mean(scores))
    generation_time = str(delta)
    mcc_avg = average_list(mcc_scores)
    return accuracy, generation_time, mcc_avg

In [13]:
# Run the cross-validation 30 times with the Gini criterion and gather each
# run's (accuracy, time, MCC) triple.
runs = [create_dt_gini(criter="gini") for _ in range(30)]
accuracies = [run[0] for run in runs]
times = [run[1] for run in runs]
mcc_total = [run[2] for run in runs]

# create_dt_gini returns accuracy and time as strings, so cast everything to
# float before averaging (variable names kept as-is, including the spelling).
conveted_accuracies = list(map(float, accuracies))
converted_times = list(map(float, times))
converted_mcc = list(map(float, mcc_total))

# Average each metric over the 30 runs
avg_accuracy, avg_time, avg_mcc = (
    average_list(values)
    for values in (conveted_accuracies, converted_times, converted_mcc)
)

print('*'*50)
print("Evaluating for Gini Index")
print('Accuracy: {} %'.format(avg_accuracy * 100))
print('Mcc: {}'.format(avg_mcc))
print('Average generation time : {} sec'.format(avg_time))
print('*'*50)

**************************************************
Evaluating for Gini Index
Accuracy: 64.73333333333333 %
Mcc: 0.010212852078166442
Average generation time : 0.10021609583333296 sec
**************************************************


In [14]:
# Run the cross-validation 30 times with the entropy (information gain)
# criterion and gather each run's (accuracy, time, MCC) triple.
runs = [create_dt_gini(criter="entropy") for _ in range(30)]
accuracies = [run[0] for run in runs]
times = [run[1] for run in runs]
mcc_total = [run[2] for run in runs]

# create_dt_gini returns accuracy and time as strings, so cast everything to
# float before averaging (variable names kept as-is, including the spelling).
conveted_accuracies = list(map(float, accuracies))
converted_times = list(map(float, times))
converted_mcc = list(map(float, mcc_total))

# Average each metric over the 30 runs
avg_accuracy, avg_time, avg_mcc = (
    average_list(values)
    for values in (conveted_accuracies, converted_times, converted_mcc)
)

print('*'*50)
print("Evaluating for Information Gain")
print('Accuracy: {} %'.format(avg_accuracy * 100))
print('Mcc: {}'.format(avg_mcc))
print('Average generation time : {} sec'.format(avg_time))
print('*'*50)

**************************************************
Evaluating for Information Gain
Accuracy: 62.483333333333334 %
Mcc: 0.009986150808861966
Average generation time : 0.07932493206666796 sec
**************************************************
