### Decision Tree Algorithms on Credit Card Fraud Detection

In [2]:
import numpy as np  
import matplotlib.pyplot as plt  
import pandas as pd  
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split 
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix 
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
import collections
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

# NOTE: absolute, machine-specific path; point DATA_PATH at your own copy of
# creditcard.csv before running this notebook.
DATA_PATH = "/Users/navnigupta/Downloads/creditcard.csv"

df = pd.read_csv(DATA_PATH)
# Every column except the last is treated as a feature; the last column is the label.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
X.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99


### Implementing Under Sampling

The dataset is highly imbalanced, so we first apply random under-sampling. Under-sampling keeps the minority-class samples and randomly discards majority-class samples until both classes contain as many samples as the minority class has in the original dataset.<br>
There are two classes, fraudulent and non-fraudulent: fraudulent is the minority class (label 1) and non-fraudulent is the majority class (label 0).


In [2]:
# Randomly drop majority-class rows until both classes are the same size (seeded).
rus = RandomUnderSampler(random_state=0)
X_resampled_U, y_resampled_U = rus.fit_resample(X, y)
# Re-assemble features and label into one frame so the C4.5 code below can
# treat the last column as the class label.
X_resampled_df1=pd.DataFrame(X_resampled_U)
y_resampled_df1=pd.DataFrame(y_resampled_U)
# Gives the label column a name distinct from the feature columns before the
# concat; presumably the sampler returned unnamed arrays here — TODO confirm.
y_resampled_df1=y_resampled_df1.rename(columns={0:30})
df_new1=pd.concat([X_resampled_df1, y_resampled_df1], axis=1)
# Class balance after resampling.
print(sorted(collections.Counter(y_resampled_U).items()))
# Restore the original column names (this overwrites whatever names the concat produced).
df_new1.columns=df.keys()
df_new1.head()

[(0, 492), (1, 492)]


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,162183.0,2.049094,0.186189,-1.707198,0.530768,0.160589,-1.44857,0.23931,-0.353611,0.634425,...,0.197782,0.741141,-0.009744,-0.085057,0.228384,-0.097292,-0.001028,-0.03239,2.99,0
1,120967.0,2.12554,-0.030714,-1.527653,0.121046,0.543172,-0.347988,0.157221,-0.229126,0.477999,...,-0.336497,-0.838932,0.275173,0.049145,-0.156765,0.205919,-0.072321,-0.059009,1.98,0
2,26484.0,-4.155859,-5.705748,0.274699,-0.993262,-6.059393,5.210848,5.811316,0.367888,1.75071,...,1.371671,1.195815,4.188762,-1.091077,1.033044,0.224493,-0.486741,0.194275,1937.66,0
3,65751.0,-0.56642,-0.579576,0.823503,-1.45124,-0.583587,0.206381,1.601392,-0.370446,-1.910354,...,-0.065082,-0.761357,0.641524,-0.568974,-0.053164,-0.690995,-0.22863,-0.157254,320.05,0
4,137025.0,0.060858,-0.261762,-1.699493,-1.202327,3.699527,3.196249,0.437208,0.421541,0.492435,...,0.008303,0.534602,0.089602,0.667918,0.017798,0.611584,-0.469946,-0.51437,11.5,0


In [3]:
# Under-sampled feature matrix with the original feature names, used by the CART cells.
X_new=pd.DataFrame(X_resampled_U,columns=X.keys())
X_new.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,162183.0,2.049094,0.186189,-1.707198,0.530768,0.160589,-1.44857,0.23931,-0.353611,0.634425,...,-0.232666,0.197782,0.741141,-0.009744,-0.085057,0.228384,-0.097292,-0.001028,-0.03239,2.99
1,120967.0,2.12554,-0.030714,-1.527653,0.121046,0.543172,-0.347988,0.157221,-0.229126,0.477999,...,-0.125804,-0.336497,-0.838932,0.275173,0.049145,-0.156765,0.205919,-0.072321,-0.059009,1.98
2,26484.0,-4.155859,-5.705748,0.274699,-0.993262,-6.059393,5.210848,5.811316,0.367888,1.75071,...,3.944592,1.371671,1.195815,4.188762,-1.091077,1.033044,0.224493,-0.486741,0.194275,1937.66
3,65751.0,-0.56642,-0.579576,0.823503,-1.45124,-0.583587,0.206381,1.601392,-0.370446,-1.910354,...,0.977739,-0.065082,-0.761357,0.641524,-0.568974,-0.053164,-0.690995,-0.22863,-0.157254,320.05
4,137025.0,0.060858,-0.261762,-1.699493,-1.202327,3.699527,3.196249,0.437208,0.421541,0.492435,...,-0.01455,0.008303,0.534602,0.089602,0.667918,0.017798,0.611584,-0.469946,-0.51437,11.5


### Splitting data into test and train data

In [4]:
# Hold out 33% of the under-sampled data for testing (seeded for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X_new, y_resampled_U, test_size=0.33, random_state=42)

### CART Implementation

In [5]:
# Sweep max_depth over 1..19 and remember the depth with the best test accuracy.
# NOTE(review): picking max_depth on the test split leaks test information into
# model selection; a validation split or cross-validation would be sounder.
max_test_score = 0
max_i = None
related_train_score = None
best_clf_dt = None
MaxDepth = []
TestScore = []
TrainScore = []
for i in range(1, 20):
    MaxDepth.append(i)
    # Fixed seed: ties between equally good splits are otherwise broken at
    # random, so the reported best depth could change from run to run.
    clf_dt = DecisionTreeClassifier(max_depth=i, random_state=0)
    clf_dt.fit(X_train, y_train)
    train_score = clf_dt.score(X_train, y_train)
    test_score = clf_dt.score(X_test, y_test)
    TestScore.append(test_score)
    TrainScore.append(train_score)
    # Strict '>' keeps the shallowest depth among equal test scores.
    if test_score > max_test_score:
        related_train_score = train_score
        max_test_score = test_score
        max_i = i
        best_clf_dt = clf_dt
print("depth: ", max_i, "train: ", related_train_score)
print("depth: ", max_i, "test: ", max_test_score)

depth:  3 train:  0.9484066767830045
depth:  3 test:  0.9261538461538461


In [6]:
# Only the non-default hyper-parameters are spelled out. The removed arguments
# were all library defaults, and `presort` no longer exists in current
# scikit-learn releases (passing it raises TypeError).
clf_gini = DecisionTreeClassifier(criterion='gini', max_depth=5, min_samples_leaf=5, random_state=100)
clf_gini.fit(X_train, y_train)
print("The train score is",clf_gini.score(X_train, y_train))

The train score is 0.9590288315629742


In [7]:
# Predict labels for the held-out test split with the fitted Gini tree.
y_pred_en_gini = clf_gini.predict(X_test)
y_pred_en_gini

array([1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0,

In [8]:
# Confusion matrix and per-class precision/recall/F1 for the Gini tree on the test split.
print(confusion_matrix(y_test, y_pred_en_gini))  
print("\t\tClassification Report for CART")
print(classification_report(y_test, y_pred_en_gini)) 

[[152  12]
 [ 16 145]]
		Classification Report for CART
              precision    recall  f1-score   support

           0       0.90      0.93      0.92       164
           1       0.92      0.90      0.91       161

   micro avg       0.91      0.91      0.91       325
   macro avg       0.91      0.91      0.91       325
weighted avg       0.91      0.91      0.91       325



### C4.5 Implementation

In [9]:
import random
# Parent handed to the root call of decision_tree_algorithm; the driver loop
# below re-binds it to None for every fold.
parent_node = None
# the node class that will make up the tree
class decisionTreeNode():
    """Node of the binary decision tree built by ``decision_tree_algorithm``.

    Attributes:
        is_leaf_node: True when the node carries a class label instead of a split.
        classification: class label predicted by a leaf (None for internal nodes).
        attribute_split: name of the feature tested at an internal node; always
            starts as None and is filled in by the tree builder.
        attribute_split_value: threshold compared against ``attribute_split``.
        parent: parent node, or None for the root.
        left_child: subtree for rows with ``feature <= threshold``.
        right_child: subtree for rows with ``feature > threshold``.
        height: depth of the node as recorded by the tree builder.
    """

    def __init__(self, is_leaf_node, classification, attribute_split_value, parent, left_child, right_child, height):
        # The constructor now stores the arguments it receives; previously every
        # argument except `parent` was silently discarded.
        self.classification = classification
        self.attribute_split = None
        self.attribute_split_value = attribute_split_value
        self.parent = parent
        self.left_child = left_child
        self.right_child = right_child
        self.height = height
        self.is_leaf_node = is_leaf_node



#Split the data based on the feature and a value to data above and data below
def split_data(data, split_column, split_value):
    """Partition the rows of a 2-D array around a threshold on one column.

    Args:
        data: 2-D numpy array.
        split_column: index of the column to test.
        split_value: threshold.

    Returns:
        ``(data_below, data_above)``: rows whose column value is
        ``<= split_value`` and rows whose value is ``> split_value``.
    """
    column = data[:, split_column]
    at_or_below = column <= split_value
    strictly_above = column > split_value
    return data[at_or_below], data[strictly_above]

#Get all the boundary values for each features (Key is feature and values are the splits)
def get_potential_splits(data):
    """Collect candidate thresholds for every feature column.

    The last column of ``data`` is the label and is skipped. For each feature
    column the candidates are the midpoints between consecutive distinct
    values; a column with a single distinct value contributes that value itself.

    Returns:
        dict mapping column index -> list of candidate split values.
    """
    _, n_columns = data.shape
    potential_splits = {}
    for column_index in range(n_columns - 1):
        distinct = np.unique(data[:, column_index])
        if distinct.shape[0] == 1:
            # Constant column: the only available "threshold" is the value itself.
            candidates = [distinct[0]]
        else:
            # Midpoints of neighbouring distinct values (np.unique returns them sorted).
            candidates = list((distinct[1:] + distinct[:-1]) / 2)
        potential_splits[column_index] = candidates
    return potential_splits

#Calculates Entropy of the data given
def calculate_entropy(data):
    """Return the base-2 Shannon entropy of the labels in the last column of ``data``."""
    labels = data[:, -1]
    class_counts = np.unique(labels, return_counts=True)[1]
    probabilities = class_counts / class_counts.sum()
    surprisal = -np.log2(probabilities)
    return sum(probabilities * surprisal)

#Calculates the entropy of data below and data above
def calculate_overall_entropy(data_below, data_above):
    """Return the size-weighted average entropy of the two partitions of a split."""
    n_below, n_above = len(data_below), len(data_above)
    total = n_below + n_above
    weight_below = n_below / total
    weight_above = n_above / total
    return (weight_below * calculate_entropy(data_below)
            + weight_above * calculate_entropy(data_above))

#Check if all data is of same class
def check_purity(data):
    """Return True when every row of ``data`` carries the same label (last column)."""
    distinct_labels = np.unique(data[:, -1])
    return len(distinct_labels) == 1

#Classify data based on majority
def classify_data(data):
    """Return the most frequent label in the last column of ``data``.

    Ties go to the smallest label, because ``np.unique`` sorts the labels and
    ``argmax`` returns the first maximum.
    """
    labels, frequencies = np.unique(data[:, -1], return_counts=True)
    return labels[frequencies.argmax()]

#Gives the best feature and its split value after checking all features based on gain ratio
def determine_best_split(data, potential_splits):
    """Pick the (column, threshold) pair with the highest gain ratio.

    Args:
        data: 2-D numpy array whose last column is the label.
        potential_splits: dict of column index -> candidate thresholds, as
            produced by ``get_potential_splits``.

    Returns:
        ``(best_split_column, best_split_value)``. Among equal gain ratios the
        candidate visited last wins, because the comparison is ``>=``.
    """
    parent_entropy = calculate_entropy(data)
    best_ratio = -1.0
    for column_index, candidate_values in potential_splits.items():
        for value in candidate_values:
            data_below, data_above = split_data(data, split_column=column_index, split_value=value)
            information_gain = parent_entropy - calculate_overall_entropy(data_below, data_above)
            split_info = splitting_information(data_below, data_above)
            # A split that leaves one side empty has zero split information;
            # score it 0 rather than dividing by zero.
            gain_ratio = float(information_gain / split_info) if split_info != 0 else 0

            if gain_ratio >= best_ratio:
                best_ratio = gain_ratio
                best_split_column = column_index
                best_split_value = value

    return best_split_column, best_split_value

#Calculates the splitting Info of data above and below for that boundary value
def splitting_information(data_below, data_above):
    """Return the split information (entropy of the partition sizes) of a binary split.

    Args:
        data_below: rows on the ``<=`` side of the split.
        data_above: rows on the ``>`` side of the split.

    Returns:
        ``-p_below*log2(p_below) - p_above*log2(p_above)``, or ``0.0`` when one
        (or both) of the partitions is empty.
    """
    n_below = len(data_below)
    n_above = len(data_above)
    # A one-sided split carries no split information. The previous code reached
    # 0 only because log2(1) == 0 hid a missing minus sign, and it raised
    # ZeroDivisionError when both partitions were empty.
    if n_below == 0 or n_above == 0:
        return 0.0

    n = n_below + n_above
    p_data_below = n_below / n
    p_data_above = n_above / n
    return -p_data_below * np.log2(p_data_below) - p_data_above * np.log2(p_data_above)

def decision_tree_algorithm(df, parent_node,counter=0, min_samples=3):
    """Recursively build a gain-ratio decision tree and return its root node.

    Args:
        df: on the first call (``counter == 0``) a pandas DataFrame whose last
            column is the label; on recursive calls the numpy array of the rows
            that reached this node.
        parent_node: node this subtree hangs from, or None for the root.
        counter: recursion marker; 0 means "first call, ``df`` is a DataFrame".
        min_samples: partitions with fewer rows than this become leaves.

    Returns:
        A ``decisionTreeNode``: a leaf carrying ``classification``, or an
        internal node carrying ``attribute_split`` / ``attribute_split_value``
        and two children.

    Side effects:
        The first call stores ``df.columns`` in the module-level
        ``COLUMN_HEADERS`` so recursive calls can translate column indices
        back to feature names.
    """
    global COLUMN_HEADERS

    node = decisionTreeNode(True, None, None, parent_node, None, None, 0)

    # data preparations
    if counter == 0:
        COLUMN_HEADERS = df.columns
        data = df.values
    else:
        data = df

    # base cases: pure partition, or too few rows to split further
    if check_purity(data) or len(data) < min_samples:
        node.is_leaf_node = True
        node.classification = classify_data(data)
        return node

    # recursive part
    counter += 1
    potential_splits = get_potential_splits(data)
    split_column, split_value = determine_best_split(data, potential_splits)
    data_below, data_above = split_data(data, split_column, split_value)

    # Guard against a degenerate split (all rows on one side). This happens
    # when the winning column is constant, e.g. rows with identical features
    # but conflicting labels. Recursing would either never terminate (one child
    # receives the very same data) or crash in classify_data on the empty side,
    # so fall back to a majority-vote leaf.
    if len(data_below) == 0 or len(data_above) == 0:
        node.is_leaf_node = True
        node.classification = classify_data(data)
        return node

    node.is_leaf_node = False
    feature_name = COLUMN_HEADERS[split_column]

    if parent_node is None:
        node.height = 0
    else:
        node.parent = parent_node
        node.height = node.parent.height + 1

    node.attribute_split = feature_name
    node.attribute_split_value = split_value

    # find answers (recursion)
    node.left_child = decision_tree_algorithm(data_below,node, counter, min_samples)
    node.right_child = decision_tree_algorithm(data_above,node, counter, min_samples)

    return node

def get_paths(root, path, pathlen,all_paths,val):
    """Flatten the tree into a list of root-to-leaf rules.

    Each rule is a list of condition strings of the form
    ``row['<feature>'] <= <value>`` / ``row['<feature>'] > <value>`` followed
    by the leaf's class label as the last element. The strings are later
    evaluated with ``eval`` against a variable literally named ``row``.

    Args:
        root: current node (None is ignored).
        path: conditions accumulated from the tree root down to ``root``;
            mutated during the walk and restored before returning.
        pathlen: number of entries in ``path`` on entry.
        all_paths: list that receives a copy of every completed rule.
        val: comparison operator text used for this node's condition when it
            is first appended.

    Returns:
        ``all_paths``, or None when ``root`` is None.
    """
    if root is None:
        return

    if root.is_leaf_node == True:
        step = root.classification
    else:
        step = "row['" + root.attribute_split + "']" + val + str(root.attribute_split_value)
    path.append(step)
    pathlen += 1

    if root.left_child is None and root.right_child is None:
        # Leaf reached: snapshot the finished rule, then backtrack.
        all_paths.append(list(path))
        path.pop()
        return all_paths

    get_paths(root.left_child, path, pathlen, all_paths, ' <= ')
    # Flip this node's own condition to the '>' form before descending right.
    path[pathlen - 1] = "row['" + root.attribute_split + "']" + ' > ' + str(root.attribute_split_value)
    get_paths(root.right_child, path, pathlen, all_paths, ' <= ')
    path.pop()
    return all_paths

def classify_test_data(root,data):
    """Predict a label for every row of ``data`` by walking the tree.

    Args:
        root: root node of a fitted tree.
        data: DataFrame whose last column is the label; that column is dropped
            before prediction.

    Returns:
        List with one predicted label per row, in row order.
    """
    features = data.iloc[:, :-1]
    predictions = []
    for _, sample in features.iterrows():
        node = root
        # Descend until a leaf: left for '<=' the threshold, right otherwise.
        while node.is_leaf_node != True:
            if sample.loc[node.attribute_split] <= node.attribute_split_value:
                node = node.left_child
            else:
                node = node.right_child
        predictions.append(node.classification)
    return predictions

def calc_accuracy_rule(rule,test):
    """Return the fraction of rows in ``test`` that satisfy every condition of ``rule``.

    Args:
        rule: list of condition strings followed by a class label (the last
            element is not evaluated).
        test: DataFrame of rows to check.

    Note:
        The conditions are run through ``eval`` and refer to a variable
        literally named ``row``, so the loop variable must keep that name.
        Only feed it rules generated by this notebook.
    """
    conditions = rule[:-1]
    misses = 0
    for _, row in test.iterrows():
        for condition in conditions:
            if eval(condition) == False:
                # One failed condition is enough to reject the row.
                misses += 1
                break
    total = test.shape[0]
    return (total - misses) / total


def recursive_len(item):
    """Count the non-list elements in an arbitrarily nested list.

    A value that is not a ``list`` counts as 1; an empty list counts as 0.
    """
    total = 0
    pending = [item]
    while pending:
        current = pending.pop()
        if type(current) == list:
            pending.extend(current)
        else:
            total += 1
    return total
    
def prune(all_rules,val_data):
    """Greedily drop rule conditions that raise accuracy on the validation set.

    For each rule whose label occurs in ``val_data['Class']``, the rule is
    scored with ``calc_accuracy_rule`` on the validation rows of that label.
    Each pass tries removing every single condition, keeps the removal that
    scores best so far, and repeats until no removal improves the score or
    the rule is down to one condition plus its label.

    Args:
        all_rules: list of rules (condition strings followed by a label);
            pruned in place.
        val_data: validation DataFrame with a ``'Class'`` column.

    Returns:
        ``(all_rules, maping)`` where ``maping`` is a DataFrame with column 0 =
        final accuracy of each rule and column 1 = rule index, sorted by
        accuracy in descending order.
    """
    acc_rlist = []
    rulenos = []
    # Labels present in the validation data; rules for other labels are left untouched.
    ctoprune = val_data['Class'].unique()
    print("before pruning:", recursive_len(all_rules))

    for i in range(len(all_rules)):
        init_accuracy = 0
        if all_rules[i][-1] in ctoprune:
            label = all_rules[i][-1]
            test = val_data[val_data['Class'] == label]
            init_accuracy = calc_accuracy_rule(all_rules[i], test)

            pruned_accuracy = -1
            while init_accuracy != pruned_accuracy:
                # Index of the best condition to drop in this pass (None = no improvement).
                delx = None
                for x in range(len(all_rules[i]) - 1):
                    temp = all_rules[i][:]
                    del temp[x]
                    accuracy = calc_accuracy_rule(temp, test)
                    if accuracy > init_accuracy:
                        delx = x
                        init_accuracy = accuracy

                if delx is not None:
                    del all_rules[i][delx]
                    # Stop once only one condition and the label remain.
                    if len(all_rules[i]) == 2:
                        pruned_accuracy = init_accuracy
                else:
                    pruned_accuracy = init_accuracy
        else:
            pruned_accuracy = init_accuracy
        acc_rlist.append(pruned_accuracy)
        rulenos.append(i)

    # Fixed label: this second report is the rule-set size AFTER pruning.
    print("after pruning:", recursive_len(all_rules))
    maping = pd.DataFrame(np.array([acc_rlist, rulenos]).T)
    maping = maping.sort_values(0, ascending=False)
    return all_rules,maping
                
def predict_prunedtree(all_rules,test_data,maping):
    """Classify each row with the first matching rule, trying rules in ``maping`` order.

    Args:
        all_rules: list of rules (condition strings followed by a label).
        test_data: DataFrame of rows to classify.
        maping: DataFrame whose column 1 holds rule indices; rows are tried in
            the order they appear.

    Returns:
        List of predicted labels. A row that matches no rule is counted as
        unclassified and contributes no entry, so the result can be shorter
        than ``test_data``.

    Note:
        Conditions are run through ``eval`` and refer to a variable literally
        named ``row``; the loop variable must keep that name.
    """
    answer = []
    unclassified = 0
    for _, row in test_data.iterrows():
        prediction = None
        for _, ranked in maping.iterrows():
            rule = all_rules[int(ranked[1])]
            satisfied = 0
            for condition in rule[:-1]:
                if eval(condition) == True:
                    satisfied += 1
            if satisfied == len(rule) - 1:
                prediction = rule[-1]
                break
        if prediction is None:
            unclassified += 1
        else:
            answer.append(prediction)
    return answer

def predict_preprunedtree(all_rules,test_data):
    """Classify each row with the first rule in ``all_rules`` whose conditions all hold.

    Args:
        all_rules: list of rules (condition strings followed by a label),
            tried in list order.
        test_data: DataFrame of rows to classify.

    Returns:
        List of predicted labels. A row that matches no rule is counted as
        unclassified and contributes no entry, so the result can be shorter
        than ``test_data``.

    Note:
        Conditions are run through ``eval`` and refer to a variable literally
        named ``row``; the loop variable must keep that name.
    """
    answer = []
    unclassified = 0
    for _, row in test_data.iterrows():
        prediction = None
        for rule in all_rules:
            satisfied = 0
            for condition in rule[:-1]:
                if eval(condition) == True:
                    satisfied += 1
            if satisfied == len(rule) - 1:
                prediction = rule[-1]
                break
        if prediction is None:
            unclassified += 1
        else:
            answer.append(prediction)
    return answer

def add_noise2(num,data):
    """Inject label noise by changing the label of randomly picked rows, in place.

    ``num`` percent of the row count determines how many picks are made. Each
    pick changes the label (last column) of ONE row: a positive label is
    decremented by 1 and any other label is incremented by 1, which flips 0/1
    labels. The same row may be picked more than once.

    Args:
        num: noise level in percent of the number of rows.
        data: DataFrame whose last column is the label; modified in place.

    Returns:
        The same ``data`` object.
    """
    siz_d = data.shape[0]
    indx = int((num * siz_d)/100)
    for _ in range(indx):
        # NOTE(review): picks are drawn from the first half of the frame only,
        # as in the original code — confirm that this is intended.
        pick = random.randint(0,int(siz_d/2))
        # Address exactly one row. The previous `iloc[pick:, -1]` slice rewrote
        # every label from `pick` to the end of the frame.
        label = data.iloc[pick, -1]
        if label > 0:
            data.iloc[pick, -1] = label - 1
        else:
            data.iloc[pick, -1] = label + 1
    return data

def add_noise1(num,data):
    """Append contradictory examples: copies of random rows with a changed label.

    ``num`` percent of the row count determines how many copies are appended.
    Each copy keeps the features (and index label) of the picked row; its label
    (last column) is decremented by 1 when positive and incremented by 1
    otherwise, which flips 0/1 labels.

    Args:
        num: noise level in percent of the number of rows.
        data: DataFrame whose last column is the label; not modified.

    Returns:
        A new DataFrame with the contradictory rows appended, or ``data``
        itself when no rows are to be added.

    Changes from the previous version:
        * ``randint(0, siz_d)`` could return ``siz_d``, one past the last row.
        * ``DataFrame.append`` no longer exists in pandas 2; rows are now
          gathered once and concatenated.
        * Labels were pushed to 2 / -1 instead of being flipped, unlike
          ``add_noise2`` — NOTE(review): confirm that flipping is the intent.
    """
    siz_d = data.shape[0]
    indx = int((num * siz_d)/100)
    if indx <= 0:
        return data

    picks = [random.randint(0, siz_d - 1) for _ in range(indx)]
    contradictions = data.iloc[picks].copy()
    contradictions.iloc[:, -1] = [
        label - 1 if label > 0 else label + 1
        for label in contradictions.iloc[:, -1]
    ]
    return pd.concat([data, contradictions])

# Result accumulators, presumably for the add_noise1 / add_noise2 experiments at
# 5 / 10 / 15 % noise; nothing in the visible code fills them — TODO confirm.
noise1_5 = []
noise1_10 = []
noise1_15 = []
noise2_5 = []
noise2_10 = []
noise2_15 = []

if __name__ == "__main__":
    # Per-fold metrics. mean_bprun holds the accuracy BEFORE pruning; meanacc,
    # meanf1_score and meanrecall_score hold the metrics AFTER pruning.
    meanacc= []
    meanf1_score= []
    meanrecall_score= []
    maping = []
    mean_bprun = []
    rkf = RepeatedKFold(n_splits=10, n_repeats=2, random_state=2652124)
    for train_index, test_index in rkf.split(df_new1):
        parent_node = None
        test_data = df_new1.iloc[test_index]
        # Carve a validation set for rule pruning out of the training fold.
        # Seeded so the whole evaluation is reproducible, like the fold split itself.
        train_data,val_data= train_test_split(df_new1.iloc[train_index], test_size=0.2, random_state=0)

        tree = decision_tree_algorithm(train_data,parent_node)
        all_rules = get_paths(tree,[],0,[],' <= ')

        # Pre-pruning metrics of the rule set on the test fold
        y_true= test_data.iloc[:,-1].values
        answer = predict_preprunedtree(all_rules,test_data)
        accuracy = accuracy_score(y_true, answer)
        f1_sc = f1_score(y_true, answer)
        recall_sc = recall_score(y_true, answer)
        print('Accuracy of Test Data With Rules',accuracy)
        print('f1_score of Test Data With Rules',f1_sc)
        print('recall_score of Test Data With Rules',recall_sc)
        mean_bprun.append(accuracy)

        # Post-pruning metrics. F1 and recall are recomputed on the pruned
        # predictions; previously the pre-pruning values were averaged next to
        # the post-pruning accuracy.
        all_rules,maping = prune(all_rules,val_data)
        answer = predict_prunedtree(all_rules,test_data,maping)
        meanacc.append(accuracy_score(y_true, answer))
        meanf1_score.append(f1_score(y_true, answer))
        meanrecall_score.append(recall_score(y_true, answer))

    print('Mean Accuracy So Far on Test:',sum(meanacc) / len(meanacc))
    print('The Mean Accuracy of Classifier',sum(mean_bprun) / len(mean_bprun))
    print('The Mean Variance of Classifier',np.var(np.array(mean_bprun)))
    print('Mean f1_score So Far on Test:',sum(meanf1_score) / len(meanf1_score))
    print('Mean recall_score So Far on Test:',sum(meanrecall_score) / len(meanrecall_score))





Accuracy of Test Data With Rules 0.9494949494949495
f1_score of Test Data With Rules 0.9523809523809523
recall_score of Test Data With Rules 0.9259259259259259
before pruning: 539
before pruning: 300
Accuracy of Test Data With Rules 0.9090909090909091
f1_score of Test Data With Rules 0.9142857142857144
recall_score of Test Data With Rules 0.8727272727272727
before pruning: 758
before pruning: 423
Accuracy of Test Data With Rules 0.8585858585858586
f1_score of Test Data With Rules 0.8727272727272727
recall_score of Test Data With Rules 0.8571428571428571
before pruning: 689
before pruning: 299
Accuracy of Test Data With Rules 0.8585858585858586
f1_score of Test Data With Rules 0.8541666666666666
recall_score of Test Data With Rules 0.9111111111111111
before pruning: 636
before pruning: 454
Accuracy of Test Data With Rules 0.9183673469387755
f1_score of Test Data With Rules 0.9166666666666666
recall_score of Test Data With Rules 0.9361702127659575
before pruning: 885
before pruning: 491


### Implementing Oversampling

Here we apply over-sampling to deal with the highly imbalanced dataset. Over-sampling adds synthetic minority-class samples until both classes contain as many samples as the majority class has in the original dataset.
There are two classes, fraudulent and non-fraudulent: fraudulent is the minority class (label 1) and non-fraudulent is the majority class (label 0).
Over-sampling roughly doubles the size of the dataset, which makes the decision tree algorithms very slow to run. To deal with this, we take a small sample with an equal number of fraudulent and non-fraudulent rows.


In [3]:
# Synthesize minority-class rows with SMOTE until both classes are the same size.
# Seeded (like the RandomUnderSampler cell) so the synthetic rows are reproducible.
X_resampled_O, y_resampled_O = SMOTE(random_state=0).fit_resample(X, y)
X_resampled_df=pd.DataFrame(X_resampled_O)
y_resampled_df=pd.DataFrame(y_resampled_O)
y_resampled_df=y_resampled_df.rename(columns={0:30})
# Re-assemble features and label into one frame with the label as the last column.
df_new=pd.concat([X_resampled_df, y_resampled_df], axis=1)
print(sorted(collections.Counter(y_resampled_O).items()))
# Restore the original column names (this overwrites whatever names the concat produced).
df_new.columns=df.keys()
df_new.head()



[(0, 284315), (1, 284315)]


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [5]:
# Draw a 0.1% sample from each class of the over-sampled data so the pure-Python
# tree code stays tractable. Seeded so the subset is the same on every run.
df_new_1 = df_new.loc[df_new['Class'] == 1]
df_new_0 = df_new.loc[df_new['Class'] == 0]
df_new_0_subset = df_new_0.sample(frac=0.001, random_state=0)
df_new_1_subset = df_new_1.sample(frac=0.001, random_state=0)
print("resampled class 0 subset shape: {}".format(df_new_0_subset.shape))
print("resampled class 1 subset shape: {}".format(df_new_1_subset.shape))
print("class 1 to class 0 ratio = 1 : {}".format(df_new_0_subset.shape[0]/df_new_1_subset.shape[0]))

resampled class 0 subset shape: (284, 31)
resampled class 1 subset shape: (284, 31)
class 1 to class 0 ratio = 1 : 1.0


In [6]:
# Balanced subset: class-0 sample stacked on top of class-1 sample (original index labels are kept).
df_new2=pd.concat([df_new_0_subset, df_new_1_subset])
df_new2.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
91844,63651.0,1.261364,-1.570609,1.164633,-1.227305,-1.956697,0.451474,-1.687024,0.306362,-1.274451,...,-0.132478,0.056615,0.037879,0.016463,0.119284,-0.18835,0.080262,0.028114,67.5,0
149421,91207.0,-0.615437,0.540314,1.348058,0.455041,0.488695,1.043549,0.710867,0.306068,1.392721,...,-0.535097,-1.373765,0.2336,0.104334,-0.283933,-1.175535,0.109232,0.149126,99.8,0
77816,57226.0,0.860384,-0.525465,1.252583,1.337996,-1.131,0.20757,-0.70088,0.258361,0.768112,...,0.34838,0.753101,-0.142318,0.074414,0.209207,-0.220112,0.060673,0.061285,140.0,0
190037,128662.0,-0.294122,1.123794,0.040958,-0.665765,1.281603,-0.162226,1.105,-0.192295,-0.16956,...,-0.214339,-0.388622,-0.067642,0.044649,-0.317071,-0.578125,0.296198,0.258734,1.51,0
249623,154501.0,-0.955902,1.419717,0.142302,-0.478265,0.956014,-0.234302,1.066341,-0.331764,0.454253,...,-0.462791,-0.741062,-0.232627,-1.054904,-0.122503,0.421441,0.367961,0.170138,8.07,0


### Splitting Test and Train data

In [13]:
# Hold out 33% for testing (seeded).
# NOTE(review): this splits the full SMOTE output (X_resampled_O / y_resampled_O),
# not the small balanced subset df_new2 built above — confirm which is intended.
X_train, X_test, y_train, y_test = train_test_split(X_resampled_O, y_resampled_O, test_size=0.33, random_state=42)

### C4.5 Implementation

In [9]:
import random
# Parent to pass for the root call of decision_tree_algorithm (None = no parent).
parent_node = None
# the node class that will make up the tree
class decisionTreeNode():
    """Node of the binary decision tree built by ``decision_tree_algorithm``.

    Attributes:
        is_leaf_node: True when the node carries a class label instead of a split.
        classification: class label predicted by a leaf (None for internal nodes).
        attribute_split: name of the feature tested at an internal node; always
            starts as None and is filled in by the tree builder.
        attribute_split_value: threshold compared against ``attribute_split``.
        parent: parent node, or None for the root.
        left_child: subtree for rows with ``feature <= threshold``.
        right_child: subtree for rows with ``feature > threshold``.
        height: depth of the node as recorded by the tree builder.
    """

    def __init__(self, is_leaf_node, classification, attribute_split_value, parent, left_child, right_child, height):
        # The constructor now stores the arguments it receives; previously every
        # argument except `parent` was silently discarded.
        self.classification = classification
        self.attribute_split = None
        self.attribute_split_value = attribute_split_value
        self.parent = parent
        self.left_child = left_child
        self.right_child = right_child
        self.height = height
        self.is_leaf_node = is_leaf_node



#Split the data based on the feature and a value to data above and data below
def split_data(data, split_column, split_value):
    """Partition the rows of a 2-D array around a threshold on one column.

    Args:
        data: 2-D numpy array.
        split_column: index of the column to test.
        split_value: threshold.

    Returns:
        ``(data_below, data_above)``: rows whose column value is
        ``<= split_value`` and rows whose value is ``> split_value``.
    """
    column = data[:, split_column]
    at_or_below = column <= split_value
    strictly_above = column > split_value
    return data[at_or_below], data[strictly_above]

#Get all the boundary values for each features (Key is feature and values are the splits)
def get_potential_splits(data):
    """Collect candidate thresholds for every feature column.

    The last column of ``data`` is the label and is skipped. For each feature
    column the candidates are the midpoints between consecutive distinct
    values; a column with a single distinct value contributes that value itself.

    Returns:
        dict mapping column index -> list of candidate split values.
    """
    _, n_columns = data.shape
    potential_splits = {}
    for column_index in range(n_columns - 1):
        distinct = np.unique(data[:, column_index])
        if distinct.shape[0] == 1:
            # Constant column: the only available "threshold" is the value itself.
            candidates = [distinct[0]]
        else:
            # Midpoints of neighbouring distinct values (np.unique returns them sorted).
            candidates = list((distinct[1:] + distinct[:-1]) / 2)
        potential_splits[column_index] = candidates
    return potential_splits

#Calculates Entropy of the data given
def calculate_entropy(data):
    """Return the base-2 Shannon entropy of the labels in the last column of ``data``."""
    labels = data[:, -1]
    class_counts = np.unique(labels, return_counts=True)[1]
    probabilities = class_counts / class_counts.sum()
    surprisal = -np.log2(probabilities)
    return sum(probabilities * surprisal)

#Calculates the entropy of data below and data above
def calculate_overall_entropy(data_below, data_above):
    """Return the size-weighted average entropy of the two partitions of a split."""
    n_below, n_above = len(data_below), len(data_above)
    total = n_below + n_above
    weight_below = n_below / total
    weight_above = n_above / total
    return (weight_below * calculate_entropy(data_below)
            + weight_above * calculate_entropy(data_above))

#Check if all data is of same class
def check_purity(data):
    """Return True when every row of ``data`` carries the same label (last column)."""
    distinct_labels = np.unique(data[:, -1])
    return len(distinct_labels) == 1

#Classify data based on majority
def classify_data(data):
    """Return the most frequent label in the last column of ``data``.

    Ties go to the smallest label, because ``np.unique`` sorts the labels and
    ``argmax`` returns the first maximum.
    """
    labels, frequencies = np.unique(data[:, -1], return_counts=True)
    return labels[frequencies.argmax()]

def determine_best_split(data, potential_splits):
    """Pick the (column, value) split with the highest gain ratio.

    Parameters
    ----------
    data : 2-D array whose last column is the class label.
    potential_splits : dict mapping a column index to a list of candidate
        split values for that column.

    Returns
    -------
    (best_split_column, best_split_value)
        The candidate with the largest gain ratio, where
        gain ratio = information gain / splitting information and is taken
        as 0 when the splitting information is 0. Ties are resolved in favour
        of the candidate visited last (the comparison is ``>=``).

    Raises
    ------
    ValueError
        If no candidate could be selected (for example ``potential_splits``
        holds no values). Previously this surfaced as an UnboundLocalError
        at the return statement.
    """
    entropy_label = calculate_entropy(data)
    overall_gain = -1.0
    best_split_column = None
    best_split_value = None

    for column_index, candidate_values in potential_splits.items():
        for value in candidate_values:
            data_below, data_above = split_data(data, split_column=column_index, split_value=value)
            current_overall_entropy = calculate_overall_entropy(data_below, data_above)
            current_information_gain = entropy_label - current_overall_entropy
            current_splitting_info = splitting_information(data_below, data_above)

            # A split that leaves one side empty has zero splitting information;
            # treat its gain ratio as 0 instead of dividing by zero.
            if current_splitting_info == 0:
                current_gain_ratio = 0
            else:
                current_gain_ratio = float(current_information_gain / current_splitting_info)

            if current_gain_ratio >= overall_gain:
                overall_gain = current_gain_ratio
                best_split_column = column_index
                best_split_value = value

    if best_split_column is None:
        raise ValueError("determine_best_split: no candidate split could be selected")

    return best_split_column, best_split_value

def splitting_information(data_below,data_above):
    """Return the split information (entropy of the partition sizes) of a split.

    When one side is empty the other side holds every row, so its proportion
    is 1 and the returned value is 1 * log2(1), i.e. zero.
    """
    total_rows = len(data_below) + len(data_above)
    fraction_below = len(data_below) / total_rows
    fraction_above = len(data_above) / total_rows

    if fraction_below == 0:
        return fraction_above * np.log2(fraction_above)
    if fraction_above == 0:
        return fraction_below * np.log2(fraction_below)

    return -fraction_below * np.log2(fraction_below) - fraction_above * np.log2(fraction_above)

def decision_tree_algorithm(df, parent_node,counter=0, min_samples=3):
    """Recursively grow a decision tree and return its root node.

    Parameters
    ----------
    df : on the first call (``counter == 0``) a pandas DataFrame whose last
        column is the class label; on recursive calls the 2-D value array of
        the current partition.
    parent_node : node this subtree hangs from, or None for the root.
    counter : recursion marker; 0 means "top-level call", which also stores
        the DataFrame's column names in the global ``COLUMN_HEADERS``.
    min_samples : partitions with fewer rows than this become leaves.

    Returns
    -------
    decisionTreeNode
        A leaf (``is_leaf_node`` True, ``classification`` set) or an internal
        node with ``attribute_split``, ``attribute_split_value``,
        ``left_child`` (rows sent to ``data_below``) and ``right_child``.
    """
    global COLUMN_HEADERS

    # Positional arguments follow the decisionTreeNode constructor defined
    # elsewhere in this file.
    node = decisionTreeNode(True, None, None, parent_node, None, None, 0)

    # data preparations
    if counter == 0:
        COLUMN_HEADERS = df.columns
        data = df.values
    else:
        data = df

    # base cases: pure partition or too few rows to split further
    if check_purity(data) or len(data) < min_samples:
        node.is_leaf_node = True
        node.classification = classify_data(data)
        return node

    # recursive part
    counter += 1

    potential_splits = get_potential_splits(data)
    split_column, split_value = determine_best_split(data, potential_splits)
    data_below, data_above = split_data(data, split_column, split_value)

    # A split that leaves one side empty separates nothing (e.g. rows with
    # identical features but different labels). Recursing would repeat the
    # same split on the same rows forever, so stop with a majority-vote leaf.
    if len(data_below) == 0 or len(data_above) == 0:
        node.is_leaf_node = True
        node.classification = classify_data(data)
        return node

    node.is_leaf_node = False
    feature_name = COLUMN_HEADERS[split_column]

    if parent_node is None:
        node.height = 0
    else:
        node.parent = parent_node
        node.height = node.parent.height + 1

    node.attribute_split = feature_name
    node.attribute_split_value = split_value

    # find answers (recursion)
    node.left_child = decision_tree_algorithm(data_below, node, counter, min_samples)
    node.right_child = decision_tree_algorithm(data_above, node, counter, min_samples)

    return node

def get_paths(root, path, pathlen,all_paths,val):
    """Collect every root-to-leaf path of the tree as a rule.

    A rule is a list of condition strings of the form
    ``row['<feature>'] <= <value>`` / ``row['<feature>'] > <value>`` followed
    by the leaf's classification as the last element.

    Parameters
    ----------
    root : current node (None is tolerated and contributes nothing).
    path : list used as the working stack of conditions; mutated during the
        walk and restored to its incoming state on return.
    pathlen : number of entries of ``path`` that belong to the ancestors.
    all_paths : accumulator list; completed rules are appended to it.
    val : comparison text used when this node's own condition is first
        pushed (callers in this file always pass ``' <= '``).

    Returns
    -------
    list
        ``all_paths`` (also when ``root`` is None, so an empty tree yields
        the accumulator unchanged rather than None).
    """
    if root is None:
        return all_paths

    if root.is_leaf_node == True:
        path.append(root.classification)
    else:
        path.append("row['" + root.attribute_split + "']" + val + str(root.attribute_split_value))

    pathlen = pathlen + 1
    if root.left_child is None and root.right_child is None:
        # Leaf reached: store a copy of the current path as a finished rule.
        all_paths.append(path[:])
        path.pop()
    else:
        get_paths(root.left_child, path, pathlen, all_paths, ' <= ')
        # Flip this node's own condition before descending to the right.
        path[pathlen - 1] = "row['" + root.attribute_split + "']" + ' > ' + str(root.attribute_split_value)
        get_paths(root.right_child, path, pathlen, all_paths, ' <= ')
        path.pop()

    return all_paths

def classify_test_data(root,data):
    """Classify every row of ``data`` by walking the tree from ``root``.

    The last column of ``data`` is dropped before the walk. At each internal
    node a row goes to the left child when its value for the node's split
    attribute is ``<=`` the split value, otherwise to the right child.
    Returns the list of leaf classifications in row order.
    """
    features = data.iloc[:, :-1]
    predictions = []
    for _, sample in features.iterrows():
        node = root
        while node.is_leaf_node != True:
            goes_left = sample.loc[node.attribute_split] <= node.attribute_split_value
            node = node.left_child if goes_left else node.right_child
        predictions.append(node.classification)

    return predictions

def calc_accuracy_rule(rule,test):
    """Return the fraction of rows in ``test`` that satisfy every condition of ``rule``.

    ``rule`` is a list of condition strings followed by the class label; the
    label (last element) is not evaluated. Each condition is evaluated with
    ``eval`` and refers to the current row through the local name ``row``.
    """
    total_rows = test.shape[0]
    wrong = 0
    for _, row in test.iterrows():
        # `row` must keep this exact name: the rule strings reference it.
        for condition in rule[:-1]:
            if eval(condition) == False:
                # First failing condition is enough to count the row as missed.
                wrong += 1
                break

    return (total_rows - wrong) / total_rows


def recursive_len(item):
    """Count the non-list elements contained in an arbitrarily nested list.

    Anything that is not exactly a ``list`` counts as one element.
    """
    if type(item) is not list:
        return 1

    total = 0
    for subitem in item:
        total += recursive_len(subitem)
    return total
    
def prune(all_rules,val_data):
    """Greedily remove conditions from each rule while its validation score improves.

    For every rule whose label occurs in ``val_data['Class']`` the rule is
    scored with ``calc_accuracy_rule`` on the validation rows of that label.
    In each pass every single-condition deletion is tried and the one giving
    the highest strictly better score is applied; passes repeat until no
    deletion helps or a deletion leaves the rule with one condition.

    Parameters
    ----------
    all_rules : list of rules (condition strings followed by the label as the
        last element). Modified in place.
    val_data : pandas DataFrame with a 'Class' column.

    Returns
    -------
    (all_rules, maping)
        ``maping`` is a DataFrame with column 0 = final score of each rule
        (0 for rules whose label is absent from ``val_data``) and column 1 =
        rule index, sorted by score in descending order.
    """
    acc_rlist = []
    rulenos = []
    # Labels that actually occur in the validation data.
    ctoprune = val_data['Class'].unique()

    print("before pruning:",recursive_len(all_rules))

    for i in range(len(all_rules)):
        rule = all_rules[i]
        init_accuracy = 0
        # Only rules whose label is present in the validation set can be scored.
        if rule[-1] in ctoprune:
            label = rule[-1]
            test = val_data[val_data['Class']==label]
            init_accuracy = calc_accuracy_rule(rule,test)

            # NOTE(review): a rule that starts with a single condition can be
            # pruned down to just its label (which then matches every row);
            # confirm whether that is intended.
            while True:
                delx = None
                for x in range(len(rule)-1):
                    # Always start from the current rule; reusing the copy
                    # from the previous pass would score a stale rule.
                    temp = rule[:]
                    del temp[x]
                    accuracy = calc_accuracy_rule(temp,test)
                    if accuracy > init_accuracy:
                        delx = x
                        init_accuracy = accuracy

                if delx is None:
                    break
                del rule[delx]
                if len(rule) == 2:
                    # One condition plus the label is left: stop pruning.
                    break

        acc_rlist.append(init_accuracy)
        rulenos.append(i)

    print("after pruning:",recursive_len(all_rules))

    maping = np.array([acc_rlist, rulenos])
    maping = pd.DataFrame(maping.T)
    maping = maping.sort_values(0,ascending=False)
    return all_rules,maping
                
def predict_prunedtree(all_rules,test_data,maping):
    """Predict a label for each row using the rules in the order given by ``maping``.

    ``maping`` is iterated row by row; column 1 of each row is the index of
    the rule to try. The first rule whose conditions all evaluate to True
    supplies the prediction. Rows that match no rule are skipped, so the
    returned list can be shorter than ``test_data``.
    """
    answer = []
    unclassified = 0
    for _, row in test_data.iterrows():
        # `row` must keep this exact name: the rule strings reference it.
        prediction = None
        for _, ranked in maping.iterrows():
            rule = all_rules[int(ranked[1])]
            satisfied = 0
            for condition in rule[:-1]:
                if eval(condition) == True:
                    satisfied += 1
            if satisfied == len(rule)-1:
                prediction = rule[-1]
                break

        if prediction is None:
            unclassified += 1
        else:
            answer.append(prediction)
    return answer

def predict_preprunedtree(all_rules,test_data):
    """Predict a label for each row by trying the rules in list order.

    The first rule whose conditions all evaluate to True supplies the
    prediction. Rows that match no rule are skipped, so the returned list can
    be shorter than ``test_data``.
    """
    answer = []
    unclassified = 0
    for _, row in test_data.iterrows():
        # `row` must keep this exact name: the rule strings reference it.
        prediction = None
        for rule in all_rules:
            satisfied = 0
            for condition in rule[:-1]:
                if eval(condition) == True:
                    satisfied += 1
            if satisfied == len(rule)-1:
                prediction = rule[-1]
                break

        if prediction is None:
            unclassified += 1
        else:
            answer.append(prediction)
    return answer

def add_noise2(num,data):
    """Flip the label of randomly chosen rows of ``data`` in place.

    ``num`` percent of the row count (rounded down) picks are made; each pick
    changes the label (last column) of exactly one row: a label greater than
    0 is decreased by 1, any other label is increased by 1. The same row can
    be picked more than once.

    The frame is modified in place and also returned.
    """
    siz_d = data.shape[0]
    indx = int((num * siz_d)/100)
    for _ in range(indx):
        # NOTE(review): only positions 0..siz_d/2 are ever picked — confirm
        # that restricting the noise to the first half is intended.
        pick = random.randint(0,int(siz_d/2))
        label = data.iloc[pick,-1]
        # Assign to the single picked row; slicing `pick:` here would rewrite
        # every row from `pick` to the end of the frame.
        if label > 0:
            data.iloc[pick,-1] = label - 1
        else:
            data.iloc[pick,-1] = label + 1
    return data

def add_noise1(num,data):
    """Return ``data`` with extra rows that duplicate random rows under a changed label.

    ``num`` percent of the row count (rounded down) rows are drawn at random
    from the original rows. Each drawn row is copied, its label (last column)
    is changed, and the copy is appended after the original rows. The input
    frame is not modified; when no rows are to be added it is returned as is.
    """
    siz_d = data.shape[0]
    indx = int((num * siz_d)/100)
    noisy_rows = []
    for _ in range(indx):
        # Assuming `random` is the standard-library module, randint includes
        # both bounds, so the last valid row position is siz_d - 1.
        pick = random.randint(0, siz_d - 1)
        duplicate = data.iloc[[pick]].copy()
        label = duplicate.iloc[0, -1]
        # NOTE(review): positive labels move up and others move down, which
        # yields values such as 2 or -1 for 0/1 labels — confirm this is the
        # intended kind of noise (add_noise2 flips in the opposite direction).
        if label > 0:
            label = label + 1
        else:
            label = label - 1
        duplicate.iloc[0, -1] = label
        noisy_rows.append(duplicate)

    if not noisy_rows:
        return data
    # Concatenate once instead of growing the frame row by row.
    return pd.concat([data] + noisy_rows)

noise1_5 = []
noise1_10 = []
noise1_15 = []
noise2_5 = []
noise2_10 = []
noise2_15 = []

if __name__ == "__main__":
    # Seed shared by the repeated K-fold splitter and the train/validation
    # split so that a fresh run reproduces the same partitions.
    EVAL_SEED = 2652124

    # Per-fold metrics of the pruned rule set.
    meanacc= []
    meanf1_score= []
    meanrecall_score= []
    maping = []
    # Per-fold metrics of the unpruned rule set (accuracy, F1, recall).
    mean_bprun = []
    mean_bprun1 = []
    mean_bprun2 = []

    rkf = RepeatedKFold(n_splits=10, n_repeats=2, random_state=EVAL_SEED)
    for train_index, test_index in rkf.split(df_new2):
        parent_node = None
        test_data = df_new2.iloc[test_index]
        train_data,val_data= train_test_split(df_new2.iloc[train_index], test_size=0.2,
                                              random_state=EVAL_SEED)

        # Grow the tree on the training part and flatten it into rules.
        tree = decision_tree_algorithm(train_data,parent_node)
        all_rules = get_paths(tree,[],0,[],' <= ')

        # Accuracy / F1 / recall on the test fold with the unpruned rules.
        y_true= test_data.iloc[:,-1].values
        answer = predict_preprunedtree(all_rules,test_data)
        accuracy = accuracy_score(y_true, answer)
        f1_sc = f1_score(y_true, answer)
        recall_sc = recall_score(y_true, answer)
        print('Accuracy of Test Data With Rules',accuracy)
        print('f1_score of Test Data With Rules',f1_sc)
        print('recall_score of Test Data With Rules',recall_sc)
        mean_bprun.append(accuracy)
        mean_bprun1.append(f1_sc)
        mean_bprun2.append(recall_sc)

        # Prune on the validation part, then score the pruned rules on the
        # same test fold. All three metrics are recomputed from the pruned
        # predictions so the post-pruning means are not the pre-pruning ones.
        all_rules,maping = prune(all_rules,val_data)
        answer = predict_prunedtree(all_rules,test_data,maping)
        accuracy = accuracy_score(y_true, answer)
        f1_sc = f1_score(y_true, answer)
        recall_sc = recall_score(y_true, answer)
        meanacc.append(accuracy)
        meanf1_score.append(f1_sc)
        meanrecall_score.append(recall_sc)

    print('Mean Accuracy So Far on Test:',sum(meanacc) / len(meanacc))
    print('Mean f1_score So Far on Test:',sum(meanf1_score) / len(meanf1_score))
    print('Mean recall_score So Far on Test:',sum(meanrecall_score) / len(meanrecall_score))
    print('The Mean Accuracy of Classifier ',sum(mean_bprun) / len(mean_bprun))
    print('The Mean f1_score of Classifier ',sum(mean_bprun1) / len(mean_bprun1))
    print('The Mean recall_score of Classifier ',sum(mean_bprun2) / len(mean_bprun2))
    # Variance of the per-fold accuracies of the unpruned rule set.
    print('The Mean Variance of Classifier ',np.var(np.array(mean_bprun)))
   

Accuracy of Test Data With Rules 0.9649122807017544
f1_score of Test Data With Rules 0.9666666666666666
recall_score of Test Data With Rules 1.0
before pruning: 117
before pruning: 70
Accuracy of Test Data With Rules 0.8947368421052632
f1_score of Test Data With Rules 0.8636363636363636
recall_score of Test Data With Rules 0.8636363636363636
before pruning: 129
before pruning: 70
Accuracy of Test Data With Rules 0.8947368421052632
f1_score of Test Data With Rules 0.8846153846153846
recall_score of Test Data With Rules 0.7931034482758621
before pruning: 91
before pruning: 68
Accuracy of Test Data With Rules 0.9473684210526315
f1_score of Test Data With Rules 0.9491525423728813
recall_score of Test Data With Rules 1.0
before pruning: 73
before pruning: 35
Accuracy of Test Data With Rules 1.0
f1_score of Test Data With Rules 1.0
recall_score of Test Data With Rules 1.0
before pruning: 211
before pruning: 132
Accuracy of Test Data With Rules 0.9649122807017544
f1_score of Test Data With Ru

In [15]:
# Split df_new2 into the feature columns (all but the last) and the label
# (last column); the trailing expression displays the first feature rows.
X_new2 = df_new2.iloc[:,:-1]
y_new2 = df_new2.iloc[:,-1]
X_new2.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
243249,151854.0,0.695087,-2.570941,-1.623645,0.686052,-0.967187,-0.129365,0.51587,-0.175081,0.678933,...,1.184244,0.411233,-0.272745,-0.436602,-0.434089,-0.527159,0.477314,-0.193508,0.047603,693.2
53653,46088.0,1.344553,-0.56967,0.069459,-0.814642,-0.757287,-0.66099,-0.43841,-0.125444,-1.12391,...,0.148648,0.260735,0.579145,-0.194078,0.03735,0.629855,-0.116573,-0.016747,0.002344,43.1
140721,83890.0,1.069853,0.071475,0.484912,1.346547,-0.249256,0.017255,-0.031922,0.135633,0.077774,...,-0.173677,-0.029689,0.050721,-0.061816,0.221879,0.584474,-0.34296,0.030084,0.010263,27.62
152136,96953.0,1.845394,0.31122,0.276847,4.045099,-0.221962,0.279406,-0.571115,0.022169,1.002635,...,-0.299907,0.107481,0.591033,0.119729,-0.194731,-0.196962,0.069587,-0.022024,-0.037917,26.97
273033,165388.0,2.048592,-0.110847,-1.206953,0.208453,0.101991,-0.667002,0.069879,-0.160737,0.313878,...,-0.20473,-0.264155,-0.660785,0.280766,-0.40754,-0.277268,0.202873,-0.075116,-0.074302,0.89


In [9]:
# Hold out 33% of the rows as the test set; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X_new2, y_new2, test_size=0.33, random_state=42)

In [10]:
# Sweep max_depth from 1 to 19 and remember the depth with the best test score.
# NOTE(review): the depth is selected on the test split itself, so the reported
# test score is optimistic — consider selecting on a validation split or with
# cross-validation instead.
max_test_score = 0
max_i = None
related_train_score = None
best_clf_dt = None
MaxDepth=[]
TestScore=[]
TrainScore=[]
for i in range(1, 20):
    MaxDepth.append(i)
    # Fixed random_state so the sweep gives the same trees on every run.
    clf_dt = DecisionTreeClassifier(max_depth=i, random_state=42)
    clf_dt.fit(X_train, y_train)
    train_score = clf_dt.score(X_train, y_train)
    test_score = clf_dt.score(X_test, y_test)
    TestScore.append(test_score)
    TrainScore.append(train_score)
    # Strict comparison: on equal test scores the shallower tree is kept.
    if test_score > max_test_score:
        related_train_score = train_score
        max_test_score = test_score
        max_i = i
        best_clf_dt = clf_dt
print("depth: ", max_i, "train: ", related_train_score)
print("depth: ", max_i, "test: ", max_test_score)

depth:  15 train:  1.0
depth:  15 test:  0.973404255319149


In [11]:
# CART (gini) tree. Only the non-default settings are spelled out; the
# `presort` argument is no longer accepted by DecisionTreeClassifier and is
# therefore not passed.
clf_gini = DecisionTreeClassifier(
    criterion='gini',
    max_depth=15,
    min_samples_leaf=5,
    random_state=100,
)
clf_gini.fit(X_train, y_train)
clf_gini.score(X_train, y_train)

0.9763157894736842

In [12]:
# Predict labels for the test features with the fitted gini tree; the trailing
# expression displays the prediction array.
y_pred_en_gini = clf_gini.predict(X_test)
y_pred_en_gini

array([0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0,
       1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0,
       1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0])

In [14]:
# Print the title before the matrix it describes, then the per-class report.
print("\t\tConfusion matrix for CART")
print(confusion_matrix(y_test, y_pred_en_gini))
print(classification_report(y_test, y_pred_en_gini))

[[97  3]
 [ 8 80]]
		Confusion matrix for CART
              precision    recall  f1-score   support

           0       0.92      0.97      0.95       100
           1       0.96      0.91      0.94        88

   micro avg       0.94      0.94      0.94       188
   macro avg       0.94      0.94      0.94       188
weighted avg       0.94      0.94      0.94       188

