## Decision Trees

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from math import log
from random import shuffle

In [2]:
class DataPoint:
    """One labelled example: a class label together with its feature values."""

    def __init__(self, label, features):
        self.label = label        # the classification label of this data point
        self.features = features  # feature values, looked up by index elsewhere in this file

    def __str__(self):
        # Rendered as "< label: features >" for debugging output.
        return "< {!s}: {!s} >".format(self.label, self.features)


In [3]:
def get_data(filename):
    """Load labelled data points from a comma-separated text file.

    Every non-blank line is parsed as a row of numbers. The last value on the
    line becomes the label and all preceding values become the features.

    Args:
        filename: path of the file to read.

    Returns:
        A list of DataPoint objects, one per non-blank line, in file order.
    """
    data = []
    with open(filename) as file:
        for line in file:
            # A blank line (for example a trailing newline at the end of the
            # file) has no label column, so indexing the last element of the
            # parsed record would fail. Skip such lines.
            if not line.strip():
                continue
            record = np.fromstring(line, sep=",")
            data.append(DataPoint(record[-1], record[:-1]))
    return data

def PrintDataSet(data):
    """Print each element of data on its own line, in iteration order."""
    for point in data:
        print(point)

In [4]:
class TreeNode:
    """A node of a binary decision tree: either a leaf or a threshold split."""

    # Class-level defaults; code that builds a tree overwrites the fields it needs.
    is_leaf = True          # True for a leaf, False for an internal (splitting) node
    feature_idx = None      # internal node: index of the feature that is compared
    thresh_val = None       # internal node: threshold the feature value is compared to
    prediction = None       # leaf node: value reported for this leaf
    left_child = None       # internal node: first subtree (printed first)
    right_child = None      # internal node: second subtree (printed second)

    def printTree(self):
        """Print this node, then its left and right subtrees (debugging aid)."""
        if self.is_leaf:
            print('Leaf Node:      predicts ' + str(self.prediction))
            return

        description = ('Internal Node:  splits on feature ' + str(self.feature_idx)
                       + ' with threshold ' + str(self.thresh_val))
        print(description)
        for child in (self.left_child, self.right_child):
            child.printTree()

In [5]:
def make_prediction(tree_root, data_point):
    """Walk the tree from tree_root and return the prediction of the leaf reached.

    At every internal node the data point moves to the left child when its
    value for the node's feature is strictly below the node's threshold, and
    to the right child otherwise (greater than or equal).
    """
    current = tree_root
    while True:
        # Anything other than an explicit False is treated as a leaf.
        if current.is_leaf is not False:
            return current.prediction
        value = data_point.features[current.feature_idx]
        if value < current.thresh_val:
            current = current.left_child
        else:
            current = current.right_child

In [7]:
def split_dataset(data, feature_idx, threshold):
    """Partition data by comparing one feature against a threshold.

    Returns a (left, right) tuple of lists: left holds the points whose
    feature value is strictly below threshold, right holds all the others.
    The relative order of the points is kept within each list.
    """
    below, at_or_above = [], []
    for point in data:
        bucket = below if point.features[feature_idx] < threshold else at_or_above
        bucket.append(point)
    return (below, at_or_above)

In [8]:
def calc_entropy(data):
    """Return the binary entropy (in bits) of the labels in data.

    A point counts as positive when its label equals 1; every other label is
    counted as negative.

    Args:
        data: iterable of objects exposing a .label attribute.

    Returns:
        The entropy as a float. An empty data set and a data set containing a
        single class both have entropy 0.0.
    """
    positives = 0
    total = 0
    for d in data:
        total += 1
        if d.label == 1:
            positives += 1

    # No points means no uncertainty; this also avoids dividing by zero below.
    if total == 0:
        return 0.0

    entropy = 0.0
    for count in (positives, total - positives):
        # A class with no members contributes nothing (p * log p -> 0 as p -> 0)
        # and log(0) is undefined, so skip it.
        if count:
            proportion = count / total
            entropy -= proportion * log(proportion, 2.0)
    return entropy

In [9]:
def calc_best_threshold(data, feature_idx):
    """Find the threshold on one feature that yields the highest information gain.

    The points are ordered by the chosen feature. Candidate thresholds are the
    midpoints between neighbouring feature values, taken where the label
    changes between neighbours (the first pair is always considered). Each
    candidate is scored as parent entropy minus the size-weighted entropy of
    the two sides produced by split_dataset.

    Args:
        data: list of data points exposing .label and .features.
        feature_idx: index into each point's .features to split on.

    Returns:
        A tuple (best_info_gain, best_thresh). When no candidate gives a
        strictly positive gain (including when data has fewer than two
        points) the result is (0.0, None).
    """
    best_info_gain = 0.0
    best_thresh = None

    # With fewer than two points there is no pair of neighbours to split between.
    if len(data) < 2:
        return (best_info_gain, best_thresh)

    # sorted() builds a new list, so the caller's list keeps its order.
    data_sorted = sorted(data, key=lambda x: x.features[feature_idx])
    feature_list = [d.features[feature_idx] for d in data_sorted]

    # Computed once up front; it does not change while scanning candidates.
    lowest_value = min(feature_list)

    # Entropy of the unsplit (parent) set.
    entropy_parent = calc_entropy(data)
    total = len(data)

    for i in range(1, total):
        # Only consider boundaries where the label changes (plus the first pair).
        if data_sorted[i].label == data_sorted[i - 1].label and i != 1:
            continue
        # A midpoint between two copies of the smallest value would leave the
        # left side empty, so it is not a usable split.
        if feature_list[i] == feature_list[i - 1] and feature_list[i] == lowest_value:
            continue

        split_value = (feature_list[i] + feature_list[i - 1]) / 2.0
        left, right = split_dataset(data, feature_idx, split_value)

        # Entropy after the split: average of both sides weighted by their size.
        entropy_split = ((len(left) / total) * calc_entropy(left)
                         + (len(right) / total) * calc_entropy(right))

        gain = entropy_parent - entropy_split
        if gain > best_info_gain:
            best_info_gain = gain
            best_thresh = split_value

    return (best_info_gain, best_thresh)



In [10]:
def identify_best_split(data):
    """Choose the feature and threshold with the highest information gain.

    Every feature index of the first data point is evaluated with
    calc_best_threshold.

    Args:
        data: list of data points exposing .features (and whatever
            calc_best_threshold reads from them).

    Returns:
        A tuple (best_feature, best_threshold). Returns (None, None) when data
        has fewer than two points or when no feature offers a usable
        threshold, so the two values are always either both set or both None.
    """
    if len(data) < 2:
        return (None, None)

    best_feature = None
    best_threshold = None
    best_info_gain = 0
    for i in range(len(data[0].features)):
        gain, thresh = calc_best_threshold(data, i)
        # A None threshold means this feature has no useful split; recording
        # its index would pair a feature with a threshold that cannot be used.
        if thresh is None:
            continue
        # ">=" keeps the existing tie-breaking: on equal gain the later feature wins.
        if gain >= best_info_gain:
            best_feature = i
            best_info_gain = gain
            best_threshold = thresh

    return (best_feature, best_threshold)

In [11]:
def createLeafNode(data):
    """Build a leaf TreeNode that predicts the majority label of data.

    Labels equal to 1 count as positive and every other label as negative.
    The leaf predicts 1 only when positives strictly outnumber negatives, so
    ties and an empty data set both predict 0.
    """
    labels = [point.label for point in data]
    positives = sum(1 for value in labels if value == 1)
    negatives = len(labels) - positives

    leaf = TreeNode()
    leaf.is_leaf = True
    leaf.prediction = 1 if positives > negatives else 0
    return leaf

In [12]:
def createDecisionTree(data, max_levels):
    """Recursively build a decision tree for data and return its root node.

    A leaf is produced when max_levels is exactly 1, or when
    identify_best_split reports no feature or no threshold. Otherwise the data
    is split on the chosen feature/threshold and each side is built
    recursively with max_levels reduced by one.
    """
    # Depth budget used up: stop here with a majority-vote leaf.
    if max_levels == 1:
        return createLeafNode(data)

    feature, thresh = identify_best_split(data)

    # No usable split (for example a single data point): fall back to a leaf.
    if feature is None or thresh is None:
        return createLeafNode(data)

    lower_part, upper_part = split_dataset(data, feature, thresh)
    remaining_levels = max_levels - 1

    node = TreeNode()
    node.is_leaf = False
    node.feature_idx = feature
    node.thresh_val = thresh
    node.left_child = createDecisionTree(lower_part, remaining_levels)
    node.right_child = createDecisionTree(upper_part, remaining_levels)
    return node

#dat = get_data("test.txt")
#tr = createDecisionTree(dat, 2)
#tr.printTree()

# Test
#dat = get_data("test.txt")
#print(str(identify_best_split(dat)))

In [13]:
def calcAccuracy(tree_root, data):
    """Return the fraction of data points whose label the tree predicts correctly.

    Args:
        tree_root: root node of the tree, passed to make_prediction.
        data: iterable of data points exposing .label.

    Returns:
        A float between 0 and 1. An empty data set yields 0.0 instead of
        raising ZeroDivisionError.
    """
    total = 0
    correct = 0
    for d in data:
        total += 1
        if make_prediction(tree_root, d) == d.label:
            correct += 1

    # Nothing to evaluate: report 0.0 rather than dividing by zero.
    if total == 0:
        return 0.0
    return correct / total

In [14]:
import time

def x_fold(data, fold):
    """Deal data into `fold` interleaved parts.

    Part k holds the elements at positions k, k + fold, k + 2 * fold, ...
    Returns a list containing the `fold` parts.
    """
    parts = []
    for offset in range(fold):
        parts.append(data[offset::fold])
    return parts

# Evaluation settings: the number of cross-validation folds and the max_levels
# value handed to createDecisionTree. Named once here instead of being
# repeated as literals throughout the loop.
NUM_FOLDS = 5
MAX_LEVELS = 10

average = 0
d = get_data("messidor_features.txt")
fold = x_fold(d, NUM_FOLDS)

for i in range(NUM_FOLDS):
    # Fold i is held out for testing; all other folds form the training set.
    test_set = fold[i]
    train_set = []
    for j in range(NUM_FOLDS):
        if j == i:
            continue
        train_set += fold[j]

    print('Training set size:', len(train_set))
    print('Test set size    :', len(test_set))

    # Build the decision tree and report how long it took.
    start = time.time()
    tree = createDecisionTree(train_set, MAX_LEVELS)
    end = time.time()
    print('Time taken:', end - start)

    # Measure the accuracy of the tree on the held-out fold.
    accuracy = calcAccuracy(tree, test_set)
    print('The accuracy on the test set is ', str(accuracy * 100.0))
    average += accuracy

print("\nThe average accuracy is " + str(average / NUM_FOLDS * 100))
                                           


Training set size: 920
Test set size    : 231
Time taken: 13.38956880569458
The accuracy on the test set is  68.3982683982684
Training set size: 921
Test set size    : 230
Time taken: 18.114651918411255
The accuracy on the test set is  65.21739130434783
Training set size: 921
Test set size    : 230
Time taken: 14.83719515800476
The accuracy on the test set is  65.65217391304347
Training set size: 921
Test set size    : 230
Time taken: 16.201844930648804
The accuracy on the test set is  59.56521739130435
Training set size: 921
Test set size    : 230
Time taken: 14.071578979492188
The accuracy on the test set is  58.69565217391305

The average accuracy is 63.50574063617541
