In [1]:
#For the implementation of the decision tree I found some Python code online to help me put it together, which the professor
#said was allowed. I went through and annotated all the code to show that I understood it. I also coded my own entropy score
#function in order to meet the homework requirements.


import math

# Splits a dataset based on an attribute and an attribute value
def test_split(index, value, dataset):
    #Initializes left and right lists for the split
    left, right = list(), list()
    #Separates dataset based on given value
    for row in dataset:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right

# Calculate the Gini index for a split dataset
def entropy_score(groups, classes):
    # count all samples at split point
    n_instances = float(sum([len(group) for group in groups]))
    
    #Initialize entropy score
    entropy = 0.0
    #This section calculates the entropy score for each group in the split. Then sums the scores for a total
    #entropy score of the chosen split
    for group in groups:
        size = float(len(group)) #Find the size of the current group
        if size == 0:
            continue
        score = 0.0
        #Calculate entropy score for each group based on class values and then add for total entropy score
        for class_val in classes:
            #code selects row[-1] because it assumes that the response variable of a row is held in the last column
            #Calculate proportion of how many rows belong to each class
            p = [row[-1] for row in group].count(class_val) / size 
            #Avoid dividing by 0 in case of perfect split
            if(p == 0):
                continue
            #Calculate entropy score for group 
            score += p*math.log(1/p) 
        entropy += score
    
    return entropy

# Select the best split point for a dataset
def get_split(dataset):
    #Inspect the final column of the dataset(assumed to be Y response) and get the set of class values
    class_values = list(set(row[-1] for row in dataset))
    #Initialize some values
    b_index, b_value, b_score, b_groups = 999, 999, 999, None
    #Go through each row of the dataset and create a test split. Evaluate the entropy score of that split
    #Keep the entropy score of the best split and return its associated index, value and the groups it made
    for index in range(len(dataset[0])-1):
        for row in dataset:
            groups = test_split(index, row[index], dataset)
            entropy = entropy_score(groups, class_values)
            if entropy < b_score:
                b_index, b_value, b_score, b_groups = index, row[index], entropy, groups
    return {'index':b_index, 'value':b_value, 'groups':b_groups}

# Create a terminal node value
def to_terminal(group):
    #Creates a terminal node and returns the class value with the most outcomes in the node
    outcomes = [row[-1] for row in group]
    return max(set(outcomes), key=outcomes.count)
 
# Create child splits for a node or make terminal
def split(node, max_depth, min_size, depth):
    left, right = node['groups']
    del(node['groups'])
    # check for a no split
    if not left or not right:
        node['left'] = node['right'] = to_terminal(left + right)
        return
    # check for max depth
    if depth >= max_depth:
        node['left'], node['right'] = to_terminal(left), to_terminal(right)
        return
    # process left child
    if len(left) <= min_size:
        node['left'] = to_terminal(left)
    else:
        node['left'] = get_split(left)
        split(node['left'], max_depth, min_size, depth+1)
    # process right child
    if len(right) <= min_size:
        node['right'] = to_terminal(right)
    else:
        node['right'] = get_split(right)
        split(node['right'], max_depth, min_size, depth+1)
 

#Recursively builds the decision tree. First creates a root, then contintues to split on the root until max_depth or min_size
#is reached
def build_tree(train, max_depth, min_size):
    root = get_split(train)
    split(root, max_depth, min_size, 1)
    return root
 
# Print a decision tree
def print_tree(node, depth=0):
    if isinstance(node, dict):
        print('%s[X%d < %.3f]' % ((depth*' ', (node['index']+1), node['value'])))
        print_tree(node['left'], depth+1)
        print_tree(node['right'], depth+1)
    else:
        print('%s[%s]' % ((depth*' ', node)))
        
# Make a prediction with a decision tree
#Takes a row from the dataset we're predicting for and follows the tree path according to the splitting rules
#Returns the predicted value for the node
def predict(node, row):
    if row[node['index']] < node['value']:
        if isinstance(node['left'], dict):
            return predict(node['left'], row)
        else:
            return node['left']
    else:
        if isinstance(node['right'], dict):
            return predict(node['right'], row)
        else:
            return node['right']
        


In [2]:
#Question 1 Part 2 - Use Decision Tree on Breast dataset

import pandas
import sklearn
import math
import numpy
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

import pandas
import sklearn
import math
import numpy
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

breast_df = pandas.read_csv("C:\\Users\\George\\Documents\\Rutgers\\Statistical Learning\\Homework 1\\breast_final.csv")
#Remove last column which has missing values and remove ID Column
breast_df = breast_df[breast_df.columns[1:]]
#Remove NA from Column X6
breast_df = breast_df.dropna(axis = 0)
#Convert Y column into binary
breast_df.Y.replace([2, 4], [0,1], inplace=True) 

#Convert breast dataframe into list of lists so it can work in Decision Tree code
temp = breast_df.reset_index().values.tolist()
breast = []
for row in temp:
    breast.append(row[1:])

breast_tree = build_tree(breast,8,15)
print("Decision Tree")
print_tree(breast_tree)

#Get predictions with tree
predictions = list()
for row in breast:
    prediction = predict(breast_tree, row)
    predictions.append(prediction)

#Get true response values
responses = list()
for row in breast:
    responses.append(row[-1])

#Get decision tree accuracy rate
correct = 0
for i in range(len(responses)):
    if(responses[i] == predictions[i]):
        correct += 1

print("Decision Tree Accuracy rate:", correct/len(responses))

#Run a Boosting method on the dataset
boost = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                         algorithm="SAMME",
                         n_estimators=200)

X = breast_df[breast_df.columns[0:9]]
Y = breast_df[breast_df.columns[-1]].values.tolist()
boost.fit(X,Y)
preds = boost.predict(X)

#Get Boosting Accuracy Rate
correct = 0
for i in range(len(preds)):
    if(preds[i] == Y[i]):
        correct += 1
        
print("Boosting Tree Accuracy Rate:", correct/len(preds))

Decision Tree
[X2 < 5.000]
 [X1 < 9.000]
  [X8 < 9.000]
   [X3 < 7.000]
    [X7 < 8.000]
     [X1 < 1.000]
      [0.0]
      [0.0]
     [1.0]
    [1.0]
   [1.0]
  [X1 < 10.000]
   [1.0]
   [X1 < 10.000]
    [1.0]
    [1.0]
 [X1 < 1.000]
  [1.0]
  [1.0]
Decision Tree Accuracy rate: 0.9487554904831625
Boosting Tree Accuracy Rate: 0.9809663250366032
