In [None]:
''' 
Program: hw2.ipynb
Author: David Gray - Aileen Bui
Description: Homework 2
Build ID3 decision tree algorithm for classification of mushroom samples.
'''

import pandas as pd
import numpy as np
import math

In [None]:
def getEntropy(D):
    """Return the base-2 Shannon entropy of the ``class`` column of ``D``.

    Parameters
    ----------
    D : pandas.DataFrame
        Samples; must contain a ``class`` column holding the labels.

    Returns
    -------
    The sum of ``-p * log2(p)`` over the relative frequency ``p`` of every
    label reported by ``value_counts``. The result is the integer ``0`` when
    there are no labels to count.
    """
    labels = D['class']
    # Relative frequency of each label among the non-null entries.
    proportions = (labels.value_counts() / labels.count()).tolist()
    return sum((-p) * math.log(p, 2) for p in proportions)

In [None]:
def getInfoGain(D, attr, wholeEntropy):
    """Return the information gain obtained by splitting ``D`` on ``attr``.

    Parameters
    ----------
    D : pandas.DataFrame
        Samples, including the ``class`` column read by ``getEntropy``.
    attr : str
        Name of the column to split on.
    wholeEntropy : number
        Entropy of ``D`` before the split (computed by the caller).

    Returns
    -------
    ``wholeEntropy`` minus the weighted entropy of the partitions, where each
    partition's weight is its share of the non-null entries of ``attr``.
    """
    column = D[attr]
    known_total = column.count()
    weighted_entropy = 0
    for distinct_value in column.unique():
        # Build the row selector once and use it for both the weight and
        # the partition handed to getEntropy.
        selector = column == distinct_value
        partition_size = column[selector].count()
        weighted_entropy += (partition_size / known_total) * (getEntropy(D[selector]))
    return wholeEntropy - weighted_entropy

In [None]:
def chooseBestAttribute(D):
    """Return the name of the attribute of ``D`` with the highest information gain.

    Every column except ``class`` is a candidate. When several attributes
    share the maximum gain, the one that appears first in ``D.columns`` wins.

    Parameters
    ----------
    D : pandas.DataFrame
        Samples with a ``class`` column and at least one attribute column.

    Returns
    -------
    The column name of the selected attribute.
    """
    baseline_entropy = getEntropy(D)
    candidates = D.columns.tolist()
    candidates.remove('class')
    # One gain per candidate, indexed by attribute name in column order.
    gains = pd.Series(
        [getInfoGain(D, name, baseline_entropy) for name in candidates],
        index=candidates,
        dtype=float,
    )
    top_scorers = gains[gains == gains.max()]
    return top_scorers.index.tolist()[0]

In [None]:
def generateDT(D):
    """Build an ID3 decision tree from the samples in ``D``.

    Parameters
    ----------
    D : pandas.DataFrame
        Training samples: one column per attribute plus a ``class`` column
        holding the label.

    Returns
    -------
    Either a bare class label (a leaf) or a tuple ``(attribute, branches)``.
    ``branches`` maps ``str(value)`` of every value the attribute takes in
    ``D`` to a subtree, plus the key ``'None'`` which holds the subtree of the
    attribute's most frequent value; it serves as the fallback for missing or
    unseen values at classification time.
    """
    classes = D['class'].unique().tolist()
    if len(classes) == 1:  # Only a single class is left: this is a leaf
        return classes[0]

    attributes = [column for column in D.columns if column != 'class']
    if not attributes:
        # No attribute left to split on: fall back to the majority class.
        # mode() returns a Series, so take its first entry as the label.
        return D['class'].mode().tolist()[0]

    node = chooseBestAttribute(D)
    # Iterate over the raw values (not a numpy array extended with 'None',
    # which would turn non-string values into strings and make the equality
    # filter below select nothing). Nulls are skipped because `== NaN` never
    # matches any row.
    node_values = D[node].dropna().unique()
    if len(node_values) == 0:
        # The selected attribute carries no usable value at all.
        return D['class'].mode().tolist()[0]
    node_majority = D[node].mode().tolist()[0]

    branches = {}
    for value in node_values:
        # Drop the attribute that was just used so every recursion level has
        # one attribute fewer; this guarantees termination even when samples
        # with identical attributes carry different classes.
        subset = D[D[node] == value].drop(columns=[node])
        branches[str(value)] = generateDT(subset)

    # If the value is missing or not within the values existing in the training data,
    # we will treat it the same with the majority value of this node.
    # The majority subtree is already built above, so reuse it instead of
    # generating it a second time.
    branches['None'] = branches[str(node_majority)]
    return (node, branches)

In [None]:
def classify(dt, test_data):
    """Predict the class of one sample with a tree built by ``generateDT``.

    Parameters
    ----------
    dt : tuple or label
        Either ``(attribute, branches)`` or a bare class label (a tree that
        consists of a single leaf).
    test_data : mapping
        Attribute name -> value for the sample; anything supporting
        ``test_data[name]`` and raising ``KeyError`` for unknown names
        (e.g. a dict or a pandas Series).

    Returns
    -------
    The class label stored in the leaf that the sample reaches.

    Raises
    ------
    KeyError
        If the sample's value matches no branch and the node has no
        ``'None'`` fallback branch.
    """
    # A tree may be just a leaf, e.g. when all training samples share a class.
    if not isinstance(dt, tuple):
        return dt

    root, branches = dt[0], dt[1]
    try:
        value = test_data[root]
    except KeyError:
        # Attribute absent from the sample: handle it like a missing value.
        value = None

    try:
        subtree = branches[value]
    except (KeyError, TypeError):  # TypeError: value is unhashable
        # generateDT stores branches under str(value); try that spelling
        # before falling back to the majority ('None') branch.
        key = str(value)
        subtree = branches[key] if key in branches else branches['None']

    return classify(subtree, test_data)

In [None]:
# Training data provided in Homework requirement
training_data = [
({'level':'Senior', 'lang':'Java', 'tweets':'no', 'phd':'no'}, False),
({'level':'Senior', 'lang':'Java', 'tweets':'no', 'phd':'yes'}, False),
({'level':'Mid', 'lang':'Python', 'tweets':'no', 'phd':'no'}, True),
({'level':'Junior', 'lang':'Python', 'tweets':'no', 'phd':'no'}, True),
({'level':'Junior', 'lang':'R', 'tweets':'yes', 'phd':'no'}, True),
({'level':'Junior', 'lang':'R', 'tweets':'yes', 'phd':'yes'}, False),
({'level':'Mid', 'lang':'R', 'tweets':'yes', 'phd':'yes'}, True),
({'level':'Senior', 'lang':'Python', 'tweets':'no', 'phd':'no'}, False),
({'level':'Senior', 'lang':'R', 'tweets':'yes', 'phd':'no'}, True),
({'level':'Junior', 'lang':'Python', 'tweets':'yes', 'phd':'no'}, True),
({'level':'Senior', 'lang':'Python', 'tweets':'yes', 'phd':'yes'}, True),
({'level':'Mid', 'lang':'Python', 'tweets':'no', 'phd':'yes'}, True),
({'level':'Mid', 'lang':'Java', 'tweets':'yes', 'phd':'no'}, True),
({'level':'Junior', 'lang':'Python', 'tweets':'no', 'phd':'yes'}, False)
]

# Convert the training data into a dataframe.
# Each record is a new dict (attributes plus the label under 'class'), so the
# dicts stored in training_data are not modified. The frame is built in a
# single call: DataFrame.append was removed in pandas 2.0, and growing a
# frame one row at a time copies it on every iteration.
training_records = [{**attributes, 'class': label} for attributes, label in training_data]
train_df = pd.DataFrame(training_records, columns=['level', 'lang', 'tweets', 'phd', 'class'])

# Call the function to generate the decision tree on training data
decision_tree = generateDT(train_df)
print('Decision tree:\n', decision_tree)

In [None]:
# Hand-written test samples as (attributes, expected class) pairs.
# The second sample carries a 'phd' value that never occurs in the training data.
test_data = [
    ({'level':'Senior', 'lang':'Java', 'tweets':'no', 'phd':'no'}, False),
    ({'level':'Senior', 'lang':'Java', 'tweets':'no', 'phd':'missing values'}, False),
    ({'level':'Mid', 'lang':'Java', 'tweets':'yes', 'phd':'no'}, True),
]

# Classify every sample with the decision tree and report expected vs. predicted
for i, (attributes, actual) in enumerate(test_data):
    predicted = classify(decision_tree, attributes)
    verdict = '\nCorrect' if actual == predicted else '\nIncorrect'
    print('Test ', i, ':', '\nActual: ', actual, '\nPredicted: ', predicted, '\n---', verdict, '\n------------')

In [None]:
# Load the mushroom dataset and train on every row except the last 1000
original_data = pd.read_csv('mushrooms.csv')
real_data = original_data.head(-1000)  # negative n: all rows but the last 1000

# Replace the previous tree with one generated from the real training rows
decision_tree = generateDT(real_data)
print('Decision tree:\n', decision_tree)
real_data

In [None]:
# Generate testing data for accuracy: the last 1000 rows of the dataset
accuracy_test_data = original_data.tail(1000)
sample_count = len(accuracy_test_data)
total_correct_count = 0

# Classify each test row and count how often the prediction equals its 'class' value.
# The whole row (including 'class') is handed to classify, which looks up
# values by the attribute names stored in the tree.
for position in range(sample_count):
    test_input = accuracy_test_data.iloc[position]
    if test_input.loc['class'] == classify(decision_tree, test_input):
        total_correct_count += 1
print('Correctly predicted: ', total_correct_count, ' out of ', sample_count, 'samples', '\nAccuracy: ', (total_correct_count/sample_count)*100, '\n------------')

In [None]:
# Hand-written mushroom samples as (attributes, expected class) pairs.
# Several attributes are None to exercise the handling of missing values.
mushroom_test_data = [
    ({'cap-shape':'x', 'cap-surface':'y', 'cap-color':'n', 'bruises':'f', 'odor':'n', 'gill-attachment':'f', 'gill-spacing':'w', 'gill-size':'n', 'gill-color':'n', 'stalk-shape':None, 'stalk-root':None, 'stalk-surface-above-ring':None, 'stalk-surface-below-ring':None, 'stalk-color-above-ring':'w', 'stalk-color-below-ring':'w', 'veil-type':None, 'veil-color':None, 'ring-number':'o', 'ring-type':'l', 'spore-print-color':'n', 'population':'n', 'habitat':'g'},'e'),
    ({'cap-shape':'f', 'cap-surface':'s', 'cap-color':'w', 'bruises':'f', 'odor':'f', 'gill-attachment':'f', 'gill-spacing':'w', 'gill-size':'n', 'gill-color':'w', 'stalk-shape':'t', 'stalk-root':'b', 'stalk-surface-above-ring':'f', 'stalk-surface-below-ring':'f', 'stalk-color-above-ring':'w', 'stalk-color-below-ring':'w', 'veil-type':'p', 'veil-color':'w', 'ring-number':'o', 'ring-type':None, 'spore-print-color':'w', 'population':'s', 'habitat':'w'},'p')]

# Classify every sample with the decision tree and report expected vs. predicted
for i, (attributes, actual) in enumerate(mushroom_test_data):
    predicted = classify(decision_tree, attributes)
    verdict = '\nCorrect' if actual == predicted else '\nIncorrect'
    print('Test ', i, ':', '\nActual: ', actual, '\nPredicted: ', predicted, '\n---', verdict, '\n------------')