In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from ML_Data import ML_Data

# 1a. Classification tree Outputs


In [2]:
headers = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']
target = 'class'
careval = ML_Data('data/car+evaluation/car.data', headers)
careval.data = careval.replace_categories(dict({'low':0, 'med': 1, 'high':2, 'vhigh':3, 'small': 0, 'big': 2, 'unacc': 0, 'acc': 1, 'good': 2, 'vgood': 3}))
careval.data = careval.replace_categories(dict({'2':0, '3':1, '4':2, '5more': 3}), columns=['doors'])
train1s, train2s, kx2tests = careval.crossvalid_kx2(5)

ce_train_features = train1s[0].drop([target], axis = 1)
ce_train_labels = train1s[0][target]
ce_test_features = train2s[0].drop([target], axis = 1)
ce_test_labels = train2s[0][target]
ce_prune_features = kx2tests[0].drop([target], axis = 1)
ce_prune_labels = kx2tests[0][target]

print(f'{ce_train_labels.shape[0]} rows in training set')
print(f'{ce_test_labels.shape[0]} rows in testing set')
print(f'{ce_prune_labels.shape[0]} rows in pruning set')


691 rows in training set
691 rows in testing set
346 rows in pruning set


In [3]:
ce_tree = careval.Generate_Tree(ce_train_features, ce_train_features[[]], ce_train_labels, 0.01)
full_acc, preds = careval.evaluate_tree(ce_tree, ce_test_features, ce_test_labels, classify=True)
print(f'Full tree accuracy: {full_acc}')
print(f'Full tree parent nodes: {len(careval.vertices(ce_tree, location=[], node_list=[]))}')

pruned_ce_tree = careval.iter_prune(ce_tree, ce_prune_features, ce_prune_labels, classify=True)
pruned_acc, preds = careval.evaluate_tree(pruned_ce_tree, ce_test_features, ce_test_labels, classify=True)
print(f'Pruned tree accuracy: {pruned_acc}')
print(f'Pruned tree parent nodes: {len(careval.vertices(pruned_ce_tree, location=[], node_list=[]))}')


Full tree accuracy: 0.8986975397973951
Full tree parent nodes: 59
Pruned tree accuracy: 0.8769898697539797
Pruned tree parent nodes: 40


# 1b. Regression tree outputs

In [4]:
headers = ['Vendor Name', 'Model Name', 'MYCT', 'MMIN', 'MMAX', 'CACH', 'CHMIN', 'CHMAX', 'PRP', 'ERP']
target = 'PRP'
computer = ML_Data('data/computer+hardware/machine.data', headers)
display(computer.data.head())
train1s, train2s, kx2tests = computer.crossvalid_kx2(5)

com_train_features = train1s[0].drop(['Vendor Name', 'Model Name', 'ERP', target], axis = 1)
com_train_labels = train1s[0][target]
com_test_features = train2s[0].drop(['Vendor Name', 'Model Name', 'ERP', target], axis = 1)
com_test_labels = train2s[0][target]
com_prune_features = kx2tests[0].drop(['Vendor Name', 'Model Name', 'ERP', target], axis = 1)
com_prune_labels = kx2tests[0][target]

print(f'{com_train_labels.shape[0]} rows in training set')
print(f'{com_test_labels.shape[0]} rows in testing set')
print(f'{com_prune_labels.shape[0]} rows in pruning set')

Unnamed: 0,Vendor Name,Model Name,MYCT,MMIN,MMAX,CACH,CHMIN,CHMAX,PRP,ERP
0,adviser,32/60,125,256,6000,256,16,128,198,199
1,amdahl,470v/7,29,8000,32000,32,8,32,269,253
2,amdahl,470v/7a,29,8000,32000,32,8,32,220,253
3,amdahl,470v/7b,29,8000,32000,32,8,32,172,253
4,amdahl,470v/7c,29,8000,16000,32,8,16,132,132


83 rows in training set
84 rows in testing set
42 rows in pruning set


In [5]:
computer_tree = computer.Generate_Tree(com_train_features[[]], com_train_features, com_train_labels, theta = 0, classify=False)
full_mse, preds = computer.evaluate_tree(computer_tree, com_test_features, com_test_labels, classify=False)
print(f'Full tree mse: {full_mse}')
print(f'Full tree parent nodes: {len(computer.vertices(computer_tree, location=[], node_list=[]))}')

pruned_com_tree = computer.iter_prune(computer_tree, com_prune_features, com_prune_labels, classify=False)
pruned_mse, preds = computer.evaluate_tree(pruned_com_tree, com_test_features, com_test_labels, classify=False)
print(f'Pruned tree mse: {pruned_mse}')
print(f'Pruned tree parent nodes: {len(computer.vertices(pruned_com_tree, location=[], node_list=[]))}')

Full tree mse: 81.6226881909266
Full tree parent nodes: 77
Pruned tree mse: 77.37169316021685
Pruned tree parent nodes: 25


# 2a. Sample classification tree without pruning


In [6]:
parent_nodes = careval.vertices(ce_tree, location=[], node_list=[])
display(parent_nodes)

[[],
 [0],
 [0, 1],
 [0, 1, 0],
 [0, 1, 0, 0],
 [0, 1, 0, 1],
 [0, 1, 0, 1, 2],
 [0, 1, 0, 2],
 [0, 1, 0, 2, 2],
 [0, 1, 0, 3],
 [0, 1, 1],
 [0, 1, 1, 0],
 [0, 1, 1, 2],
 [0, 1, 2],
 [0, 1, 2, 0],
 [0, 1, 2, 1],
 [0, 1, 2, 2],
 [0, 1, 2, 3],
 [0, 2],
 [0, 2, 0],
 [0, 2, 0, 1],
 [0, 2, 0, 2],
 [0, 2, 0, 2, 0],
 [0, 2, 1],
 [0, 2, 1, 1],
 [0, 2, 1, 2],
 [0, 2, 2],
 [0, 2, 2, 0],
 [0, 2, 2, 1],
 [0, 2, 3],
 [0, 2, 3, 1],
 [0, 2, 3, 2],
 [0, 2, 3, 3],
 [1],
 [1, 0],
 [1, 0, 0],
 [1, 0, 0, 0],
 [1, 0, 0, 2],
 [1, 0, 1],
 [1, 0, 1, 0],
 [1, 0, 1, 1],
 [1, 0, 1, 3],
 [1, 0, 2],
 [1, 0, 2, 0],
 [1, 0, 3],
 [1, 0, 3, 2],
 [1, 0, 3, 2, 1],
 [1, 1],
 [1, 1, 0],
 [1, 1, 1],
 [1, 1, 2],
 [1, 1, 2, 1],
 [1, 1, 2, 1, 0],
 [1, 1, 2, 3],
 [1, 1, 3],
 [1, 1, 3, 0],
 [1, 1, 3, 0, 1],
 [1, 1, 3, 2],
 [1, 1, 3, 3]]

In [7]:
print(len(parent_nodes))

59


# 2b. Sample classification tree with pruning

In [8]:
parent_nodes = careval.vertices(pruned_ce_tree, location=[], node_list=[])
display(parent_nodes)

[[],
 [0],
 [0, 1],
 [0, 1, 1],
 [0, 1, 1, 0],
 [0, 1, 1, 2],
 [0, 1, 2],
 [0, 1, 2, 0],
 [0, 1, 2, 1],
 [0, 1, 2, 2],
 [0, 1, 2, 3],
 [0, 2],
 [0, 2, 0],
 [0, 2, 0, 1],
 [0, 2, 0, 2],
 [0, 2, 0, 2, 0],
 [0, 2, 1],
 [0, 2, 1, 1],
 [0, 2, 1, 2],
 [0, 2, 2],
 [0, 2, 2, 0],
 [0, 2, 2, 1],
 [0, 2, 3],
 [0, 2, 3, 1],
 [0, 2, 3, 2],
 [0, 2, 3, 3],
 [1],
 [1, 0],
 [1, 0, 1],
 [1, 0, 1, 3],
 [1, 0, 3],
 [1, 0, 3, 2],
 [1, 0, 3, 2, 1],
 [1, 1],
 [1, 1, 0],
 [1, 1, 1],
 [1, 1, 2],
 [1, 1, 2, 1],
 [1, 1, 2, 1, 0],
 [1, 1, 2, 3]]

In [9]:
print(len(parent_nodes))

40


# 3a. Sample regression tree without pruning


In [10]:
parent_nodes = computer.vertices(computer_tree, location=[], node_list=[])
display(parent_nodes)

[[],
 [0],
 [0, 0],
 [0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 1],
 [0, 0, 0, 0, 0, 0, 1, 1],
 [0, 0, 0, 0, 0, 0, 1, 1, 1],
 [0, 0, 0, 0, 0, 1],
 [0, 0, 0, 0, 0, 1, 0],
 [0, 0, 0, 0, 0, 1, 1],
 [0, 0, 0, 0, 0, 1, 1, 0],
 [0, 0, 0, 0, 0, 1, 1, 0, 1],
 [0, 0, 0, 0, 0, 1, 1, 0, 1, 1],
 [0, 0, 0, 0, 0, 1, 1, 1],
 [0, 0, 0, 0, 1],
 [0, 0, 0, 0, 1, 0],
 [0, 0, 0, 0, 1, 0, 0],
 [0, 0, 0, 0, 1, 0, 0, 0],
 [0, 0, 0, 0, 1, 0, 0, 0, 1],
 [0, 0, 0, 0, 1, 0, 0, 0, 1, 0],
 [0, 0, 0, 0, 1, 0, 0, 0, 1, 1],
 [0, 0, 0, 0, 1, 0, 0, 1],
 [0, 0, 0, 0, 1, 0, 1],
 [0, 0, 0, 0, 1, 0, 1, 0],
 [0, 0, 0, 0, 1, 0, 1, 0, 1],
 [0, 0, 0, 0, 1, 0, 1, 1],
 [0, 0, 0, 0, 1, 0, 1, 1, 0],
 [0, 0, 0, 0, 1, 0, 1, 1, 0, 0],
 [0, 0, 0, 0, 1, 0, 1, 1, 0, 1],
 [0, 0, 0, 0, 1, 0, 1, 1, 1],
 [0, 0, 0, 0, 1, 1],
 [0, 0, 0, 0, 1, 1, 0],
 [0, 0, 0, 0, 1, 1, 0, 0],
 [0, 0, 0, 0, 1, 1, 0, 0, 1],
 [0, 0, 0, 0, 1, 1, 0, 1],
 [0, 0, 0, 0, 1, 1, 0, 1, 1],
 [0, 0, 0, 0, 1, 1, 1],
 [0, 0, 0, 1],
 [

In [11]:
print(len(parent_nodes))

77


# 3b. Sample regression tree pruning

In [12]:
parent_nodes = computer.vertices(pruned_com_tree, location=[], node_list=[])
display(parent_nodes)

[[],
 [0],
 [0, 0],
 [0, 0, 0],
 [0, 0, 0, 0],
 [0, 0, 0, 0, 1],
 [0, 0, 0, 1],
 [0, 0, 0, 1, 1],
 [0, 0, 1],
 [0, 0, 1, 0],
 [0, 0, 1, 0, 1],
 [0, 0, 1, 0, 1, 0],
 [0, 0, 1, 0, 1, 0, 0],
 [0, 0, 1, 0, 1, 0, 0, 0],
 [0, 0, 1, 0, 1, 0, 0, 1],
 [0, 0, 1, 0, 1, 0, 1],
 [0, 0, 1, 0, 1, 0, 1, 1],
 [0, 0, 1, 1],
 [0, 0, 1, 1, 0],
 [0, 1],
 [0, 1, 1],
 [0, 1, 1, 1],
 [0, 1, 1, 1, 0],
 [0, 1, 1, 1, 1],
 [1]]

In [13]:
print(len(parent_nodes))

25


# 4a. Calculation of information gain, gain ratio,

In [14]:
import math

labels = pd.Series([0, 0, 0, 0, 1, 1, 1, 1])
highly_pure_splits = [pd.Series([0, 0, 0, 0]), pd.Series([1, 1, 1, 1])]

nobs = labels.shape[0]
E_pi = 0
IV = 0
for label_split in highly_pure_splits:
    p_level = len(label_split)/nobs #sample probability that feature = level
    E_pi = E_pi + p_level * careval.entropy(label_split)
    IV = IV - p_level * math.log(p_level)
split_metric = (careval.entropy(labels) - E_pi) / IV # gain ratio

print(f'H(D_pi): {careval.entropy(labels)}')
print(f'E_pi(f_i): {E_pi}')
print(f'IV(f_i): {IV}')
print(f'gain ratio: {split_metric}')


H(D_pi): 0.6931471805599453
E_pi(f_i): 0.0
IV(f_i): 0.6931471805599453
gain ratio: 1.0


In [15]:
labels = pd.Series([0, 0, 0, 0, 1, 1, 1, 1])

highly_pure_splits = [pd.Series([0, 0, 0, 0]), pd.Series([1, 1, 1, 1])]
print(careval.split_feature(labels, highly_pure_splits, classify=True))

less_pure_splits = [pd.Series([0, 0, 0, 1]), pd.Series([1, 1, 1, 0])]
print(careval.split_feature(labels, less_pure_splits, classify=True))

not_pure_splits = [pd.Series([0, 1, 0, 1]), pd.Series([0, 1, 1, 0])]
print(careval.split_feature(labels, not_pure_splits, classify=True))


1.0
0.1887218755408672
0.0


# 4b. Calculation of mse

In [16]:
labels = pd.Series([0, 1, 2, 3, 4, 5, 6, 7])
good_splits = [pd.Series([0, 1, 2, 3]), pd.Series([4, 5, 6, 7])]


split_metric = 0
for label_split in good_splits:
    mu = label_split.mean()
    sigma_2 = 1/nobs*(label_split - mu)**2
    split_metric = split_metric + sigma_2.sum() 

print(f'mse: {split_metric}')


mse: 1.25


In [17]:
labels = pd.Series([0, 1, 2, 3, 4, 5, 6, 7])
bad_splits = [pd.Series([0, 1, 4, 6]), pd.Series([2, 3, 5, 7])]

split_metric = 0
for label_split in bad_splits:
    mu = label_split.mean()
    sigma_2 = 1/nobs*(label_split - mu)**2
    split_metric = split_metric + sigma_2.sum() 

print(f'mse: {split_metric}')


mse: 4.6875


# 5. Decision being made to prune a subtree

In [18]:
import copy

def prune(container, tree, p_features, p_labels, classify):        
    base_metric, preds = container.evaluate_tree(tree, p_features, p_labels, classify)
    parent_nodes = container.vertices(tree, location=[], node_list=[])
    if classify:
        sign = 1
    else:
        sign = -1
    code_list = parent_nodes[1:]
    for parent_node in code_list:
        dummy_tree = copy.deepcopy(tree)
        pruned_tree = container.replace_node(dummy_tree, parent_node, classify)
        pruned_metric, preds = container.evaluate_tree(pruned_tree, p_features, p_labels, classify)
        print(f'base mse: {base_metric}, pruned tree mse: {pruned_metric}')
        if (pruned_metric * sign) > (base_metric * sign):
            return pruned_tree, parent_node
    return 'no change', []

In [19]:
prune(computer, computer_tree, com_prune_features, com_prune_labels, classify=False)

base mse: 122.9176499826151, pruned tree mse: 190.24652952884017
base mse: 122.9176499826151, pruned tree mse: 137.83343169993228
base mse: 122.9176499826151, pruned tree mse: 124.66143730072592
base mse: 122.9176499826151, pruned tree mse: 123.09284436283586
base mse: 122.9176499826151, pruned tree mse: 122.6878410813653


(<DecisionTree.DecisionTree at 0x18c65c12e40>, [0, 0, 0, 0, 0])

# 6. Traversing a classification tree and a class label being assigned

In [28]:
def predict(tree, test_sample):
    '''
    Recursive traversal of Decision Tree 
    '''
    # Base case (leaf node): return prediction, error/probability
    if len(tree.children) == 0:
        return tree.feature_name, tree.keys
    # General case: choose child node, based on feature and criteria
    else:
        test_value = test_sample[tree.feature_name]
        if len(tree.keys) == 1: # numerical feature
            if test_value < tree.keys[0]:
                out = tree.children[0].predict(test_sample)
            else:
                out = tree.children[1].predict(test_sample)
        else: # categorical feature
            if test_value in tree.keys:
                test_index = tree.keys.index(test_value)
            else:
                test_index = 0
            out = tree.children[test_index].predict(test_sample)
        return out

In [26]:
sample = ce_test_features.iloc[0]
display(sample)
display(ce_test_labels.iloc[0])

buying         1
maint          1
doors          0
persons     more
lug_boot       1
safety         2
Name: 1103, dtype: object

1

In [29]:
pred, err = predict(pruned_ce_tree, sample)
print(pred)

(3, 1.0)

# 7. Traversing a regression tree and a class label being assigned

In [23]:
sample = com_test_features.iloc[0]
display(sample)
display(com_test_labels.iloc[0])

MYCT        29
MMIN      8000
MMAX     16000
CACH        32
CHMIN        8
CHMAX       16
Name: 4, dtype: int64

132

In [24]:
pred, err = predict(pruned_com_tree, sample)
print(pred)

MMIN
feature: MMIN
observed value: 8000


(77.27777777777777, 665.2006172839507)