In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from ML_Data import ML_Data

# Breast Cancer 
Classification Numeric -> Categorical

In [2]:
headers = ['Sample code number', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape', 'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin ', 'Normal Nucleoli', 'Mitoses', 'Class']
target = 'Class'
breast_cancer = ML_Data('data/breast+cancer+wisconsin+original/breast-cancer-wisconsin.data', headers)
breast_cancer.data = breast_cancer.replace_categories(dict({2:0, 4:1}))
breast_cancer.data = breast_cancer.replace_missing('?', int_val = True)
display(breast_cancer.data.head())
train1s, train2s, kx2tests = breast_cancer.crossvalid_kx2(5)
breast_cancer.data.shape

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,0,1,3,1,1,0
1,1002945,5,1,1,5,7,10,3,0,1,0
2,1015425,3,1,1,1,0,2,3,1,1,0
3,1016277,6,8,8,1,3,4,3,7,1,0
4,1017023,1,1,1,3,0,1,3,1,1,0


(699, 11)

## Null Model

In [3]:
train1s, train2s, tests = breast_cancer.kx2_classify(target, 5)

Classifier Score: 0.6214285714285716


In [5]:
full_acc = []
pruned_acc = []
full_tree_size = []
pruned_tree_size = []

for i in range(5):
    train_features = train1s[i].drop(['Sample code number', target], axis = 1)
    train_labels = train1s[i][target]
    test_features = train2s[i].drop(['Sample code number', target], axis = 1)
    test_labels = train2s[i][target]
    prune_features = kx2tests[i].drop(['Sample code number', target], axis = 1)
    prune_labels = kx2tests[i][target]
    
    bc_tree = breast_cancer.Generate_Tree(train_features[[]], train_features, train_labels, theta=0)
    acc, preds = breast_cancer.evaluate_tree(bc_tree, test_features, test_labels, classify=True)
    full_acc.append(acc)
    pruned_bc_tree = breast_cancer.iter_prune(bc_tree, prune_features, prune_labels, classify=True)
    acc, preds = breast_cancer.evaluate_tree(pruned_bc_tree, test_features, test_labels, classify=True)
    pruned_acc.append(acc)
    full_tree_size.append(len(breast_cancer.vertices(bc_tree, location=[], node_list=[])))
    pruned_tree_size.append(len(breast_cancer.vertices(pruned_bc_tree, location=[], node_list=[])))

results = pd.DataFrame({'Full tree accuracy': full_acc, 'Pruned tree accuracy': pruned_acc, 'Full tree parent nodes': full_tree_size, 'Pruned tree parent nodes': pruned_tree_size})
display(results)
print(results.mean())

Unnamed: 0,Full tree accuracy,Pruned tree accuracy,Full tree parent nodes,Pruned tree parent nodes
0,0.953571,0.953571,202,202
1,0.960714,0.960714,203,203
2,0.925,0.942857,189,101
3,0.953571,0.953571,195,195
4,0.928571,0.95,196,188


Full tree accuracy            0.944286
Pruned tree accuracy          0.952143
Full tree parent nodes      197.000000
Pruned tree parent nodes    177.800000
dtype: float64


In [6]:
print(bc_tree.feature_name)
print(bc_tree.keys)

Clump Thickness
[6.5]


# Car Evaluation
Classification categorical -> categorical

In [7]:
headers = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']
target = 'class'
careval = ML_Data('data/car+evaluation/car.data', headers)
careval.data = careval.replace_categories(dict({'low':0, 'med': 1, 'high':2, 'vhigh':3, 'small': 0, 'big': 2, 'unacc': 0, 'acc': 1, 'good': 2, 'vgood': 3}))
careval.data = careval.replace_categories(dict({'2':0, '3':1, '4':2, '5more': 3}), columns=['doors'])
careval.data = careval.replace_categories(dict({'2':0, '4':1, 'more': 2}), columns=['persons'])
display(careval.data.head())
train1s, train2s, kx2tests = careval.crossvalid_kx2(5)
careval.data.shape


Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,3,3,0,0,0,0,0
1,3,3,0,0,0,1,0
2,3,3,0,0,0,2,0
3,3,3,0,0,1,0,0
4,3,3,0,0,1,1,0


(1728, 7)

## Null Model

In [8]:
train1s, train2s, tests = careval.kx2_classify(target, 5)

Classifier Score: 0.723121387283237


In [10]:
full_acc = []
pruned_acc = []
full_tree_size = []
pruned_tree_size = []

for i in range(5):
    train_features = train1s[i].drop([target], axis = 1)
    train_labels = train1s[i][target]
    test_features = train2s[i].drop([target], axis = 1)
    test_labels = train2s[i][target]
    prune_features = kx2tests[i].drop([target], axis = 1)
    prune_labels = kx2tests[i][target]
    
    ce_tree = careval.Generate_Tree(train_features, train_features[[]], train_labels, theta=0)
    acc, preds = careval.evaluate_tree(ce_tree, test_features, test_labels, classify=True)
    full_acc.append(acc)
    pruned_ce_tree = careval.iter_prune(ce_tree, prune_features, prune_labels, classify=True)
    acc, preds = careval.evaluate_tree(pruned_ce_tree, test_features, test_labels, classify=True)
    pruned_acc.append(acc)
    full_tree_size.append(len(careval.vertices(ce_tree, location=[], node_list=[])))
    pruned_tree_size.append(len(careval.vertices(pruned_ce_tree, location=[], node_list=[])))

results = pd.DataFrame({'Full tree accuracy': full_acc, 'Pruned tree accuracy': pruned_acc, 'Full tree parent nodes': full_tree_size, 'Pruned tree parent nodes': pruned_tree_size})
display(results)
print(results.mean())

Unnamed: 0,Full tree accuracy,Pruned tree accuracy,Full tree parent nodes,Pruned tree parent nodes
0,0.894356,0.894356,417,409
1,0.885673,0.890014,408,406
2,0.89725,0.904486,401,391
3,0.908828,0.911722,401,397
4,0.913169,0.913169,405,398


Full tree accuracy            0.899855
Pruned tree accuracy          0.902750
Full tree parent nodes      406.400000
Pruned tree parent nodes    400.200000
dtype: float64


In [11]:
print(ce_tree.feature_name)
print(ce_tree.keys)

safety
[0, 2, 1]


# House Votes
Classification categorical -> categorical

In [12]:
headers = ['republican', 'handicapped_infants', 'water_project_cost_sharing', 'budget_resolution', 'physician_fee_freeze', 'el_salvador_aid', \
           'religious_groups_in_schools', 'anti_satellite_test_ban', 'aid_nicaragua_contras', 'mx_missile', 'immigration', 'synfuels_corporation_cutback', \
            'education_spending', 'superfund_right_to_sue', 'crime', 'duty_free_exports', 'export_admin_act_south_africa']
target = 'republican'
housevotes = ML_Data('data/congressional+voting+records/house-votes-84.data', headers)
housevotes.data = housevotes.replace_categories(dict({'n':0, 'y':1, 'democrat':0, 'republican':1}))
housevotes.data = housevotes.replace_missing('?', int_val = True)
display(housevotes.data.head())
train1s, train2s, kx2tests = housevotes.crossvalid_kx2(5)
housevotes.data.shape

Unnamed: 0,republican,handicapped_infants,water_project_cost_sharing,budget_resolution,physician_fee_freeze,el_salvador_aid,religious_groups_in_schools,anti_satellite_test_ban,aid_nicaragua_contras,mx_missile,immigration,synfuels_corporation_cutback,education_spending,superfund_right_to_sue,crime,duty_free_exports,export_admin_act_south_africa
0,1,0,1,0,1,1,1,0,0,0,1,0,1,1,1,0,1
1,1,0,1,0,1,1,1,0,0,0,0,0,1,1,1,0,1
2,0,0,1,1,0,1,1,0,0,0,0,1,0,1,1,0,0
3,0,0,1,1,0,1,1,0,0,0,0,1,0,1,0,0,1
4,0,1,1,1,0,1,1,0,0,0,0,1,0,1,1,1,1


(435, 17)

## Null Model

In [13]:
train1s, train2s, tests = housevotes.kx2_classify(target, 5)

Classifier Score: 0.5954022988505747


In [14]:
full_acc = []
pruned_acc = []
full_tree_size = []
pruned_tree_size = []

for i in range(5):
    train_features = train1s[i].drop([target], axis = 1)
    train_labels = train1s[i][target]
    test_features = train2s[i].drop([target], axis = 1)
    test_labels = train2s[i][target]
    prune_features = kx2tests[i].drop([target], axis = 1)
    prune_labels = kx2tests[i][target]
    
    housevotes_tree = housevotes.Generate_Tree(train_features, train_features[[]], train_labels, theta=0)
    acc, preds = housevotes.evaluate_tree(housevotes_tree, test_features, test_labels, classify=True)
    full_acc.append(acc)
    pruned_tree = housevotes.iter_prune(housevotes_tree, prune_features, prune_labels, classify=True)
    acc, preds = housevotes.evaluate_tree(pruned_tree, test_features, test_labels, classify=True)
    pruned_acc.append(acc)
    full_tree_size.append(len(housevotes.vertices(housevotes_tree, location=[], node_list=[])))
    pruned_tree_size.append(len(housevotes.vertices(pruned_tree, location=[], node_list=[])))

results = pd.DataFrame({'Full tree accuracy': full_acc, 'Pruned tree accuracy': pruned_acc, 'Full tree parent nodes': full_tree_size, 'Pruned tree parent nodes': pruned_tree_size})
display(results)
print(results.mean())

Unnamed: 0,Full tree accuracy,Pruned tree accuracy,Full tree parent nodes,Pruned tree parent nodes
0,0.954023,0.954023,138,138
1,0.936782,0.936782,118,118
2,0.896552,0.936782,130,49
3,0.965517,0.95977,132,53
4,0.948276,0.948276,132,132


Full tree accuracy            0.940230
Pruned tree accuracy          0.947126
Full tree parent nodes      130.000000
Pruned tree parent nodes     98.000000
dtype: float64


In [15]:
print(f'full tree parent nodes: {len(housevotes.vertices(housevotes_tree, location=[], node_list=[]))}')
print(f'pruned tree parent nodes: {len(housevotes.vertices(pruned_tree, location=[], node_list=[]))}')

full tree parent nodes: 132
pruned tree parent nodes: 132


In [16]:
print(housevotes_tree.feature_name)
print(housevotes_tree.keys)

physician_fee_freeze
[1, 0]


# Abalone
Regression Numeric+Categorical -> Numeric

In [8]:
headers = ['Sex', 'Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight', 'Viscera weight', 'Shell weight', 'Rings']
target = 'Rings'
abalone = ML_Data('data/abalone/abalone.data', headers)
display(abalone.data.head())
train1s, train2s, kx2tests = abalone.crossvalid_kx2(5)
abalone.data.shape

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


(4177, 9)

## Null Model

In [36]:
train1s, train2s, tests = abalone.kx2_regress(target, 5)

Baseline StDev: 3.2241690320681284
Mean Squared Error: 3.195195272969456


In [9]:
full_mse = []
pruned_mse = []
full_tree_size = []
pruned_tree_size = []

for i in range(5):
    print(i)
    train_features = train1s[i].drop([target], axis = 1)
    train_labels = train1s[i][target]
    test_features = train2s[i].drop([target], axis = 1)
    test_labels = train2s[i][target]
    prune_features = kx2tests[i].drop([target], axis = 1)
    prune_labels = kx2tests[i][target]

    abalone_tree = abalone.Generate_Tree(train_features[['Sex']], train_features.drop(['Sex'], axis = 1), train_labels, theta = 0, classify=False)
    mse, preds = abalone.evaluate_tree(abalone_tree, test_features, test_labels, classify=False)
    full_mse.append(mse)
    pruned_tree = abalone.iter_prune(abalone_tree, prune_features, prune_labels, classify=False)
    mse, preds = abalone.evaluate_tree(pruned_tree, test_features, test_labels, classify=False)
    pruned_mse.append(mse)
    full_tree_size.append(len(abalone.vertices(abalone_tree, location=[], node_list=[])))
    pruned_tree_size.append(len(abalone.vertices(pruned_tree, location=[], node_list=[])))

results = pd.DataFrame({'Full tree mse': full_mse, 'Pruned tree mse': pruned_mse, 'Full tree parent nodes': full_tree_size, 'Pruned tree parent nodes': pruned_tree_size})
display(results)
print(results.mean())

0
1
2
3
4


Unnamed: 0,Full tree mse,Pruned tree mse,Full tree parent nodes,Pruned tree parent nodes
0,3.071266,2.695843,1636,2
1,2.950621,2.794634,1622,2
2,2.987506,2.647792,1630,1
3,3.04583,2.696739,1651,1
4,3.107105,2.598281,1616,1


Full tree mse                  3.032466
Pruned tree mse                2.686658
Full tree parent nodes      1631.000000
Pruned tree parent nodes       1.400000
dtype: float64


In [10]:
print(f'full tree parent nodes: {len(abalone.vertices(abalone_tree, location=[], node_list=[]))}')
print(f'pruned tree parent nodes: {len(abalone.vertices(pruned_tree, location=[], node_list=[]))}')

full tree parent nodes: 1616
pruned tree parent nodes: 1


In [11]:
print(abalone_tree.feature_name)
print(abalone_tree.keys)

Shell weight
[0.159]


# Computer Hardware
Regression numeric -> numeric

In [24]:
headers = ['Vendor Name', 'Model Name', 'MYCT', 'MMIN', 'MMAX', 'CACH', 'CHMIN', 'CHMAX', 'PRP', 'ERP']
target = 'PRP'
computer = ML_Data('data/computer+hardware/machine.data', headers)
display(computer.data.head())
train1s, train2s, kx2tests = computer.crossvalid_kx2(5)
computer.data.shape

Unnamed: 0,Vendor Name,Model Name,MYCT,MMIN,MMAX,CACH,CHMIN,CHMAX,PRP,ERP
0,adviser,32/60,125,256,6000,256,16,128,198,199
1,amdahl,470v/7,29,8000,32000,32,8,32,269,253
2,amdahl,470v/7a,29,8000,32000,32,8,32,220,253
3,amdahl,470v/7b,29,8000,32000,32,8,32,172,253
4,amdahl,470v/7c,29,8000,16000,32,8,16,132,132


(209, 10)

## Null Model

In [26]:
train1s, train2s, tests = computer.kx2_regress(target, 5)

Baseline StDev: 160.83073308779512
Mean Squared Error: 136.53197709334034


In [28]:
full_mse = []
pruned_mse = []
full_tree_size = []
pruned_tree_size = []

for i in range(5):
    train_features = train1s[i].drop(['Vendor Name', 'Model Name', 'ERP', target], axis = 1)
    train_labels = train1s[i][target]
    test_features = train2s[i].drop(['Vendor Name', 'Model Name', 'ERP', target], axis = 1)
    test_labels = train2s[i][target]
    prune_features = kx2tests[i].drop(['Vendor Name', 'Model Name', 'ERP', target], axis = 1)
    prune_labels = kx2tests[i][target]

    computer_tree = computer.Generate_Tree(train_features[[]], train_features, train_labels, theta = 0, classify=False)
    mse, preds = computer.evaluate_tree(computer_tree, test_features, test_labels, classify=False)
    full_mse.append(mse)
    pruned_tree = computer.iter_prune(computer_tree, prune_features, prune_labels, classify=False)
    mse, preds = computer.evaluate_tree(pruned_tree, test_features, test_labels, classify=False)
    pruned_mse.append(mse)
    full_tree_size.append(len(computer.vertices(computer_tree, location=[], node_list=[])))
    pruned_tree_size.append(len(computer.vertices(pruned_tree, location=[], node_list=[])))

results = pd.DataFrame({'Full tree mse': full_mse, 'Pruned tree mse': pruned_mse, 'Full tree parent nodes': full_tree_size, 'Pruned tree parent nodes': pruned_tree_size})
display(results)
print(results.mean())

Unnamed: 0,Full tree mse,Pruned tree mse,Full tree parent nodes,Pruned tree parent nodes
0,71.222259,70.605922,79,29
1,67.445679,65.138966,78,51
2,145.480955,145.448938,79,40
3,92.653797,88.675701,81,16
4,106.236988,107.715267,81,24


Full tree mse               96.607936
Pruned tree mse             95.516959
Full tree parent nodes      79.600000
Pruned tree parent nodes    32.000000
dtype: float64


In [29]:
print(f'full tree parent nodes: {len(computer.vertices(computer_tree, location=[], node_list=[]))}')
print(f'pruned tree parent nodes: {len(computer.vertices(pruned_tree, location=[], node_list=[]))}')

full tree parent nodes: 81
pruned tree parent nodes: 24


In [30]:
print(computer_tree.feature_name)
print(computer_tree.keys)

MMIN
[12000.0]


# Forest Fires
regression numeric -> numeric

In [3]:
target = 'area'
forestfires = ML_Data('data/forest+fires/forestfires.csv')
months = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
days = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']
forestfires.data = forestfires.replace_categories(dict(zip(months, range(12))), columns = ['month'])
forestfires.data = forestfires.replace_categories(dict(zip(days, range(7))), columns = ['day'])
forestfires.data = forestfires.replace_missing('?', int_val= True, columns = ['X', 'Y', 'month', 'day'])
forestfires.data = forestfires.replace_missing('?', columns = ['FFMC', 'DMC', 'ISI', 'temp', 'RH', 'wind', 'rain', 'area'])
display(forestfires.data.head())
train1s, train2s, kx2tests = forestfires.crossvalid_kx2(5)
forestfires.data.shape

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,2,4,86.2,26.2,94.3,5.1,8.2,51.0,6.7,0.0,0.0
1,7,4,9,1,90.6,35.4,669.1,6.7,18.0,33.0,0.9,0.0,0.0
2,7,4,9,5,90.6,43.7,686.9,6.7,14.6,33.0,1.3,0.0,0.0
3,8,6,2,4,91.7,33.3,77.5,9.0,8.3,97.0,4.0,0.2,0.0
4,8,6,2,6,89.3,51.3,102.2,9.6,11.4,99.0,1.8,0.0,0.0


(517, 13)

## Null Model

In [4]:
train1s, train2s, tests = forestfires.kx2_regress(target, 5)

Baseline StDev: 63.65581846794089
Mean Squared Error: 64.51803112799931


In [5]:
full_mse = []
pruned_mse = []
full_tree_size = []
pruned_tree_size = []

for i in range(5):
    train_features = train1s[i].drop([target], axis = 1)
    train_labels = train1s[i][target]
    test_features = train2s[i].drop([target], axis = 1)
    test_labels = train2s[i][target]
    prune_features = kx2tests[i].drop([target], axis = 1)
    prune_labels = kx2tests[i][target]

    ff_tree = forestfires.Generate_Tree(train_features[[]], train_features, train_labels, theta = 0, classify=False)
    mse, preds = forestfires.evaluate_tree(ff_tree, test_features, test_labels, classify=False)
    full_mse.append(mse)
    pruned_tree = forestfires.iter_prune(ff_tree, prune_features, prune_labels, classify=False)
    mse, preds = forestfires.evaluate_tree(pruned_tree, test_features, test_labels, classify=False)
    pruned_mse.append(mse)
    full_tree_size.append(len(forestfires.vertices(ff_tree, location=[], node_list=[])))
    pruned_tree_size.append(len(forestfires.vertices(pruned_tree, location=[], node_list=[])))

results = pd.DataFrame({'Full tree mse': full_mse, 'Pruned tree mse': pruned_mse, 'Full tree parent nodes': full_tree_size, 'Pruned tree parent nodes': pruned_tree_size})
display(results)
print(results.mean())

Unnamed: 0,Full tree mse,Pruned tree mse,Full tree parent nodes,Pruned tree parent nodes
0,42.283231,29.805774,204,4
1,107.289243,93.094017,201,1
2,65.466333,61.304271,204,1
3,66.458085,61.779694,203,7
4,96.296139,92.894513,206,2


Full tree mse                75.558606
Pruned tree mse              67.775654
Full tree parent nodes      203.600000
Pruned tree parent nodes      3.000000
dtype: float64


In [None]:
print(f'full tree parent nodes: {len(forestfires.vertices(ff_tree, location=[], node_list=[]))}')
print(f'pruned tree parent nodes: {len(forestfires.vertices(pruned_tree, location=[], node_list=[]))}')

full tree parent nodes: 206
pruned tree parent nodes: 1


In [6]:
print(ff_tree.feature_name)
print(ff_tree.keys)

Y
[7.0]
