In [2]:
import numpy as np
import pandas as pd
from decision_tree import DecisionTree as DecisionTreeAdaBoost
from decision_tree import DecisionTree
from AdaBoost import AdaBoostTree
from BaggedTrees import BaggedTrees, RandomForest

In [3]:
def test_tree_accuracy(decision_tree, test_data):
    preds = test_data.apply(lambda row : decision_tree.predict(row), axis=1)
    diff = preds == test_data['label']
    if (diff == True).all():
        return 0
    else:
        error_count = diff.value_counts()[False]
        return error_count / len(test_data)

In [4]:
def process_data(df, attributes, replace_unknown=False, map_labels=True):
    """Clean and binarize a DataFrame for the decision-tree code.

    ``df`` is modified in place and the same object is returned.

    Steps, in order:
      1. If ``replace_unknown`` is true, every ``'unknown'`` entry in a
         non-numeric attribute column is replaced by the most frequent
         value of that column other than ``'unknown'``. A column that has
         no other value is left unchanged.
      2. Every numeric attribute column is replaced by a boolean column
         that is True where the value is strictly greater than the median
         of that column in ``df``.
      3. If ``map_labels`` is true, ``'yes'`` becomes ``1`` and ``'no'``
         becomes ``-1`` in the ``'label'`` column; any other label value is
         kept as it is.

    Parameters:
        df: DataFrame holding the columns named in ``attributes`` and, when
            ``map_labels`` is true, a ``'label'`` column.
        attributes: iterable of column names to process.
        replace_unknown: whether to run step 1.
        map_labels: whether to run step 3.

    Returns:
        The same ``df`` object, after modification.
    """
    numeric_kinds = 'iufc'  # numpy dtype kinds: signed int, unsigned int, float, complex

    # If specified, replace all 'unknown' values with the column majority.
    if replace_unknown:
        for attribute in attributes:
            if df[attribute].dtype.kind in numeric_kinds:
                continue
            # value_counts() is sorted by descending frequency, so the first
            # index entry that is not 'unknown' is the majority real value.
            counts = df[attribute].value_counts()
            known_values = counts.index[counts.index != 'unknown']
            if len(known_values) == 0:
                # Nothing to substitute (column is empty or only 'unknown').
                continue
            most_common = known_values[0]
            # Single .loc assignment instead of chained indexing, which
            # writes to a temporary under pandas Copy-on-Write.
            df.loc[df[attribute] == 'unknown', attribute] = most_common

    # Replace numerical columns with boolean values based on median threshold.
    # Boolean columns have dtype kind 'b', so an already processed column is
    # skipped if this function is run again on the same frame.
    for attribute in attributes:
        if df[attribute].dtype.kind in numeric_kinds:
            median = df[attribute].median()
            df[attribute] = df[attribute] > median

    if map_labels:
        is_yes = df['label'] == 'yes'
        is_no = df['label'] == 'no'
        if is_yes.any() or is_no.any():
            # Work on an object-dtype copy so integer labels can be stored
            # next to any remaining string labels, then write the column
            # back in one assignment.
            labels = df['label'].astype(object)
            labels[is_yes] = 1
            labels[is_no] = -1
            df['label'] = labels

    return df

In [5]:
# Feature column names, in the order they appear in the bank CSV files.
attributes = [
    'age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
    'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
    'previous', 'poutcome',
]

# The target is read as one extra, final column named 'label'. Because
# explicit names are passed, the first row of each file is read as data.
csv_columns = attributes + ['label']

# NOTE(review): process_data thresholds numeric columns at the median of the
# frame it is given, so the test set is binarized with its own medians rather
# than the training medians - confirm this is intended.
df_train = process_data(
    pd.read_csv('../data/bank/train.csv', names=csv_columns),
    attributes,
    replace_unknown=False,
)
df_test = process_data(
    pd.read_csv('../data/bank/test.csv', names=csv_columns),
    attributes,
    replace_unknown=False,
)

In [6]:
# Bagged Trees bias/variance decomposition.
# Build several BaggedTrees predictors from the training frame; the cells
# below compare their predictions on the test set.
num_trees = 100
num_samples = 500
num_predictors = 3

# NOTE(review): no random seed is set anywhere in view; if BaggedTrees samples
# randomly, the numbers below will differ from run to run - confirm and seed.
bagged_predictors = []
for i in range(num_predictors):
    bagged_trees = BaggedTrees(df_train, attributes, df_train)
    bagged_trees.build_trees(num_trees, num_samples)
    bagged_predictors += [bagged_trees]

In [9]:
# Bias/variance estimate for single trees: take the first tree of every
# bagged predictor and compare their predictions on each test example.
single_trees = [predictor.trees[0] for predictor in bagged_predictors]
single_tree_biases = []
single_tree_variances = []
ctr = 0  # number of individual predict() calls that raised
for idx, row in df_test.iterrows():
    preds = []
    for tree in single_trees:
        try:
            pred = tree.predict(row)
        except Exception:
            # Count the failure and carry on with the other trees. Only
            # Exception is caught so that Ctrl-C still interrupts the loop.
            ctr += 1
            continue
        preds.append(pred)
    if not preds:
        # Every tree failed on this row; the mean of an empty array is NaN
        # and would poison the averages below, so skip the row instead.
        continue
    preds = np.asarray(preds)
    avg_pred = np.mean(preds)
    # Squared difference between the mean prediction and the true label.
    bias = (avg_pred - row['label'])**2
    single_tree_biases.append(bias)
    # NOTE(review): np.var defaults to ddof=0 (population variance); with so
    # few predictors a sample variance (ddof=1) may be intended - confirm.
    var = np.var(preds)
    single_tree_variances.append(var)
if not single_tree_biases:
    raise ValueError('no test row produced a prediction; cannot estimate bias/variance')
single_tree_bias = sum(single_tree_biases) / len(single_tree_biases)
single_tree_var = sum(single_tree_variances) / len(single_tree_variances)
single_tree_squared_err = single_tree_bias + single_tree_var
print(ctr)

0


In [10]:
# Bias/variance estimate for the bagged predictors: compare the predictions
# of all bagged predictors on each test example.
bagged_tree_biases = []
bagged_tree_variances = []
ctr = 0  # number of individual predict() calls that raised
for idx, row in df_test.iterrows():
    preds = []
    for predictor in bagged_predictors:
        try:
            pred = predictor.predict(row)
        except Exception:
            # Count the failure and carry on with the other predictors. Only
            # Exception is caught so that Ctrl-C still interrupts the loop.
            ctr += 1
            continue
        preds.append(pred)
    if not preds:
        # Every predictor failed on this row; the mean of an empty array is
        # NaN and would poison the averages below, so skip the row instead.
        continue
    preds = np.asarray(preds)
    avg_pred = np.mean(preds)
    # Squared difference between the mean prediction and the true label.
    bias = (avg_pred - row['label'])**2
    bagged_tree_biases.append(bias)
    # NOTE(review): np.var defaults to ddof=0 (population variance); with so
    # few predictors a sample variance (ddof=1) may be intended - confirm.
    var = np.var(preds)
    bagged_tree_variances.append(var)
print(ctr)
if not bagged_tree_biases:
    raise ValueError('no test row produced a prediction; cannot estimate bias/variance')
bagged_trees_bias = sum(bagged_tree_biases) / len(bagged_tree_biases)
bagged_trees_var = sum(bagged_tree_variances) / len(bagged_tree_variances)
bagged_trees_squared_err = bagged_trees_bias + bagged_trees_var

0


In [11]:
# Report bias, variance and their sum for the single trees, then for the
# bagged predictors, one line per quantity.
report_lines = (
    ('Single Trees Bias: %f', single_tree_bias),
    ('Single Trees Variance: %f', single_tree_var),
    ('Single Trees Estimated Squared Error: %f', single_tree_squared_err),
    ('Bagged Trees Bias: %f', bagged_trees_bias),
    ('Bagged Trees Variance: %f', bagged_trees_var),
    ('Bagged Trees Estimated Squared Error: %f', bagged_trees_squared_err),
)
for template, value in report_lines:
    print(template % value)

Single Trees Bias: 0.462400
Single Trees Variance: 0.274133
Single Trees Estimated Squared Error: 0.736533
Bagged Trees Bias: 0.430844
Bagged Trees Variance: 0.144356
Bagged Trees Estimated Squared Error: 0.575200


In [None]:
def _without_nans(values):
    """Return ``values`` as a numpy array with every NaN entry removed."""
    as_array = np.asarray(values)
    return as_array[~np.isnan(as_array)]


# Recompute the bagged-tree averages using only the rows whose per-row bias
# and variance are not NaN.
bagged_tree_biases = _without_nans(bagged_tree_biases)
bagged_tree_variances = _without_nans(bagged_tree_variances)

bagged_trees_bias = sum(bagged_tree_biases) / len(bagged_tree_biases)
bagged_trees_var = sum(bagged_tree_variances) / len(bagged_tree_variances)
bagged_trees_squared_err = bagged_trees_bias + bagged_trees_var