In [1]:
import sys
sys.path.append('../')
import numpy as np
import pandas as pd
from decision_tree import DecisionTree as DecisionTreeAdaBoost
from DecisionTree.decision_tree import DecisionTree as DecisionTreeOG
from AdaBoost import AdaBoostTree
from BaggedTrees import BaggedTrees, RandomForest

In [2]:
def test_tree_accuracy(decision_tree, test_data):
    preds = test_data.apply(lambda row : decision_tree.predict(row), axis=1)
    diff = preds == test_data['label']
    if (diff == True).all():
        return 0
    else:
        error_count = diff.value_counts()[False]
        return error_count / len(test_data)

In [3]:
def process_data(df, attributes, replace_unknown=False, map_labels=True):
    """Binarise numeric attributes and optionally clean categories and labels.

    The frame is modified in place and is also returned, so both
    ``process_data(df, ...)`` and ``df = process_data(df, ...)`` work.

    Args:
        df: DataFrame holding the columns named in ``attributes`` and, when
            ``map_labels`` is true, a ``'label'`` column.
        attributes: Names of the feature columns to process.
        replace_unknown: If true, every ``'unknown'`` entry in a non-numeric
            attribute is replaced by that column's most frequent value other
            than ``'unknown'``. A column with no other value is left alone.
        map_labels: If true, ``'yes'`` labels become ``1`` and ``'no'`` labels
            become ``-1``; any other label value is left untouched.

    Returns:
        The same ``df`` object, after modification.
    """
    # dtype kinds treated as numeric: signed/unsigned int, float, complex.
    numeric_kinds = 'iufc'

    # If requested, replace all 'unknown' values with the column majority.
    if replace_unknown:
        for attribute in attributes:
            if df[attribute].dtype.kind in numeric_kinds:
                continue
            # value_counts() is ordered by descending frequency, so the first
            # entry that is not 'unknown' is the majority replacement value.
            counts = df[attribute].value_counts()
            candidates = [value for value in counts.index if value != 'unknown']
            if not candidates:
                # Empty column, or 'unknown' is the only value: nothing to do.
                continue
            is_unknown = df[attribute] == 'unknown'
            if is_unknown.any():
                # Single .loc assignment instead of chained indexing, which
                # may write to a temporary copy and silently do nothing.
                df.loc[is_unknown, attribute] = candidates[0]

    # Replace numerical columns with booleans: True where value > column median.
    for attribute in attributes:
        if df[attribute].dtype.kind in numeric_kinds:
            df[attribute] = df[attribute] > df[attribute].median()

    if map_labels:
        is_yes = df['label'] == 'yes'
        is_no = df['label'] == 'no'
        if is_yes.any() or is_no.any():
            # Work on an object-typed copy so integer codes can be stored in a
            # column that was read as text, then write the column back once.
            labels = df['label'].astype(object)
            labels[is_yes] = 1
            labels[is_no] = -1
            df['label'] = labels

    return df

In [4]:
# Feature column names, in the order they are passed to read_csv below; the
# label column is appended after them.
attributes = [
    'age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan',
    'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome',
]

# Load each split with explicit column names and preprocess it immediately.
# NOTE(review): process_data thresholds numeric columns on the median of the
# frame it is given, so the test split is binarised with its own medians
# rather than the training medians - confirm this is intended.
df_train = process_data(
    pd.read_csv('../data/bank/train.csv', names=attributes + ['label']),
    attributes,
    replace_unknown=False,
)
df_test = process_data(
    pd.read_csv('../data/bank/test.csv', names=attributes + ['label']),
    attributes,
    replace_unknown=False,
)

In [9]:
# AdaBoost: train/test error as a function of the value passed to build_model.
T = np.arange(1, 505, 10)  # 1, 11, 21, ..., 501
training_errors, test_errors = [], []

for t in T:
    # A fresh model is built from scratch for every setting of t.
    adaboost = AdaBoostTree(df_train, attributes)
    adaboost.build_model(t)

    # test_tree_accuracy returns an error rate (fraction misclassified).
    training_errors += [test_tree_accuracy(adaboost, df_train)]
    test_errors += [test_tree_accuracy(adaboost, df_test)]


In [5]:
# Bagged Trees: train/test error for each value passed to build_trees.
# Note: this rebinds training_errors / test_errors, replacing whatever the
# AdaBoost cell stored under the same names.
T = [1, 3, 5, 10, 15, 20, 30, 40, 50, 75, 100, 150, 200, 250, 300, 400, 500]
training_errors, test_errors = [], []

for t in T:
    # A fresh ensemble is built from scratch for every setting of t.
    bagged_trees = BaggedTrees(df_train, attributes)
    bagged_trees.build_trees(t)

    # test_tree_accuracy returns an error rate (fraction misclassified).
    training_errors += [test_tree_accuracy(bagged_trees, df_train)]
    test_errors += [test_tree_accuracy(bagged_trees, df_test)]


In [None]:
# Persist the current training_errors list as text, each value followed by
# ", ". The context manager closes the file even if a write raises.
with open("training_errs_bt.txt", "w") as textfile:
    for element in training_errors:
        textfile.write(str(element) + ", ")

In [None]:
# Persist the current test_errors list as text, each value followed by ", ".
# The context manager closes the file even if a write raises.
with open("test_errs_bt.txt", "w") as textfile:
    for element in test_errors:
        textfile.write(str(element) + ", ")

In [None]:
#Bagged Trees bias/variance decomposition
num_trees = 500
num_samples = 1000  # presumably the bootstrap sample size - confirm against BaggedTrees.build_trees
bagged_predictors = []

# Build 100 bagged ensembles, each from its own build_trees call.
for i in range(100):
    bagged_trees = BaggedTrees(df_train, attributes)
    bagged_trees.build_trees(num_trees, num_samples)
    bagged_predictors.append(bagged_trees)

# Collect the `trees` attribute of every ensemble.
# NOTE(review): if `trees` is the full list of learners, each entry here is a
# whole collection rather than one tree; a single-tree comparison would likely
# need just one element per ensemble - confirm against BaggedTrees.
single_trees = [predictor.trees for predictor in bagged_predictors]

# TODO: compute the per-tree predictions. This cell previously ended with a
# bare reference to `single_tree_preds`, a name that is never assigned in the
# visible notebook and therefore raised NameError on a fresh kernel.

In [None]:
# Random Forest: train/test error for every (subset_size, t) combination.
T = [1, 3, 5, 10, 15, 20, 30, 40, 50, 75, 100, 150, 200, 250, 300, 400, 500]
subset_sizes = [2, 4, 6]

# One error curve per subset size, keyed by that size.
training_errors_rf = {size: [] for size in subset_sizes}
test_errors_rf = {size: [] for size in subset_sizes}

for subset_size in subset_sizes:
    for t in T:
        # A fresh forest is built from scratch for every combination.
        # subset_size is presumably the number of attributes sampled per
        # split - confirm against RandomForest.build_trees.
        random_forest = RandomForest(df_train, attributes)
        random_forest.build_trees(t, subset_size)

        # test_tree_accuracy returns an error rate (fraction misclassified).
        training_errors_rf[subset_size] += [test_tree_accuracy(random_forest, df_train)]
        test_errors_rf[subset_size] += [test_tree_accuracy(random_forest, df_test)]


In [None]:
import json

# Serialise the Random Forest error curves as JSON, one file per split.
with open('training_errs_rf.txt', 'w') as convert_file:
    json.dump(training_errors_rf, convert_file)

# NOTE(review): the doubled "_rf_rf" suffix and "errors" vs "errs" differ from
# the training file's name; kept as-is in case other code reads this path.
with open('test_errors_rf_rf.txt', 'w') as convert_file:
    json.dump(test_errors_rf, convert_file)