In [1]:
import sys
sys.path.append('../')

In [2]:
import numpy as np
import pandas as pd
from decision_tree import DecisionTree as DecisionTreeAdaBoost
from decision_tree import DecisionTree
from AdaBoost import AdaBoostTree
from BaggedTrees import BaggedTrees, RandomForest

In [3]:
def test_tree_accuracy(decision_tree, test_data):
    preds = test_data.apply(lambda row : decision_tree.predict(row), axis=1)
    diff = preds == test_data['label']
    if (diff == True).all():
        return 0
    else:
        error_count = diff.value_counts()[False]
        return error_count / len(test_data)

In [4]:
def test_decision_tree(df_train, df_test, attributes, max_max_depth):
    """Print train and test error for every depth / purity-function pairing.

    For each depth from 1 through ``max_max_depth`` (inclusive) and each of the
    purity settings 'entropy', 'gini' and 'me', a DecisionTree is built from
    ``df_train`` and scored with ``test_tree_accuracy`` on both data sets.
    Two lines are printed per pairing; nothing is returned.
    """
    # Depth is the outer dimension, purity function the inner one.
    settings = [
        (depth, measure)
        for depth in range(1, max_max_depth + 1)
        for measure in ('entropy', 'gini', 'me')
    ]
    for depth, measure in settings:
        builder = DecisionTree(df_train, attributes)
        fitted = builder.build_tree(purity_type=measure, max_depth=depth)
        # Score on both sets before printing anything for this pairing.
        train_err = test_tree_accuracy(fitted, df_train)
        test_err = test_tree_accuracy(fitted, df_test)
        print('Max Depth: %d | Purity Function: %s | Test Set: Training data | Error: %.3f' % (depth, measure, train_err))
        print('Max Depth: %d | Purity Function: %s | Test Set: Testing data | Error: %.3f' % (depth, measure, test_err))

In [5]:
def process_data(df, attributes, replace_unknown=False, map_labels=True):
    """Prepare a raw data frame for the tree learners, modifying it in place.

    Steps, in order:
      1. (optional) In every non-numeric attribute column, replace the string
         'unknown' with that column's most frequent value other than
         'unknown' (if the column holds nothing else, it is left as is).
      2. Binarize every numeric attribute column: True where the value is
         strictly greater than the column median, False otherwise.
      3. (optional) Encode the 'label' column: 'yes' -> 1, 'no' -> -1.

    The median in step 2 is computed from ``df`` itself, so separate calls on
    a training frame and a test frame each use their own threshold.

    Args:
        df: DataFrame holding the ``attributes`` columns and, when
            ``map_labels`` is set, a 'label' column. It is mutated and also
            returned.
        attributes: List of attribute column names to process (iterated more
            than once).
        replace_unknown: If True, perform step 1.
        map_labels: If True, perform step 3.

    Returns:
        The same ``df`` object, after modification.
    """
    # If specified, replace all 'unknown' values with the column majority.
    if replace_unknown:
        for attribute in attributes:
            if df[attribute].dtype.kind in 'iufc':
                continue  # numeric columns are handled by the median split below
            counts = df[attribute].value_counts()  # most frequent value first
            if counts.empty:
                continue  # no non-null values, so nothing to pick a majority from
            most_common = counts.index[0]
            if most_common == 'unknown' and len(counts) > 1:
                # 'unknown' itself is the majority: fall back to the runner-up.
                most_common = counts.index[1]
            is_unknown = df[attribute] == 'unknown'
            if is_unknown.any():
                # A single .loc call writes into df itself; the chained form
                # df[col][mask] = value can end up writing to a temporary copy.
                df.loc[is_unknown, attribute] = most_common

    # Replace numerical columns with boolean values based on median threshold.
    for attribute in attributes:
        column = df[attribute]
        if column.dtype.kind in 'iufc':
            df[attribute] = column > column.median()

    if map_labels:
        labels = df['label']
        is_yes = labels == 'yes'
        is_no = labels == 'no'
        if is_yes.any() or is_no.any():
            # Encode on an object-dtype copy (so integer codes can be stored
            # next to any other raw values), then assign the column back.
            encoded = labels.astype(object)
            encoded[is_yes] = 1
            encoded[is_no] = -1
            df['label'] = encoded

    return df

In [6]:
# Attribute (feature) column names, in the order they are assigned to the CSV columns.
attributes = [
    'age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan',
    'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome',
]

# Read both splits, naming the columns explicitly with the class column 'label' last.
df_train, df_test = (
    pd.read_csv(csv_path, names=attributes + ['label'])
    for csv_path in ('../data/bank/train.csv', '../data/bank/test.csv')
)

# Binarize numeric attributes and encode labels; 'unknown' values are left untouched.
df_train, df_test = (
    process_data(raw_frame, attributes, replace_unknown=False)
    for raw_frame in (df_train, df_test)
)

In [7]:
# Display the processed training frame: numeric attributes are now booleans and labels are 1 / -1.
df_train

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,label
0,True,services,married,secondary,no,False,yes,no,unknown,False,may,False,False,False,False,unknown,-1
1,True,blue-collar,single,secondary,no,False,yes,yes,cellular,False,feb,True,False,False,False,unknown,-1
2,True,technician,married,secondary,no,True,no,yes,cellular,True,aug,True,False,True,True,success,1
3,True,admin.,married,tertiary,no,False,yes,no,cellular,False,jul,True,False,False,False,unknown,-1
4,False,management,single,tertiary,no,True,no,no,cellular,False,apr,False,False,False,False,unknown,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,True,technician,divorced,secondary,no,False,yes,no,cellular,True,may,False,False,True,True,other,-1
4996,True,management,divorced,tertiary,no,False,yes,no,telephone,True,oct,True,False,False,False,unknown,1
4997,False,blue-collar,married,secondary,no,False,yes,no,unknown,False,jun,True,False,False,False,unknown,-1
4998,False,management,single,tertiary,no,True,yes,no,unknown,False,may,False,False,False,False,unknown,-1


In [8]:
# weights = np.random.rand(df_train.shape[0])
# weights = weights / np.sum(weights)
# weights = 1 / df_train.shape[0]
# df_train['weights'] = weights

# atree = DecisionTreeAdaBoost(df_train, attributes).build_tree(purity_type='entropy', max_depth=1)
# atree.root_node.split_attribute

In [9]:
# df_train['preds'] = df_train.apply(lambda row : atree.predict(row), axis=1)
# error = test_tree_accuracy(atree, df_train)
# vote = 0.5 * np.log( (1-error) / error )
# x =  (df_train.label * df_train.preds)
# weights = weights * np.exp(-1 * vote * x.to_numpy(dtype='float64'))
# df_train['weights'] = weights


In [10]:
# Baseline: fit a single depth-1 tree with the entropy criterion and show the
# attribute chosen at its root. DecisionTreeAdaBoost is the same class as
# DecisionTree, imported under a second name.
dtree = DecisionTreeAdaBoost(df_train, attributes).build_tree(purity_type='entropy', max_depth=1)
dtree.root_node.split_attribute

'duration'

In [11]:
# Test-set error of the single depth-1 tree (test_tree_accuracy returns the misclassified fraction, not accuracy).
test_tree_accuracy(dtree, df_test)

0.1248

In [12]:
# Fit the AdaBoost ensemble on the training data.
# NOTE(review): 200 is presumably the number of boosting rounds — confirm against AdaBoostTree.build_model.
adaboost = AdaBoostTree(df_train, attributes)
adaboost.build_model(200)

In [13]:
# Test-set error of the boosted ensemble (misclassified fraction).
# NOTE(review): the recorded value equals the error of the single depth-1 tree, so boosting shows no gain here — worth investigating.
test_tree_accuracy(adaboost, df_test)

0.1248

In [15]:
# Peek at the examples held by the fitted model, including the per-row 'weights' column.
# NOTE(review): the recorded weights range from about 1e-91 to 1e+83, so they do not sum to 1 —
# check whether AdaBoostTree renormalises the weights after each round.
adaboost.examples.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,label,weights
0,True,services,married,secondary,no,False,yes,no,unknown,False,may,False,False,False,False,unknown,-1,1.4536769999999998e-91
1,True,blue-collar,single,secondary,no,False,yes,yes,cellular,False,feb,True,False,False,False,unknown,-1,1.4536769999999998e-91
2,True,technician,married,secondary,no,True,no,yes,cellular,True,aug,True,False,True,True,success,1,3.0160379999999998e+72
3,True,admin.,married,tertiary,no,False,yes,no,cellular,False,jul,True,False,False,False,unknown,-1,1.4536769999999998e-91
4,False,management,single,tertiary,no,True,no,no,cellular,False,apr,False,False,False,False,unknown,1,2.751643e+83


In [16]:
# For each weak learner in the ensemble, print the attribute its stump splits on,
# followed by that learner's own test-set error (misclassified fraction).
for classifier in adaboost.classifiers:
    print(classifier.stump.root_node.split_attribute)
    print(test_tree_accuracy(classifier, df_test))

duration
0.1248
duration
0.1248
duration
0.1248
duration
0.1248
duration
0.1248
duration
0.1248
duration
0.1248
duration
0.1248
duration
0.1248
duration
0.1248
duration
0.1248
duration
0.1248
duration
0.1248
duration
0.1248
duration
0.1248
duration
0.1248
duration
0.1248
duration
0.1248
duration
0.1248
contact
0.1248
contact
0.1248
job
0.1248
campaign
0.1248
contact
0.1248
pdays
0.1248
day
0.1248
education
0.1248
housing
0.1248
month
0.1226
housing
0.1248
day
0.1248
job
0.1248
job
0.1248
loan
0.1248
duration
0.1248
loan
0.1248
day
0.1248
campaign
0.1248
month
0.1226
default
0.1248
loan
0.1248
duration
0.1248
campaign
0.1248
duration
0.1248
contact
0.1248
marital
0.1248
age
0.1248
education
0.1248
housing
0.1248
balance
0.1248
contact
0.1248
marital
0.1248
loan
0.1248
job
0.1248
contact
0.1248
default
0.1248
default
0.1248
education
0.1248
poutcome
0.1166
contact
0.1248
balance
0.1248
poutcome
0.1166
loan
0.1248
education
0.1248
contact
0.1248
contact
0.1248
campaign
0.1248
job
0.1248
l

In [9]:
# Scratch check of DataFrame.sample: draw 1000 rows and list the distinct index labels drawn.
# No random_state is passed, so the sample differs from run to run.
x = df_train.sample(1000)
x.index.unique()

Int64Index([3241, 3600, 3280, 2219,  111, 3614, 4866,  516, 4907, 1218,
            ...
            3220, 4075, 1225, 4712, 2162, 3778, 2366,  711, 4884, 3471],
           dtype='int64', length=1000)

In [8]:
# Fit a bagged-tree ensemble on the training data.
# NOTE(review): 10 is presumably the number of trees — confirm against BaggedTrees.build_trees.
bagged_trees = BaggedTrees(df_train, attributes)
bagged_trees.build_trees(10)

In [9]:
# Test-set error of the bagged ensemble (misclassified fraction).
test_tree_accuracy(bagged_trees, df_test)

0.148

In [8]:
# Fit a random forest on the training data.
# NOTE(review): presumably 10 trees with 4 candidate attributes per split — confirm against RandomForest.build_trees.
random_forest = RandomForest(df_train, attributes)
random_forest.build_trees(10, 4)

In [None]:
# Test-set error of the random forest (misclassified fraction); no output was recorded for this cell.
test_tree_accuracy(random_forest, df_test)