In [1]:
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None
import math
from decision_tree import DecisionTree
from sklearn import tree

In [39]:
def test_tree_accuracy(decision_tree, test_data):
    preds = test_data.apply(lambda row : decision_tree.predict(row), axis=1)
    diff = preds == test_data['label']
    if (diff == True).all():
        return 0
    else:
        error_count = diff.value_counts()[False]
        return error_count / len(test_data)

In [4]:
def process_data(df, attributes, replace_unknown=False):
    """Clean the columns named in ``attributes`` for the decision-tree code.

    Two passes are made over ``attributes``:

    1. If ``replace_unknown`` is true, every ``'unknown'`` entry in a
       non-numeric column is replaced by that column's most frequent value
       other than ``'unknown'``. A column whose only value is ``'unknown'``
       (or that has no values at all) is left untouched.
    2. Every numeric column is replaced by a boolean column that is ``True``
       where the value is strictly greater than the column's median.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.
    attributes : iterable of str
        Column names to process.
    replace_unknown : bool, default False
        Whether to run the ``'unknown'`` replacement pass.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numeric_kinds = 'iufc'  # numpy dtype kinds: int, unsigned, float, complex

    # If specified, replace all 'unknown' values with the column majority.
    if replace_unknown:
        for attribute in attributes:
            if df[attribute].dtype.kind in numeric_kinds:
                continue

            # value_counts() is sorted by descending frequency, so the first
            # label that is not 'unknown' is the majority among known values.
            counts = df[attribute].value_counts()
            known = [value for value in counts.index if value != 'unknown']
            if not known:
                continue

            unknown_mask = df[attribute] == 'unknown'
            if unknown_mask.any():
                # A single .loc assignment writes to df itself; the chained
                # df[col][mask] = ... form may write to a temporary copy.
                df.loc[unknown_mask, attribute] = known[0]

    # Replace numerical columns with boolean values based on a median threshold.
    for attribute in attributes:
        column = df[attribute]
        if column.dtype.kind in numeric_kinds:
            df[attribute] = column > column.median()

    return df

In [40]:
from decision_tree import DecisionTree

# Read the cars description file; only its last line is used below.
with open('../data/cars/data-desc.txt', 'r') as f:
    desc_lines = f.readlines()

# The last line is a comma-separated list of names. Its final entry is dropped
# (presumably the label column, which is re-added as 'label' when loading).
attributes = desc_lines[-1].strip().split(',')[:-1]

# The CSVs are read without a header row, so column names are supplied here.
df_train = pd.read_csv('../data/cars/train.csv', names=[*attributes, 'label'])
df_test = pd.read_csv('../data/cars/test.csv', names=[*attributes, 'label'])

In [45]:
def test_decision_tree(df_train, df_test, attributes, max_max_depth):
    """Print train and test error for every depth limit and purity function.

    For each ``max_depth`` from 1 to ``max_max_depth`` (inclusive) and each of
    the purity functions ``'entropy'``, ``'gini'`` and ``'me'``, a tree is
    built from ``df_train`` and scored with ``test_tree_accuracy`` on both
    frames. Two lines are printed per combination; nothing is returned.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Training and evaluation frames.
    attributes : list of str
        Attribute names handed to ``DecisionTree``.
    max_max_depth : int
        Largest depth limit to try.
    """
    for depth_limit in range(1, max_max_depth + 1):
        for purity_name in ('entropy', 'gini', 'me'):
            # The object returned by build_tree is what gets scored.
            fitted = DecisionTree(df_train, attributes).build_tree(
                purity_type=purity_name,
                max_depth=depth_limit,
            )
            train_err = test_tree_accuracy(fitted, df_train)
            test_err = test_tree_accuracy(fitted, df_test)
            print('Max Depth: %d | Purity Function: %s | Test Set: Training data | Error: %.3f' % (depth_limit, purity_name, train_err))
            print('Max Depth: %d | Purity Function: %s | Test Set: Testing data | Error: %.3f' % (depth_limit, purity_name, test_err))

In [25]:
# Build one tree from df_train with the 'entropy' purity function, limited to depth 6.
decision_tree = DecisionTree(df_train, attributes).build_tree(purity_type='entropy', max_depth=6)
# Bare expression so the notebook displays the object returned by build_tree.
decision_tree

<decision_tree.DecisionTree at 0x1dd306f11f0>

In [26]:
# Score `decision_tree` on df_test; as defined in this notebook, test_tree_accuracy
# returns the fraction of misclassified rows (an error rate, not an accuracy).
test_tree_accuracy(decision_tree, df_test)

0.08791208791208792

In [46]:
# Sweep max_depth 1..6 over the three purity functions and print train/test error for each.
test_decision_tree(df_train, df_test, attributes, 6)

Max Depth: 1 | Purity Function: entropy | Test Set: Training data | Error: 0.302
Max Depth: 1 | Purity Function: entropy | Test Set: Testing data | Error: 0.297
Max Depth: 1 | Purity Function: gini | Test Set: Training data | Error: 0.302
Max Depth: 1 | Purity Function: gini | Test Set: Testing data | Error: 0.297
Max Depth: 1 | Purity Function: me | Test Set: Training data | Error: 0.302
Max Depth: 1 | Purity Function: me | Test Set: Testing data | Error: 0.297
Max Depth: 2 | Purity Function: entropy | Test Set: Training data | Error: 0.222
Max Depth: 2 | Purity Function: entropy | Test Set: Testing data | Error: 0.223
Max Depth: 2 | Purity Function: gini | Test Set: Training data | Error: 0.222
Max Depth: 2 | Purity Function: gini | Test Set: Testing data | Error: 0.223
Max Depth: 2 | Purity Function: me | Test Set: Training data | Error: 0.292
Max Depth: 2 | Purity Function: me | Test Set: Testing data | Error: 0.313
Max Depth: 3 | Purity Function: entropy | Test Set: Training data 

In [53]:
# Integer-encode every column of both frames.
#
# The category list for each column is built from the train and test values
# together, so a given value maps to the same integer code in both frames.
# Encoding each frame on its own can assign different codes to the same value
# whenever the two frames do not contain exactly the same set of values.
cat_columns = df_train.columns

for col in cat_columns:
    shared_dtype = pd.CategoricalDtype(
        pd.concat([df_train[col], df_test[col]]).astype('category').cat.categories
    )
    df_train[col] = df_train[col].astype(shared_dtype).cat.codes
    df_test[col] = df_test[col].astype(shared_dtype).cat.codes

In [55]:
# Split the encoded training frame into features (everything but 'label') and target.
X = df_train.loc[:, df_train.columns != 'label']
y = df_train['label']

# random_state is fixed so the fitted tree is the same on every run;
# scikit-learn's tree builder is otherwise not deterministic.
clf = tree.DecisionTreeClassifier(random_state=0).fit(X, y)

# Predictions are made on the same rows the model was fitted on, so the
# score derived from them is a training-set score.
preds = clf.predict(X)

In [57]:
# Element-wise match between the sklearn predictions and the training labels.
# pd.Series(preds) gets a default 0..n-1 index, so this relies on df_train
# carrying the same index.
diff = pd.Series(preds) == df_train['label']

# Print the accuracy (fraction of matching rows).
if diff.all():
    print(1.0)
else:
    error_count = (~diff).sum()
    print(1 - (error_count / len(preds)))

1.0


In [55]:
# Column names for the bank data set, in file order; the CSVs are read
# without a header row, and the final column is named 'label'.
attributes = [
    'age', 'job', 'marital', 'education',
    'default', 'balance', 'housing', 'loan',
    'contact', 'day', 'month', 'duration',
    'campaign', 'pdays', 'previous', 'poutcome',
]

df_train = pd.read_csv('../data/bank/train.csv', names=[*attributes, 'label'])
df_test = pd.read_csv('../data/bank/test.csv', names=[*attributes, 'label'])

In [56]:
# Clean both frames: replace 'unknown' entries in non-numeric columns and
# turn numeric columns into "greater than median" booleans.
# NOTE(review): process_data derives the majority values and medians from
# whichever frame it is given, so df_test is thresholded with its own medians
# rather than the training ones — confirm this is intended.
df_train = process_data(df_train, attributes, replace_unknown=True)
df_test = process_data(df_test, attributes, replace_unknown=True)

In [52]:
# Sweep max_depth 1..16 over the three purity functions on the bank data
# and print train/test error for each combination.
test_decision_tree(df_train, df_test, attributes, 16)

Max Depth: 1 | Purity Function: entropy | Test Set: Training data | Error: 0.119
Max Depth: 1 | Purity Function: entropy | Test Set: Testing data | Error: 0.125
Max Depth: 1 | Purity Function: gini | Test Set: Training data | Error: 0.109
Max Depth: 1 | Purity Function: gini | Test Set: Testing data | Error: 0.117
Max Depth: 1 | Purity Function: me | Test Set: Training data | Error: 0.109
Max Depth: 1 | Purity Function: me | Test Set: Testing data | Error: 0.117
Max Depth: 2 | Purity Function: entropy | Test Set: Training data | Error: 0.106
Max Depth: 2 | Purity Function: entropy | Test Set: Testing data | Error: 0.111
Max Depth: 2 | Purity Function: gini | Test Set: Training data | Error: 0.104
Max Depth: 2 | Purity Function: gini | Test Set: Testing data | Error: 0.109
Max Depth: 2 | Purity Function: me | Test Set: Training data | Error: 0.104
Max Depth: 2 | Purity Function: me | Test Set: Testing data | Error: 0.109
Max Depth: 3 | Purity Function: entropy | Test Set: Training data 

In [None]:
# NOTE(review): same call as the previous cell and has no recorded execution
# count — likely a leftover duplicate; confirm whether it is still needed.
test_decision_tree(df_train, df_test, attributes, 16)


In [72]:
# Build a tree on the bank training frame with the 'gini' purity function.
# Unlike the earlier cell, build_tree's return value is discarded here, so
# `decision_tree` is the object build_tree was called on.
decision_tree = DecisionTree(df_train, attributes)
# max_depth=10000 is presumably meant as "effectively unlimited" — confirm.
decision_tree.build_tree(purity_type='gini', max_depth=10000)


In [73]:
# Score `decision_tree` on df_test. As defined in this notebook,
# test_tree_accuracy returns the misclassified fraction (an error rate).
# NOTE(review): the recorded output may come from a different version of that
# function (execution counts are out of order) — re-run to confirm.
test_tree_accuracy(decision_tree, df_test)

0.8484

In [75]:
# Integer-encode every column of both frames.
#
# The category list for each column is built from the train and test values
# together, so a given value maps to the same integer code in both frames.
# Encoding each frame on its own can assign different codes to the same value
# whenever the two frames do not contain exactly the same set of values.
cat_columns = df_train.columns

for col in cat_columns:
    shared_dtype = pd.CategoricalDtype(
        pd.concat([df_train[col], df_test[col]]).astype('category').cat.categories
    )
    df_train[col] = df_train[col].astype(shared_dtype).cat.codes
    df_test[col] = df_test[col].astype(shared_dtype).cat.codes

In [76]:
# Split the encoded training frame into features (everything but 'label') and target.
X = df_train.loc[:, df_train.columns != 'label']
y = df_train['label']

# random_state is fixed so the fitted tree is the same on every run;
# scikit-learn's tree builder is otherwise not deterministic.
clf = tree.DecisionTreeClassifier(random_state=0).fit(X, y)

# Predictions are made on the same rows the model was fitted on, so the
# score derived from them is a training-set score.
preds = clf.predict(X)

In [77]:
# Element-wise match between the sklearn predictions and the training labels.
# pd.Series(preds) gets a default 0..n-1 index, so this relies on df_train
# carrying the same index.
diff = pd.Series(preds) == df_train['label']

# Print the accuracy (fraction of matching rows).
if diff.all():
    print(1.0)
else:
    error_count = (~diff).sum()
    print(1 - (error_count / len(preds)))

0.982
