In [None]:
import pandas as pd
import time
import dataset
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from matplotlib import pyplot as plt
from interpretableai import iai

In [None]:
# Datasets
# datasets = ['balance-scale', 'breast-cancer', 'car-evaluation', 'hayes-roth', 'house-votes-84',
#             'soybean-small', 'spect', 'tic-tac-toe', 'monks-1', 'monks-2', 'monks-3']

datasets = ['nath-jones', 'balance-scale', 'car-evaluation']

In [None]:
# Train and test variables
# depth = [2, 3, 4, 5]
depth = [4]
train_ratio = 0.5
val_ratio = 0.25
test_ratio = 0.25
seeds = [42]

oct_criterion = 'misclassification'
plot = False

In [49]:
res_cart = pd.DataFrame(columns=['instance', 'max-depth', 'seed', 'train_acc', 'val_acc', 'test_acc', 'train_time',
                                'num_nodes', 'depth'])

res_oct = pd.DataFrame(columns=['instance', 'max-depth', 'seed', 'train_acc', 'val_acc', 'test_acc', 'train_time',
                                'num_nodes', 'depth'])
for data in datasets:
    x, y = dataset.loadData(data)
    for d in depth:
        for s in seeds:
            print('\n\nCART depth', d, 'on', data)
            x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=1-train_ratio, random_state=s)
            x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, 
                                                                test_size=test_ratio/(test_ratio+val_ratio), random_state=s)
            clf = tree.DecisionTreeClassifier(max_depth=d)
            
            start = time.time()
            clf = clf.fit(x_train, y_train)
            end = time.time()
            train_acc_cart = accuracy_score(y_train, clf.predict(x_train))
            val_acc_cart = accuracy_score(y_val, clf.predict(x_val))
            test_acc_cart = accuracy_score(y_test, clf.predict(x_test))
            n_nodes = clf.tree_.node_count
            print(data, 'cart-d{}'.format(d), 'train acc:', train_acc_cart, 'val acc:', val_acc_cart, 'test acc:', test_acc_cart, 'num nodes:', n_nodes)
            
            row_cart = {'instance': data, 'max-depth': d, 'seed': s, 'train_acc': train_acc_cart, 'val_acc': val_acc_cart,
                   'test_acc': test_acc_cart, 'train_time': end-start, 'num_nodes': n_nodes}
            res_cart = pd.concat([res_cart, pd.DataFrame([row_cart])], ignore_index=True)

            print('\n\nOCT depth', d, 'on', data)
            
            (train_X, train_y), (test_X, test_y) = iai.split_data('classification', x, y,
                                                                  train_proportion=train_ratio,
                                                                  seed=s)
            (val_x, val_y), (test_X, test_y) = iai.split_data('classification', test_X, test_y,
                                                              train_proportion=1-test_ratio / (test_ratio + val_ratio),
                                                              seed=s)
            
            grid = iai.GridSearch(
                iai.OptimalTreeClassifier(
                    criterion=oct_criterion,
                    random_seed=1,
                ),
                max_depth=range(1, d+1),
            )
            start = time.time()
            grid.fit(train_X, train_y)
            end = time.time()
            best_model = grid.get_learner()
            train_acc = best_model.score(train_X, train_y)
            val_acc = best_model.score(val_x, val_y)
            test_acc = best_model.score(test_X, test_y)
            num_nodes = best_model.get_num_nodes()
            depth_oct = best_model.get_depth(num_nodes)
            print(data, 'oct-d{}'.format(d), 'train acc:', train_acc, 'val acc:', val_acc, 'test acc:', test_acc, 'num nodes:', num_nodes, 'depth:', depth_oct)
            
            row_oct = {'instance': data, 'max-depth': d, 'seed': s, 'train_acc': train_acc, 'val_acc': val_acc,
                   'test_acc': test_acc, 'train_time': end-start, 'num_nodes': num_nodes, 'depth': depth_oct}
            res_oct = pd.concat([res_oct, pd.DataFrame([row_oct])], ignore_index=True)
            
            if plot:
                # plot the decision tree
                plt.figure(figsize=(10, 8))
                tree.plot_tree(clf, fontsize=8)
                plt.show()
                
                plot = grid.get_learner().TreePlot()
                plot.show_in_browser()

res_cart.to_csv('./res/cart.csv', index=False)
res_oct.to_csv('./res/oct.csv', index=False)



CART depth 4 on nath-jones
nath-jones cart-d4 train acc: 1.0 val acc: 0.7272727272727273 test acc: 0.6666666666666666 num nodes: 9


OCT depth 4 on nath-jones


  res_cart = pd.concat([res_cart, pd.DataFrame([row_cart])], ignore_index=True)


nath-jones oct-d4 train acc: 0.9545454545454546 val acc: 0.8333333333333334 test acc: 0.75 num nodes: 3 depth: 1


CART depth 4 on balance-scale
balance-scale cart-d4 train acc: 0.8301282051282052 val acc: 0.782051282051282 test acc: 0.7452229299363057 num nodes: 31


OCT depth 4 on balance-scale


  res_oct = pd.concat([res_oct, pd.DataFrame([row_oct])], ignore_index=True)


balance-scale oct-d4 train acc: 0.858974358974359 val acc: 0.7948717948717949 test acc: 0.7324840764331211 num nodes: 17 depth: 4


CART depth 4 on car-evaluation
car-evaluation cart-d4 train acc: 0.8599537037037037 val acc: 0.8819444444444444 test acc: 0.8472222222222222 num nodes: 11


OCT depth 4 on car-evaluation
car-evaluation oct-d4 train acc: 0.8713789107763615 val acc: 0.8657407407407407 test acc: 0.8406466512702079 num nodes: 11 depth: 4
