## Imports and Helper Functions

In [24]:
import os
import sys
import time
import pandas as pd
from pathlib import Path
from git import Repo
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Locate the git working tree that contains the current directory
# (parent directories are searched) and derive the source folder from it
repo_root = Path(Repo(".", search_parent_directories=True).working_tree_dir)
src_folder = repo_root / 'src'

# Put the source folder on the module search path so the project's own
# decision tree implementation can be imported
sys.path.append(src_folder.as_posix())
from decisiontree.DecisionTree import DecisionTree

# Folder the datasets are read from
data_path = repo_root / 'data'


def accuracy(y_pred, y_val):
    """
    Calculate classifier accuracy.

    Labels are compared position by position, so the index of a pandas
    Series is ignored (the first prediction is matched with the first true
    label, and so on).

    Args:
        y_pred: Predicted labels (pandas Series, list or any other iterable).
        y_val: True labels, in the same order and of the same length as
            y_pred (pandas Series, list or any other iterable).

    Returns:
        Prediction accuracy in percentage.

    Raises:
        ValueError: If y_pred is empty, or if y_pred and y_val differ in
            length.
    """

    # Materialise both inputs so that any iterable is accepted and the
    # comparison is purely positional.
    predicted = list(y_pred)
    actual = list(y_val)

    if not predicted:
        raise ValueError('Cannot calculate accuracy of an empty prediction set.')
    if len(predicted) != len(actual):
        # A length mismatch would otherwise be silently truncated and give a
        # misleading score.
        raise ValueError(
            'y_pred and y_val must have the same length, got {} and {}.'.format(
                len(predicted), len(actual)
            )
        )

    correct = sum(1 for guess, truth in zip(predicted, actual) if guess == truth)
    return (correct / len(predicted)) * 100


def compare(X, y, X_test, imp_meas):
    """
    Produce reference predictions with scikit-learn's decision tree classifier.

    A DecisionTreeClassifier is created with the given impurity measure as its
    criterion (all other settings are left at their defaults), fitted on the
    training data and used to predict the test data. The result is meant to be
    compared against the predictions of the implemented decision tree.

    Args:
        X: Feature data for training.
        y: Label data for training.
        X_test: Feature data for testing.
        imp_meas: Impurity measure used in decision tree.

    Returns:
        Pandas Series object containing the scikit-learn predictions for X_test.
    """

    # fit() returns the fitted estimator, so construction and training chain
    reference_tree = DecisionTreeClassifier(criterion=imp_meas).fit(X, y)
    reference_pred = reference_tree.predict(X_test)
    return pd.Series(reference_pred)

## Build decision trees and calculate metrics

In [25]:
# Load the banknote authentication dataset (the file has no header row)
data = pd.read_csv(data_path.joinpath('data_banknote_authentication.csv'), header=None)

# The last column holds the labels, every column before it is a feature
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Keep 30% of the rows aside for testing; the seed is fixed so the split is repeatable
X, X_test, y, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Evaluate every combination of impurity measure and pruning setting
for imp_meas in ('entropy', 'gini'):
    for pr in (True, False):
        # The measured time covers both learning and prediction
        tic = time.perf_counter()
        dt = DecisionTree()
        dt.learn(X, y, impurity_measure=imp_meas, prune=pr)
        pred = pd.Series(dt.predict(X_test))
        toc = time.perf_counter()
        elapsed = toc - tic

        print('\nImpurity measure: {}, Error-reduced pruning: {}'.format(imp_meas, pr))
        print('Time consumption: {:0.4f} seconds'.format(elapsed))
        print('Accuracy: {:.2f}%'.format(accuracy(pred, y_test)))

        # The scikit-learn reference is only reported for the unpruned tree
        if pr:
            continue
        cmp = compare(X, y, X_test, imp_meas)
        print('sklearn accuracy: {:.2f}%'.format(accuracy(cmp, y_test)))


Impurity measure: entropy, Error-reduced pruning: True
Time consumption: 0.3604 seconds
Accuracy: 86.65%

Impurity measure: entropy, Error-reduced pruning: False
Time consumption: 0.4140 seconds
Accuracy: 99.03%
sklearn accuracy: 98.79%

Impurity measure: gini, Error-reduced pruning: True
Time consumption: 0.3719 seconds
Accuracy: 86.65%

Impurity measure: gini, Error-reduced pruning: False
Time consumption: 0.3666 seconds
Accuracy: 99.03%
sklearn accuracy: 98.79%
