In [1]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate

In [13]:
# Train datasets
original_train = pd.read_csv('../datasets/covertype_norm_train.csv')
lda_train = pd.read_csv('../datasets/covertype_lda_train.csv')

# Targets
target_original_train = original_train.iloc[:,-1]
target_lda_train = lda_train.iloc[:,-1]

# Dataset without classes
data_original_train = original_train.iloc[:,:-1]
data_lda_train = lda_train.iloc[:,:-1]

In [14]:
# Test datasets
original_test = pd.read_csv('../datasets/covertype_norm_test.csv')
lda_test = pd.read_csv('../datasets/covertype_lda_test.csv')

# Targets
target_original_test = original_test.iloc[:,-1]
target_lda_test = lda_test.iloc[:,-1]

# Dataset without classes
data_original_test = original_test.iloc[:,:-1]
data_lda_test = lda_test.iloc[:,:-1]

In [106]:
def perform_decision_tree(train, test):
    '''
    Performs decision tree for a given dataset.
    '''
    # Split the datasets in data and target
    train_target = train['cover_type']
    train_data   = train.loc[:, train.columns != 'cover_type']
    test_target  = test['cover_type']
    test_data    = test.loc[:, test.columns != 'cover_type']
    
    d_tree = DecisionTreeClassifier(random_state=0)    
    result = cross_validate(d_tree, train_data, train_target, cv=10, return_estimator=True)
    
    acc_best = 0
    best_estimator = result['estimator'][0]

    for estimator in result['estimator']:
        score = estimator.score(test_data, test_target)
        if score > acc_best:
            acc_best = score
            best_estimator = estimator
            
    return [acc_best, best_estimator]

def check_tree_score(model, test):
    test_target  = test['cover_type']
    test_data    = test.loc[:, test.columns != 'cover_type']
    return model.score(test_data, test_target)

In [107]:
result_original = perform_decision_tree(original_train, original_test)
result_lda = perform_decision_tree(lda_train, lda_test)

In [108]:
print("Original: ", result_original[0])
print("LDA: ", result_lda[0])

Original:  0.8136439267886856
LDA:  0.7680948419301165


## Post-prune

In [109]:
from sklearn.tree._tree import TREE_LEAF

def prune_index(inner_tree, index, threshold):
    if inner_tree.value[index].min() < threshold:
        # turn node into a leaf by "unlinking" its children
        inner_tree.children_left[index] = TREE_LEAF
        inner_tree.children_right[index] = TREE_LEAF
    # if there are children, visit them as well
    if inner_tree.children_left[index] != TREE_LEAF:
        prune_index(inner_tree, inner_tree.children_left[index], threshold)
        prune_index(inner_tree, inner_tree.children_right[index], threshold)

In [110]:
print(sum(result_original[1].tree_.children_left < 0))
prune_index(result_original[1].tree_, 0, 5)
print(sum(result_original[1].tree_.children_left < 0))

1770
1772


In [111]:
print(sum(result_lda[1].tree_.children_left < 0))
prune_index(result_lda[1].tree_, 0, 5)
print(sum(result_lda[1].tree_.children_left < 0))

2116
2118


In [112]:
def tree_info(model):
    '''
    Obtain informations about the tree.
    '''
    tree = model.tree_
    nodes = tree.node_count    
    children_left = tree.children_left
    children_right = tree.children_right
    leaves = 0
    
    '''
    Perform the walk over tree.
    '''
    def walk(id):
        nonlocal leaves
        if (children_left[id] != children_right[id]):
            left_max = 1 + walk(children_left[id])
            right_max = 1 + walk(children_right[id])
            return max(left_max, right_max)
        else: # leaf
            leaves += 1
            return 1

    root_node_id = 0
    return [nodes, walk(root_node_id), leaves]
        

In [113]:
print(tree_info(result_original[1]))
print(tree_info(result_lda[1]))
print(check_tree_score(result_original[1], original_test))
print(check_tree_score(result_lda[1], lda_test))

[3539, 2, 2]
[4231, 2, 2]
0.28577371048252914
0.28577371048252914
