# Decision Tree Classifier

A Decision Tree Classifier learns a function and represents it by a decision tree. Each node in this tree
specifies a test of some attribute of the instance, and each branch descending from that node corresponds
to one of the possible values for this attribute.

The `sklearn` library provides an implementation via the `DecisionTreeClassifier` class. It uses
the tree algorithm named CART, which constructs binary trees using the feature and threshold that yield
the largest information gain at each node.

Sections below show how this implementation was used in this work.


## Implementation

### Import libraries 
Let's first import the libraries, mainly `pandas`, `numpy` and Decision Tree implementation from `sklearn`:

In [4]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

### Load the datasets

Now, load the train and test datasets, taking the targets apart from the features:

In [5]:
# Load the training split from disk
dataset_train = pd.read_csv('datasets/covertype_norm_train.csv')
# Split the frame by position: every column but the last is kept as model
# input, and the last column is kept as the target
data_train, target_train = dataset_train.iloc[:, :-1], dataset_train.iloc[:, -1]
# Preview the first rows of the input columns
data_train.head()

Unnamed: 0,elevation,aspect,slope,horiz_dist_hydro,vert_dist_hydro,horiz_dist_road,hillshade_9,hill_shade_noon,hill_shade_15,horiz_dist_fire,...,soil_type_30,soil_type_31,soil_type_32,soil_type_33,soil_type_34,soil_type_35,soil_type_36,soil_type_37,soil_type_38,soil_type_39
0,-0.573753,-0.518424,-0.428658,0.436024,-0.475092,-0.979056,0.927864,0.14452,-0.534162,-0.220768,...,-0.14199,-0.214265,-0.202489,-0.039088,-0.081433,-0.016657,-0.044107,-0.220216,-0.219696,-0.172986
1,1.656009,-0.010549,0.868502,-0.516497,-0.280544,1.81761,0.862413,0.665801,-0.534162,2.273548,...,-0.14199,-0.214265,4.938531,-0.039088,-0.081433,-0.016657,-0.044107,-0.220216,-0.219696,-0.172986
2,0.169501,-0.799569,0.632655,0.45517,1.89191,-0.388051,0.796962,-1.245563,-1.335438,-0.687429,...,-0.14199,-0.214265,-0.202489,-0.039088,-0.081433,-0.016657,-0.044107,-0.220216,-0.219696,-0.172986
3,-1.205043,1.268208,1.576043,0.23499,1.648725,-0.649457,-2.933743,-0.15956,1.956291,-0.501856,...,-0.14199,-0.214265,-0.202489,-0.039088,-0.081433,-0.016657,-0.044107,-0.220216,-0.219696,-0.172986
4,-1.057345,0.152697,0.986425,0.134472,0.530073,-1.041945,0.404256,1.056762,-0.014415,-0.79477,...,-0.14199,-0.214265,-0.202489,-0.039088,-0.081433,-0.016657,-0.044107,-0.220216,-0.219696,-0.172986


In [6]:
# Load the test split from disk
dataset_test = pd.read_csv('datasets/covertype_norm_test.csv')
# Split the frame by position: every column but the last is kept as model
# input, and the last column is kept as the target
data_test, target_test = dataset_test.iloc[:, :-1], dataset_test.iloc[:, -1]
# Preview the first rows of the input columns
data_test.head()

Unnamed: 0,elevation,aspect,slope,horiz_dist_hydro,vert_dist_hydro,horiz_dist_road,hillshade_9,hill_shade_noon,hill_shade_15,horiz_dist_fire,...,soil_type_30,soil_type_31,soil_type_32,soil_type_33,soil_type_34,soil_type_35,soil_type_36,soil_type_37,soil_type_38,soil_type_39
0,-1.350358,1.730737,-0.782429,-0.889847,-0.783127,-0.407751,-0.381155,0.2314,0.5703,-0.35176,...,-0.14199,-0.214265,-0.202489,-0.039088,-0.081433,-0.016657,-0.044107,-0.220216,-0.219696,-0.172986
1,-0.857238,0.78754,1.104349,-0.253237,1.081293,-0.297127,-1.7229,1.360843,1.848011,-0.840253,...,-0.14199,-0.214265,-0.202489,-0.039088,-0.081433,-0.016657,-0.044107,-0.220216,-0.219696,-0.172986
2,0.422017,1.794222,0.632655,0.459957,0.3031,1.02733,-1.101116,-1.028362,0.332083,0.262267,...,-0.14199,-0.214265,-0.202489,-0.039088,-0.081433,-0.016657,-0.044107,-0.220216,-0.219696,-0.172986
3,-1.698163,1.277277,3.10905,-0.27717,1.373115,-1.150296,-4.733644,-1.549644,2.151197,-1.108606,...,-0.14199,-0.214265,-0.202489,-0.039088,-0.081433,-0.016657,-0.044107,-0.220216,-0.219696,-0.172986
4,-0.630926,1.81236,-0.782429,1.063061,0.416587,-0.907074,-0.282979,0.10108,0.440364,-0.429082,...,-0.14199,-0.214265,-0.202489,-0.039088,-0.081433,-0.016657,-0.044107,-0.220216,-0.219696,-0.172986


Now, load the dataset with the results from the GA, PSO and Correlation filter executions:

In [7]:
# Read the attributes selected by GA
ga  = pd.read_csv('results/ga_selected_attributes.csv')
# Read the attributes selected by PSO
pso  = pd.read_csv('results/pso_selected_attributes.csv')
# Read the attributes selected by the correlation filter
filtered  = pd.read_csv('results/filter_selected_attrs.csv')

PCA and LDA results will be loaded right before the model training.
### Decision Tree definition

Below, two functions are defined to perform the classification using a decision tree. The first one is used for the original dataset and for the datasets produced by the selector algorithms. The second is used for the remaining methods, the extractor algorithms *PCA* and *LDA*.

Two different functions were created because the selectors work with the original attributes, whereas the extractors produce new attributes with different dimensions. Therefore, a different approach was necessary.

In [8]:
# Decision Tree for selector algorithms (PSO, GA and FILTERED)
def perform_decision_tree_selector(attr):
    '''
    Train and score one decision tree per row of a selection table.

    Each row of ``attr`` is read as some leading descriptive columns followed
    by one mask entry per column of the global ``data_train``. For every row,
    the masked columns of ``data_train`` / ``data_test`` are used to fit a
    ``DecisionTreeClassifier(random_state=0)`` on the training split and to
    score it on the test split.

    Parameters
    ----------
    attr : pandas.DataFrame
        Selection table whose trailing columns hold the per-feature mask.

    Returns
    -------
    pandas.DataFrame
        The descriptive columns of ``attr`` plus an ``accuracy`` column, with
        one row per row of ``attr`` and a 0..n-1 index.

    Raises
    ------
    ValueError
        If ``attr`` has fewer columns than ``data_train``.
    '''
    # Width of the mask: one entry per feature column. This replaces the
    # literal 54 that was previously repeated throughout the function.
    n_features = data_train.shape[1]
    n_meta = attr.shape[1] - n_features
    if n_meta < 0:
        raise ValueError(
            'attr has %d columns but a mask of %d entries is required'
            % (attr.shape[1], n_features)
        )

    # Definition of columns used in dataframe result
    columns = attr.columns.tolist()[:n_meta] + ['accuracy']

    # Split the table once into descriptive values and masks
    meta_rows = attr.iloc[:, :n_meta].values.tolist()
    mask_rows = attr.iloc[:, n_meta:].values.tolist()

    # Collect plain rows and build the frame once at the end; growing a
    # DataFrame with .loc inside the loop re-allocates it on every iteration.
    rows = []
    for meta, mask in zip(meta_rows, mask_rows):
        # Restrict both splits to the selected attributes
        sliced_train = data_train.iloc[:, mask]
        sliced_test = data_test.iloc[:, mask]

        # Perform decision tree
        d_tree = DecisionTreeClassifier(random_state=0)
        d_tree.fit(sliced_train, target_train)
        rows.append(meta + [d_tree.score(sliced_test, target_test)])

    return pd.DataFrame(rows, columns=columns)

# Decision tree for extractor algorithms (PCA, LDA)
def perform_decision_tree_extractor(train, test):
    '''
    Fit a decision tree on ``train`` and return its score on ``test``.

    Both frames must contain a ``cover_type`` column, which is used as the
    target; every other column is used as a feature.
    '''
    def split(frame):
        # Target column first, then every remaining column as features
        target = frame['cover_type']
        features = frame.loc[:, frame.columns != 'cover_type']
        return features, target

    fit_x, fit_y = split(train)
    eval_x, eval_y = split(test)

    model = DecisionTreeClassifier(random_state=0)
    model.fit(fit_x, fit_y)
    return model.score(eval_x, eval_y)

### Decision tree execution

The decision tree is executed for **all** solutions obtained with all parameter variations
of the algorithms used for dimensionality reduction. In practice, one solution was selected
for each of them, but all results are useful for further discussion.

Cells below show how such executions were performed:

In [8]:
# Perform decision tree for PSO: one tree is trained and scored per row of `pso`
# NOTE(review): a later cell reads 'results/decision_tree_pso.csv', but no
# visible cell writes this result to disk - confirm where that file comes from
dtree_pso_result = perform_decision_tree_selector(pso)

In [9]:
# Describe the unreduced dataset as a one-row selection table: the attribute
# count followed by an all-True mask, so the selector routine keeps every column
full_dataset = pd.DataFrame(columns=['n_attr'] + list(data_train.columns))
full_dataset.loc[0] = [54] + [True] * 54

# Perform decision tree for ORIGINAL DATA
dtree_original_result = perform_decision_tree_selector(full_dataset)

In [13]:
# Perform decision tree for GA: one tree is trained and scored per row of `ga`
# NOTE(review): a later cell reads 'results/decision_tree_ga.csv', but no
# visible cell writes this result to disk - confirm where that file comes from
dtree_ga_result = perform_decision_tree_selector(ga)


In [9]:
# Perform decision tree for filtered: one tree is trained and scored per row
# of `filtered`
# NOTE(review): a later cell reads 'results/decision_tree_filtered.csv', but no
# visible cell writes this result to disk - confirm where that file comes from
dtree_filtered_result = perform_decision_tree_selector(filtered)

In [10]:
# Perform decision tree for PCA: one train/test pair of files per
# representativity level, one result row per level
representativity = [75, 85, 95]
dtree_pca_result = pd.DataFrame(columns=['representativity (%)', 'accuracy'])
for index, r in enumerate(representativity):
    train = pd.read_csv('results/pca_' + str(r) + '.csv')
    test = pd.read_csv('results/pca_test_' + str(r) + '.csv')
    score = perform_decision_tree_extractor(train, test)
    dtree_pca_result.loc[index] = [r, score]

# Store results of PCA
dtree_pca_result.to_csv('results/decision_tree_pca.csv')

In [11]:
# Perform decision tree for LDA: one train/test pair of files per number of
# components (1 to 6), one result row per setting
n_components = range(1, 7)
dtree_lda_result = pd.DataFrame(columns=['n_components', 'accuracy'])
for index, n in enumerate(n_components):
    train = pd.read_csv('results/lda_' + str(n) + '.csv')
    test = pd.read_csv('results/lda_test_' + str(n) + '.csv')
    score = perform_decision_tree_extractor(train, test)
    dtree_lda_result.loc[index] = [int(n), score]

# Store results of LDA
dtree_lda_result.to_csv('results/decision_tree_lda.csv')

## Presenting some results

Below are the results obtained by executing the decision tree over the datasets.

In [2]:
# Read back the per-method accuracy tables stored under results/
# NOTE(review): only the PCA and LDA tables are written by cells visible in
# this notebook; the PSO, GA and filtered files must already exist on disk
dtree_pso_result = pd.read_csv('results/decision_tree_pso.csv')
dtree_ga_result = pd.read_csv('results/decision_tree_ga.csv')
dtree_filtered_result = pd.read_csv('results/decision_tree_filtered.csv')
dtree_pca_result = pd.read_csv('results/decision_tree_pca.csv')
dtree_lda_result = pd.read_csv('results/decision_tree_lda.csv')

In [13]:
# Report the best accuracy on the original data, and the accuracy of one fixed
# row of each reduction method's result table
# NOTE(review): GA row 321 is printed here, while the storage cell saves GA
# row 2429 as 'ga_selected' - confirm which row is the selected solution
print("original: ", dtree_original_result['accuracy'].max())
print("pso: ", dtree_pso_result.loc[216,'accuracy'])
print("ga: ", dtree_ga_result.loc[321,'accuracy'])
print("filtered: ", dtree_filtered_result.loc[2,'accuracy'])
print("pca: ", dtree_pca_result.loc[2,'accuracy'])
print("lda: ", dtree_lda_result.loc[5,'accuracy'])

original:  0.812603993344426
pso:  0.3342346089850249
ga:  0.1855241264559068
filtered:  0.783485856905158
pca:  0.7622712146422629
lda:  0.769134775374376


## Storing Trees

Below, two functions are defined to save the fitted tree models so that they can be processed later.

In [14]:
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed
# in 0.23; prefer the standalone package and fall back for old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

def save_selector_tree(attr, name, index):
    '''
    Fit a decision tree on one selected attribute subset and dump it to disk.

    Parameters
    ----------
    attr : pandas.DataFrame
        Selection table; the last 54 columns of each row are used as the
        column mask applied to the global ``data_train``.
    name : str
        Suffix of the output file, written to ``models/dtree_<name>.save``.
    index : int
        Positional row of ``attr`` whose mask is used.
    '''
    # Mask of the attributes kept by the chosen solution
    vector = attr.iloc[index, -54:].tolist()
    sliced_train = data_train.iloc[:, vector]

    # Fit decision tree (only the training split is needed to store the model)
    d_tree = DecisionTreeClassifier(random_state=0)
    d_tree.fit(sliced_train, target_train)

    # Dump model
    joblib.dump(d_tree, 'models/dtree_' + name + '.save')

def save_extractor_tree(train, test, name):
    '''
    Fit a decision tree on an extracted-feature training set and dump it.

    ``train`` must contain a ``cover_type`` column, used as the target; every
    other column is used as a feature. ``test`` is accepted but not used.
    The fitted model is written to ``models/dtree_<name>.save``.
    '''
    labels = train['cover_type']
    features = train.loc[:, train.columns != 'cover_type']

    # Fit on the training split only
    model = DecisionTreeClassifier(random_state=0)
    model.fit(features, labels)

    # Persist the fitted model
    target_path = 'models/dtree_' + name + '.save'
    joblib.dump(model, target_path)

### Execution of storage

As before, two functions were created to distinguish between selector and extractor algorithms when performing the storage.

In [23]:
# Store, for each selector, the tree trained on its highest-accuracy row
best_of = (
    (full_dataset, 'original', dtree_original_result),
    (pso, 'pso', dtree_pso_result),
    (ga, 'ga', dtree_ga_result),
    (filtered, 'filtered', dtree_filtered_result),
)
for selection, label, scores in best_of:
    save_selector_tree(selection, label, scores['accuracy'].idxmax())

# Store trees of specific selector rows
# NOTE(review): GA row 2429 is stored here, while the results cell prints the
# accuracy of GA row 321 - confirm which row is the selected solution
save_selector_tree(pso, 'pso_selected', 216)
save_selector_tree(ga, 'ga_selected', 2429)

In [24]:
# Store trees of extractor algorithms, using the PCA files for 95 and the LDA
# files for 6
# NOTE: the test frames are loaded and passed along, but save_extractor_tree
# only fits on the training frame and does not read its `test` argument
pca_train = pd.read_csv('results/pca_95.csv')
pca_test  = pd.read_csv('results/pca_test_95.csv')
save_extractor_tree(pca_train, pca_test, 'pca')

lda_train = pd.read_csv('results/lda_6.csv')
lda_test  = pd.read_csv('results/lda_test_6.csv')
save_extractor_tree(lda_train, lda_test, 'lda')

## Information about the trees

Information about the structure of the trees is shown below.

In [35]:
# Load the models previously dumped under models/ by the storage cells
# NOTE: joblib.load unpickles the file contents, so only load model files
# that come from a trusted source
original_model = joblib.load('models/dtree_original.save')
ga_model = joblib.load('models/dtree_ga.save')
ga_selected_model = joblib.load('models/dtree_ga_selected.save')
pso_model = joblib.load('models/dtree_pso.save')
pso_selected_model = joblib.load('models/dtree_pso_selected.save')
filtered_model = joblib.load('models/dtree_filtered.save')
pca_model = joblib.load('models/dtree_pca.save')
lda_model = joblib.load('models/dtree_lda.save')

In [28]:
def tree_info(model):
    '''
    Obtain structural information about a fitted tree.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``tree_`` with ``node_count``,
        ``children_left`` and ``children_right``.

    Returns
    -------
    list
        ``[nodes, height, leaves]``, where ``height`` is the number of nodes
        on the longest root-to-leaf path (a tree made of a single leaf has
        height 1).
    '''
    tree = model.tree_
    children_left = tree.children_left
    children_right = tree.children_right

    height = 0
    leaves = 0

    # Walk the tree with an explicit stack instead of recursion, so very deep
    # trees cannot exceed the interpreter's recursion limit.
    root_node_id = 0
    pending = [(root_node_id, 1)]
    while pending:
        node_id, depth = pending.pop()
        left = children_left[node_id]
        right = children_right[node_id]
        if left != right:
            # Internal node: both children sit one level deeper
            pending.append((left, depth + 1))
            pending.append((right, depth + 1))
        else:
            # Leaf: a node whose two child ids are equal
            leaves += 1
            if depth > height:
                height = depth

    return [tree.node_count, height, leaves]
        

In [30]:
# Print the [node count, height, leaf count] list returned by tree_info for
# each loaded model
print('original_model: (nº nodes, height, leaves) ', tree_info(original_model))
print('ga_model: (nº nodes, height, leaves) ', tree_info(ga_model))
print('ga_selected_model: (nº nodes, height, leaves) ', tree_info(ga_selected_model))
print('pso_model: (nº nodes, height, leaves) ', tree_info(pso_model))
print('pso_selected_model: (nº nodes, height, leaves) ', tree_info(pso_selected_model))
print('filtered_model: (nº nodes, height, leaves) ', tree_info(filtered_model))
print('pca_model: (nº nodes, height, leaves) ', tree_info(pca_model))
print('lda_model: (nº nodes, height, leaves)', tree_info(lda_model))

original_model:  [3855, 38, 1928]
ga_model:  [5809, 28, 2905]
ga_selected_model:  [3, 2, 2]
pso_model:  [4477, 29, 2239]
pso_selected_model:  [23, 8, 12]
filtered_model:  [4895, 37, 2448]
pca_model:  [3979, 31, 1990]
lda_model:  [4593, 27, 2297]
