# Decision Tree

## Implementation

### Import libraries 
Let's first import the libraries, mainly `pandas`, `numpy` and the Decision Tree implementation from `sklearn`:

In [1]:
# Import libraries
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

### Load results datasets

Now, load the original dataset, separating the targets from the features:

In [2]:
# Train dataset: the last column is used as the target, every other column as a feature
dataset_train = pd.read_csv('datasets/covertype_norm_train.csv')
data_train, target_train = dataset_train.iloc[:, :-1], dataset_train.iloc[:, -1]
# Preview the feature matrix (targets excluded)
data_train.head()

Unnamed: 0,elevation,aspect,slope,horiz_dist_hydro,vert_dist_hydro,horiz_dist_road,hillshade_9,hill_shade_noon,hill_shade_15,horiz_dist_fire,...,soil_type_30,soil_type_31,soil_type_32,soil_type_33,soil_type_34,soil_type_35,soil_type_36,soil_type_37,soil_type_38,soil_type_39
0,-0.573753,-0.518424,-0.428658,0.436024,-0.475092,-0.979056,0.927864,0.14452,-0.534162,-0.220768,...,-0.14199,-0.214265,-0.202489,-0.039088,-0.081433,-0.016657,-0.044107,-0.220216,-0.219696,-0.172986
1,1.656009,-0.010549,0.868502,-0.516497,-0.280544,1.81761,0.862413,0.665801,-0.534162,2.273548,...,-0.14199,-0.214265,4.938531,-0.039088,-0.081433,-0.016657,-0.044107,-0.220216,-0.219696,-0.172986
2,0.169501,-0.799569,0.632655,0.45517,1.89191,-0.388051,0.796962,-1.245563,-1.335438,-0.687429,...,-0.14199,-0.214265,-0.202489,-0.039088,-0.081433,-0.016657,-0.044107,-0.220216,-0.219696,-0.172986
3,-1.205043,1.268208,1.576043,0.23499,1.648725,-0.649457,-2.933743,-0.15956,1.956291,-0.501856,...,-0.14199,-0.214265,-0.202489,-0.039088,-0.081433,-0.016657,-0.044107,-0.220216,-0.219696,-0.172986
4,-1.057345,0.152697,0.986425,0.134472,0.530073,-1.041945,0.404256,1.056762,-0.014415,-0.79477,...,-0.14199,-0.214265,-0.202489,-0.039088,-0.081433,-0.016657,-0.044107,-0.220216,-0.219696,-0.172986


In [3]:
# Test dataset: the last column is used as the target, every other column as a feature
dataset_test = pd.read_csv('datasets/covertype_norm_test.csv')
data_test, target_test = dataset_test.iloc[:, :-1], dataset_test.iloc[:, -1]
# Preview the feature matrix (targets excluded)
data_test.head()

Unnamed: 0,elevation,aspect,slope,horiz_dist_hydro,vert_dist_hydro,horiz_dist_road,hillshade_9,hill_shade_noon,hill_shade_15,horiz_dist_fire,...,soil_type_30,soil_type_31,soil_type_32,soil_type_33,soil_type_34,soil_type_35,soil_type_36,soil_type_37,soil_type_38,soil_type_39
0,-1.350358,1.730737,-0.782429,-0.889847,-0.783127,-0.407751,-0.381155,0.2314,0.5703,-0.35176,...,-0.14199,-0.214265,-0.202489,-0.039088,-0.081433,-0.016657,-0.044107,-0.220216,-0.219696,-0.172986
1,-0.857238,0.78754,1.104349,-0.253237,1.081293,-0.297127,-1.7229,1.360843,1.848011,-0.840253,...,-0.14199,-0.214265,-0.202489,-0.039088,-0.081433,-0.016657,-0.044107,-0.220216,-0.219696,-0.172986
2,0.422017,1.794222,0.632655,0.459957,0.3031,1.02733,-1.101116,-1.028362,0.332083,0.262267,...,-0.14199,-0.214265,-0.202489,-0.039088,-0.081433,-0.016657,-0.044107,-0.220216,-0.219696,-0.172986
3,-1.698163,1.277277,3.10905,-0.27717,1.373115,-1.150296,-4.733644,-1.549644,2.151197,-1.108606,...,-0.14199,-0.214265,-0.202489,-0.039088,-0.081433,-0.016657,-0.044107,-0.220216,-0.219696,-0.172986
4,-0.630926,1.81236,-0.782429,1.063061,0.416587,-0.907074,-0.282979,0.10108,0.440364,-0.429082,...,-0.14199,-0.214265,-0.202489,-0.039088,-0.081433,-0.016657,-0.044107,-0.220216,-0.219696,-0.172986


Now, load the datasets with the results from the **genetic** and **pso** algorithm executions, together with the **filtered** attribute selection:

In [4]:
# Read the selected attributes of GA, PSO and filtered (in that order)
ga, pso, filtered = (
    pd.read_csv(csv_path)
    for csv_path in (
        'results/ga_selected_attributes.csv',
        'results/pso_selected_attributes.csv',
        'results/filter_selected_attrs.csv',
    )
)

### Decision Tree definition

In [6]:
def perform_decision_tree_selector(attr, n_features=54):
    '''
    Scores one decision tree per attribute subset listed in ``attr``.

    The last ``n_features`` values of every row of ``attr`` are used as a
    column mask over the notebook-level ``data_train`` / ``data_test`` frames.
    For each row a ``DecisionTreeClassifier(random_state=0)`` is fitted on the
    masked training columns and scored on the masked test columns.

    Parameters
    ----------
    attr : pandas.DataFrame
        Leading metadata columns followed by ``n_features`` selection flags.
    n_features : int, default 54
        Number of trailing flag columns in ``attr``; must be positive.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``attr`` (labelled 0..len(attr)-1) holding the
        metadata columns of ``attr`` plus an ``accuracy`` column with the
        value returned by ``DecisionTreeClassifier.score`` on the test split.
    '''
    columns = attr.columns.tolist()[:-n_features] + ['accuracy']
    rows = []

    for index in range(len(attr)):
        # Force a real boolean mask: a list of 0/1 integers handed to ``iloc``
        # would be interpreted as column positions, not as a selection mask.
        mask = np.asarray(attr.iloc[index, -n_features:], dtype=bool)
        sliced_train = data_train.iloc[:, mask]
        sliced_test = data_test.iloc[:, mask]

        # Perform decision tree
        d_tree = DecisionTreeClassifier(random_state=0)
        d_tree.fit(sliced_train, target_train)
        accuracy = d_tree.score(sliced_test, target_test)
        rows.append(attr.iloc[index, :-n_features].tolist() + [accuracy])

    # Build the frame once instead of enlarging it row by row inside the loop.
    return pd.DataFrame(rows, columns=columns)

def perform_decision_tree_extractor(train, test):
    '''
    Fits a decision tree on ``train`` and returns its score on ``test``.

    Both frames must contain a ``cover_type`` column, which is used as the
    target; every other column is used as a feature.
    NOTE(review): if the CSVs were written with their index, that index column
    is fed to the tree as a feature too -- confirm against the files.
    '''
    label = 'cover_type'

    # Split both frames up front so a missing label column fails before fitting
    labels_train, features_train = train[label], train.drop(columns=label)
    labels_test, features_test = test[label], test.drop(columns=label)

    classifier = DecisionTreeClassifier(random_state=0)
    classifier.fit(features_train, labels_train)
    return classifier.score(features_test, labels_test)

In [8]:
# Score a decision tree for every attribute subset in the PSO results
dtree_pso_result = perform_decision_tree_selector(attr=pso)

In [7]:
# Single-row "selection" that keeps all 54 attributes, in the same layout as
# the selector results (attribute count first, then one flag per column)
full_dataset = pd.DataFrame(columns=['n_attr', *data_train.columns])
full_dataset.loc[0] = [54] + [True] * 54

# Perform decision tree for ORIGINAL DATA
dtree_original_result = perform_decision_tree_selector(attr=full_dataset)

In [13]:
# Score a decision tree for every attribute subset in the GA results
dtree_ga_result = perform_decision_tree_selector(attr=ga)


In [9]:
# Score a decision tree for every attribute subset in the filtered results
dtree_filtered_result = perform_decision_tree_selector(attr=filtered)

In [10]:
# Decision tree accuracy on each stored PCA projection
representativity = [75, 85, 95]
dtree_pca_result = pd.DataFrame(columns=['representativity (%)', 'accuracy'])
for index, r in enumerate(representativity):
    train = pd.read_csv('results/pca_' + str(r) + '.csv')
    test = pd.read_csv('results/pca_test_' + str(r) + '.csv')
    dtree_pca_result.loc[index] = [r, perform_decision_tree_extractor(train, test)]

# Store results of PCA
dtree_pca_result.to_csv('results/decision_tree_pca.csv')

In [11]:
# Decision tree accuracy on each stored LDA projection (1 to 6 components)
n_components = range(1, 7)
dtree_lda_result = pd.DataFrame(columns=['n_components', 'accuracy'])
for index, n in enumerate(n_components):
    train = pd.read_csv('results/lda_' + str(n) + '.csv')
    test = pd.read_csv('results/lda_test_' + str(n) + '.csv')
    dtree_lda_result.loc[index] = [int(n), perform_decision_tree_extractor(train, test)]

# Store results of LDA
dtree_lda_result.to_csv('results/decision_tree_lda.csv')

In [20]:
# Report, for every method, the row with the highest accuracy and that accuracy
for prefix, result in (
    ("/ original: ", dtree_original_result),
    ("/ pso: ", dtree_pso_result),
    ("/ ga: ", dtree_ga_result),
    ("/ filtered: ", dtree_filtered_result),
    ("/ pca: ", dtree_pca_result),
    ("/ lda: ", dtree_lda_result),
):
    print("Index: ", result['accuracy'].idxmax(), prefix, result['accuracy'].max())

Index:  0 / original:  0.812603993344426
Index:  261 / pso:  0.8053244592346089
Index:  203 / ga:  0.752287853577371
Index:  2 / filtered:  0.783485856905158
Index:  1 / pca:  0.7735024958402662
Index:  5 / lda:  0.769134775374376


In [21]:
# Store results: one CSV per evaluated method
for result, csv_path in (
    (dtree_pso_result, 'results/decision_tree_pso.csv'),
    (dtree_ga_result, 'results/decision_tree_ga.csv'),
    (dtree_filtered_result, 'results/decision_tree_filtered.csv'),
    (dtree_pca_result, 'results/decision_tree_pca.csv'),
    (dtree_lda_result, 'results/decision_tree_lda.csv'),
):
    result.to_csv(csv_path)

## Storing Trees

In [22]:
# ``sklearn.externals.joblib`` was deprecated in scikit-learn 0.21 and removed
# in 0.23; prefer the standalone package and fall back only on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

def save_selector_tree(attr, name, index):
    '''
    Fits a decision tree on one attribute subset and dumps it to disk.

    Parameters
    ----------
    attr : pandas.DataFrame
        Selector results; the last 54 values of a row are used as a column
        mask over the notebook-level ``data_train`` frame.
    name : str
        Suffix of the output file, written to ``models/dtree_<name>.save``.
    index : int
        Position (``iloc``) of the row of ``attr`` to use.
    '''
    # Force a real boolean mask: a list of 0/1 integers handed to ``iloc``
    # would be interpreted as column positions, not as a selection mask.
    mask = np.asarray(attr.iloc[index, -54:], dtype=bool)
    sliced_train = data_train.iloc[:, mask]

    # Fit decision tree (only the training split is needed to persist a model)
    d_tree = DecisionTreeClassifier(random_state=0)
    d_tree.fit(sliced_train, target_train)

    # Dump model, creating the output directory on a fresh checkout
    model_path = 'models/dtree_' + name + '.save'
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    joblib.dump(d_tree, model_path)

def save_extractor_tree(train, test, name):
    '''
    Fits a decision tree on an extracted-feature training set and dumps it.

    Parameters
    ----------
    train : pandas.DataFrame
        Training frame; its ``cover_type`` column is the target and every
        other column is used as a feature.
    test : pandas.DataFrame
        Not read by this function; kept so existing calls keep working.
    name : str
        Suffix of the output file, written to ``models/dtree_<name>.save``.
    '''
    train_target = train['cover_type']
    train_data   = train.loc[:, train.columns != 'cover_type']

    # Fit decision tree
    d_tree = DecisionTreeClassifier(random_state=0)
    d_tree.fit(train_data, train_target)

    # Dump model, creating the output directory on a fresh checkout
    model_path = 'models/dtree_' + name + '.save'
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    joblib.dump(d_tree, model_path)

In [23]:
# Store trees of selector algorithms
save_selector_tree(full_dataset, 'original', dtree_original_result['accuracy'].idxmax())
save_selector_tree(pso, 'pso', dtree_pso_result['accuracy'].idxmax())
save_selector_tree(ga, 'ga', dtree_ga_result['accuracy'].idxmax())
save_selector_tree(filtered, 'filtered', dtree_filtered_result['accuracy'].idxmax())

# Store trees of especific selectors
save_selector_tree(pso, 'pso_selected', 216)
save_selector_tree(ga, 'ga_selected', 2429)

In [24]:
# Store trees of extractor algorithms
pca_train = pd.read_csv('results/pca_95.csv')
pca_test  = pd.read_csv('results/pca_test_95.csv')
save_extractor_tree(pca_train, pca_test, 'pca')

lda_train = pd.read_csv('results/lda_6.csv')
lda_test  = pd.read_csv('results/lda_test_6.csv')
save_extractor_tree(lda_train, lda_test, 'lda')

## Tree information

In [35]:
# Load models
original_model = joblib.load('models/dtree_original.save')
ga_model = joblib.load('models/dtree_ga.save')
ga_selected_model = joblib.load('models/dtree_ga_selected.save')
pso_model = joblib.load('models/dtree_pso.save')
pso_selected_model = joblib.load('models/dtree_pso_selected.save')
filtered_model = joblib.load('models/dtree_filtered.save')
pca_model = joblib.load('models/dtree_pca.save')
lda_model = joblib.load('models/dtree_lda.save')

In [28]:
def tree_info(model):
    '''
    Summarises the structure of a fitted tree.

    Starting at node 0, every node whose left and right child ids differ is
    treated as a split; a node whose two child ids are equal is a leaf.

    Returns
    -------
    list
        ``[node_count, depth, leaves]`` where ``depth`` is the number of nodes
        on the longest root-to-leaf path (a lone root gives 1) and ``leaves``
        is the number of leaves reached from the root.
    '''
    structure = model.tree_
    left = structure.children_left
    right = structure.children_right

    leaf_count = 0
    deepest = 0
    # Explicit stack of (node id, nodes on the path from the root to it)
    pending = [(0, 1)]
    while pending:
        node, level = pending.pop()
        if left[node] != right[node]:
            pending.append((left[node], level + 1))
            pending.append((right[node], level + 1))
        else:  # leaf
            leaf_count += 1
            deepest = max(deepest, level)

    return [structure.node_count, deepest, leaf_count]
        

In [30]:
# Print [node count, depth, leaf count] for every stored tree
for label, fitted_model in (
    ('original_model: ', original_model),
    ('ga_model: ', ga_model),
    ('ga_selected_model: ', ga_selected_model),
    ('pso_model: ', pso_model),
    ('pso_selected_model: ', pso_selected_model),
    ('filtered_model: ', filtered_model),
    ('pca_model: ', pca_model),
    ('lda_model: ', lda_model),
):
    print(label, tree_info(fitted_model))

original_model:  [3855, 38, 1928]
ga_model:  [5809, 28, 2905]
ga_selected_model:  [3, 2, 2]
pso_model:  [4477, 29, 2239]
pso_selected_model:  [23, 8, 12]
filtered_model:  [4895, 37, 2448]
pca_model:  [3979, 31, 1990]
lda_model:  [4593, 27, 2297]


## Images of trees

In [32]:
from sklearn import tree

# Export every loaded model as a Graphviz .dot file under trees/
for fitted_model, dot_path in (
    (original_model, 'trees/original_model.dot'),
    (ga_model, 'trees/ga_model.dot'),
    (ga_selected_model, 'trees/ga_selected_model.dot'),
    (pso_model, 'trees/pso_model.dot'),
    (pso_selected_model, 'trees/pso_selected_model.dot'),
    (filtered_model, 'trees/filtered_model.dot'),
    (pca_model, 'trees/pca_model.dot'),
    (lda_model, 'trees/lda_model.dot'),
):
    tree.export_graphviz(fitted_model, out_file=dot_path)

In [34]:
# Render each exported .dot file to a PNG next to it, using the `dot`
# command-line tool through an IPython shell escape.
# The recorded output shows dot scaling down the graphs that are too large
# for its bitmap renderer.
!dot -Tpng trees/original_model.dot -o trees/original_model.png
!dot -Tpng trees/ga_model.dot -o trees/ga_model.png
!dot -Tpng trees/ga_selected_model.dot -o trees/ga_selected_model.png
!dot -Tpng trees/pso_model.dot -o trees/pso_model.png
!dot -Tpng trees/pso_selected_model.dot -o trees/pso_selected_model.png
!dot -Tpng trees/filtered_model.dot -o trees/filtered_model.png
!dot -Tpng trees/pca_model.dot -o trees/pca_model.png
!dot -Tpng trees/lda_model.dot -o trees/lda_model.png

dot: graph is too large for cairo-renderer bitmaps. Scaling by 0.192903 to fit
dot: graph is too large for cairo-renderer bitmaps. Scaling by 0.128427 to fit
dot: graph is too large for cairo-renderer bitmaps. Scaling by 0.167902 to fit
dot: graph is too large for cairo-renderer bitmaps. Scaling by 0.141599 to fit
dot: graph is too large for cairo-renderer bitmaps. Scaling by 0.210566 to fit
dot: graph is too large for cairo-renderer bitmaps. Scaling by 0.150227 to fit
