# Decision Tree

## Implementation

### Import libraries 
Let's first import the libraries: `pandas`, `numpy`, and the Decision Tree implementation (plus the cross-validation helper) from `sklearn`:

In [2]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

### Load results datasets

Now, load the original train and test datasets, separating the targets (the last column) from the features:

In [3]:
# Read the training split from disk
dataset_train = pd.read_csv('datasets/covertype_norm_train.csv')
# Features: every column except the last one
data_train = dataset_train.iloc[:, :-1]
# Targets: the last column only
target_train = dataset_train.iloc[:, -1]
# Preview the first feature rows
data_train.head()

Unnamed: 0,elevation,aspect,slope,horiz_dist_hydro,vert_dist_hydro,horiz_dist_road,hillshade_9,hill_shade_noon,hill_shade_15,horiz_dist_fire,...,soil_type_30,soil_type_31,soil_type_32,soil_type_33,soil_type_34,soil_type_35,soil_type_36,soil_type_37,soil_type_38,soil_type_39
0,-0.573753,-0.518424,-0.428658,0.436024,-0.475092,-0.979056,0.927864,0.14452,-0.534162,-0.220768,...,-0.14199,-0.214265,-0.202489,-0.039088,-0.081433,-0.016657,-0.044107,-0.220216,-0.219696,-0.172986
1,1.656009,-0.010549,0.868502,-0.516497,-0.280544,1.81761,0.862413,0.665801,-0.534162,2.273548,...,-0.14199,-0.214265,4.938531,-0.039088,-0.081433,-0.016657,-0.044107,-0.220216,-0.219696,-0.172986
2,0.169501,-0.799569,0.632655,0.45517,1.89191,-0.388051,0.796962,-1.245563,-1.335438,-0.687429,...,-0.14199,-0.214265,-0.202489,-0.039088,-0.081433,-0.016657,-0.044107,-0.220216,-0.219696,-0.172986
3,-1.205043,1.268208,1.576043,0.23499,1.648725,-0.649457,-2.933743,-0.15956,1.956291,-0.501856,...,-0.14199,-0.214265,-0.202489,-0.039088,-0.081433,-0.016657,-0.044107,-0.220216,-0.219696,-0.172986
4,-1.057345,0.152697,0.986425,0.134472,0.530073,-1.041945,0.404256,1.056762,-0.014415,-0.79477,...,-0.14199,-0.214265,-0.202489,-0.039088,-0.081433,-0.016657,-0.044107,-0.220216,-0.219696,-0.172986


In [4]:
# Read the test split from disk
dataset_test = pd.read_csv('datasets/covertype_norm_test.csv')
# Features: every column except the last one
data_test = dataset_test.iloc[:, :-1]
# Targets: the last column only
target_test = dataset_test.iloc[:, -1]
# Preview the first feature rows
data_test.head()

Unnamed: 0,elevation,aspect,slope,horiz_dist_hydro,vert_dist_hydro,horiz_dist_road,hillshade_9,hill_shade_noon,hill_shade_15,horiz_dist_fire,...,soil_type_30,soil_type_31,soil_type_32,soil_type_33,soil_type_34,soil_type_35,soil_type_36,soil_type_37,soil_type_38,soil_type_39
0,-1.350358,1.730737,-0.782429,-0.889847,-0.783127,-0.407751,-0.381155,0.2314,0.5703,-0.35176,...,-0.14199,-0.214265,-0.202489,-0.039088,-0.081433,-0.016657,-0.044107,-0.220216,-0.219696,-0.172986
1,-0.857238,0.78754,1.104349,-0.253237,1.081293,-0.297127,-1.7229,1.360843,1.848011,-0.840253,...,-0.14199,-0.214265,-0.202489,-0.039088,-0.081433,-0.016657,-0.044107,-0.220216,-0.219696,-0.172986
2,0.422017,1.794222,0.632655,0.459957,0.3031,1.02733,-1.101116,-1.028362,0.332083,0.262267,...,-0.14199,-0.214265,-0.202489,-0.039088,-0.081433,-0.016657,-0.044107,-0.220216,-0.219696,-0.172986
3,-1.698163,1.277277,3.10905,-0.27717,1.373115,-1.150296,-4.733644,-1.549644,2.151197,-1.108606,...,-0.14199,-0.214265,-0.202489,-0.039088,-0.081433,-0.016657,-0.044107,-0.220216,-0.219696,-0.172986
4,-0.630926,1.81236,-0.782429,1.063061,0.416587,-0.907074,-0.282979,0.10108,0.440364,-0.429082,...,-0.14199,-0.214265,-0.202489,-0.039088,-0.081433,-0.016657,-0.044107,-0.220216,-0.219696,-0.172986


Now, load the datasets with the attribute selections produced by the **genetic**, **pso** and **filter** executions:

In [12]:
# Attribute selections produced by each feature-selection run
ga = pd.read_csv('results/ga_selected_attributes.csv')        # genetic algorithm (GA)
pso = pd.read_csv('results/pso_selected_attributes.csv')      # PSO
filtered = pd.read_csv('results/filter_selected_attrs.csv')   # filter

### Decision Tree definition

In [8]:
def perform_decision_tree(attr):
    '''
    Trains and scores one decision tree per attribute selection in `attr`.

    Each row of `attr` is split into two parts: the trailing columns (one per
    feature column of the module-level `data_train`) are used as the column
    selector for `data_train` / `data_test`, and the leading columns are
    copied to the output unchanged as that row's metadata.

    For every row a `DecisionTreeClassifier(random_state=0)` is fitted on the
    selected training columns and scored on the selected test columns. The
    row index is printed after each fit as a progress indicator.

    Parameters
    ----------
    attr : pandas.DataFrame
        Attribute selections, one per row. The trailing selector columns are
        assumed to hold booleans (as in the all-`True` baseline row built in
        this notebook) -- TODO confirm for the CSV-loaded selections.

    Returns
    -------
    pandas.DataFrame
        One row per row of `attr`, holding that row's metadata columns
        followed by an 'accuracy' column with the test-set score.
    '''
    # The selector part of a row spans as many columns as there are features.
    n_features = data_train.shape[1]
    columns = attr.columns.tolist()[:-n_features] + ['accuracy']

    # Collect plain rows and build the frame once, instead of growing a
    # DataFrame one `.loc` assignment at a time.
    rows = []

    for index in range(len(attr)):
        vector = attr.iloc[index, -n_features:].tolist()
        sliced_train = data_train.iloc[:, vector]
        sliced_test = data_test.iloc[:, vector]

        # Perform decision tree
        d_tree = DecisionTreeClassifier(random_state=0)
        d_tree.fit(sliced_train, target_train)
        accuracy = d_tree.score(sliced_test, target_test)

        # Metadata must come from the row being evaluated (`index`); taking it
        # from row 0 would stamp the first row's metadata onto every result.
        rows.append(attr.iloc[index, :-n_features].tolist() + [accuracy])
        print(index,',', end='')

    return pd.DataFrame(rows, columns=columns)

In [9]:
# Perform decision tree for PSO
# Fits and scores one tree per row of `pso`; the call prints each row index
# as it finishes, which produces the long progress line below.
dtree_pso_result = perform_decision_tree(pso)

0 ,1 ,2 ,3 ,4 ,5 ,6 ,7 ,8 ,9 ,10 ,11 ,12 ,13 ,14 ,15 ,16 ,17 ,18 ,19 ,20 ,21 ,22 ,23 ,24 ,25 ,26 ,27 ,28 ,29 ,30 ,31 ,32 ,33 ,34 ,35 ,36 ,37 ,38 ,39 ,40 ,41 ,42 ,43 ,44 ,45 ,46 ,47 ,48 ,49 ,50 ,51 ,52 ,53 ,54 ,55 ,56 ,57 ,58 ,59 ,60 ,61 ,62 ,63 ,64 ,65 ,66 ,67 ,68 ,69 ,70 ,71 ,72 ,73 ,74 ,75 ,76 ,77 ,78 ,79 ,80 ,81 ,82 ,83 ,84 ,85 ,86 ,87 ,88 ,89 ,90 ,91 ,92 ,93 ,94 ,95 ,96 ,97 ,98 ,99 ,100 ,101 ,102 ,103 ,104 ,105 ,106 ,107 ,108 ,109 ,110 ,111 ,112 ,113 ,114 ,115 ,116 ,117 ,118 ,119 ,120 ,121 ,122 ,123 ,124 ,125 ,126 ,127 ,128 ,129 ,130 ,131 ,132 ,133 ,134 ,135 ,136 ,137 ,138 ,139 ,140 ,141 ,142 ,143 ,144 ,145 ,146 ,147 ,148 ,149 ,150 ,151 ,152 ,153 ,154 ,155 ,156 ,157 ,158 ,159 ,160 ,161 ,162 ,163 ,164 ,165 ,166 ,167 ,168 ,169 ,170 ,171 ,172 ,173 ,174 ,175 ,176 ,177 ,178 ,179 ,180 ,181 ,182 ,183 ,184 ,185 ,186 ,187 ,188 ,189 ,190 ,191 ,192 ,193 ,194 ,195 ,196 ,197 ,198 ,199 ,200 ,201 ,202 ,203 ,204 ,205 ,206 ,207 ,208 ,209 ,210 ,211 ,212 ,213 ,214 ,215 ,216 ,217 ,218 ,219 ,220 ,221 ,

In [15]:
# Baseline "selection": a single row that keeps all 54 features
full_dataset = pd.DataFrame(columns=['n_attr', *data_train.columns])
full_dataset.loc[0] = [54] + [True] * 54

# Perform decision tree for ORIGINAL DATA
dtree_original_result = perform_decision_tree(full_dataset)

0 ,

In [13]:
# Perform decision tree for GA
# Fits and scores one tree per row of `ga`; the call prints each row index
# as it finishes.
dtree_ga_result = perform_decision_tree(ga)


0 ,1 ,2 ,3 ,4 ,5 ,6 ,7 ,8 ,9 ,10 ,11 ,12 ,13 ,14 ,15 ,16 ,17 ,18 ,19 ,20 ,21 ,22 ,23 ,24 ,25 ,26 ,27 ,28 ,29 ,30 ,31 ,32 ,33 ,34 ,35 ,36 ,37 ,38 ,39 ,40 ,41 ,42 ,43 ,44 ,45 ,46 ,47 ,48 ,49 ,50 ,51 ,52 ,53 ,54 ,55 ,56 ,57 ,58 ,59 ,60 ,61 ,62 ,63 ,64 ,65 ,66 ,67 ,68 ,69 ,70 ,71 ,72 ,73 ,74 ,75 ,76 ,77 ,78 ,79 ,80 ,81 ,82 ,83 ,84 ,85 ,86 ,87 ,88 ,89 ,90 ,91 ,92 ,93 ,94 ,95 ,96 ,97 ,98 ,99 ,100 ,101 ,102 ,103 ,104 ,105 ,106 ,107 ,108 ,109 ,110 ,111 ,112 ,113 ,114 ,115 ,116 ,117 ,118 ,119 ,120 ,121 ,122 ,123 ,124 ,125 ,126 ,127 ,128 ,129 ,130 ,131 ,132 ,133 ,134 ,135 ,136 ,137 ,138 ,139 ,140 ,141 ,142 ,143 ,144 ,145 ,146 ,147 ,148 ,149 ,150 ,151 ,152 ,153 ,154 ,155 ,156 ,157 ,158 ,159 ,160 ,161 ,162 ,163 ,164 ,165 ,166 ,167 ,168 ,169 ,170 ,171 ,172 ,173 ,174 ,175 ,176 ,177 ,178 ,179 ,180 ,181 ,182 ,183 ,184 ,185 ,186 ,187 ,188 ,189 ,190 ,191 ,192 ,193 ,194 ,195 ,196 ,197 ,198 ,199 ,200 ,201 ,202 ,203 ,204 ,205 ,206 ,207 ,208 ,209 ,210 ,211 ,212 ,213 ,214 ,215 ,216 ,217 ,218 ,219 ,220 ,221 ,

1558 ,1559 ,1560 ,1561 ,1562 ,1563 ,1564 ,1565 ,1566 ,1567 ,1568 ,1569 ,1570 ,1571 ,1572 ,1573 ,1574 ,1575 ,1576 ,1577 ,1578 ,1579 ,1580 ,1581 ,1582 ,1583 ,1584 ,1585 ,1586 ,1587 ,1588 ,1589 ,1590 ,1591 ,1592 ,1593 ,1594 ,1595 ,1596 ,1597 ,1598 ,1599 ,1600 ,1601 ,1602 ,1603 ,1604 ,1605 ,1606 ,1607 ,1608 ,1609 ,1610 ,1611 ,1612 ,1613 ,1614 ,1615 ,1616 ,1617 ,1618 ,1619 ,1620 ,1621 ,1622 ,1623 ,1624 ,1625 ,1626 ,1627 ,1628 ,1629 ,1630 ,1631 ,1632 ,1633 ,1634 ,1635 ,1636 ,1637 ,1638 ,1639 ,1640 ,1641 ,1642 ,1643 ,1644 ,1645 ,1646 ,1647 ,1648 ,1649 ,1650 ,1651 ,1652 ,1653 ,1654 ,1655 ,1656 ,1657 ,1658 ,1659 ,1660 ,1661 ,1662 ,1663 ,1664 ,1665 ,1666 ,1667 ,1668 ,1669 ,1670 ,1671 ,1672 ,1673 ,1674 ,1675 ,1676 ,1677 ,1678 ,1679 ,1680 ,1681 ,1682 ,1683 ,1684 ,1685 ,1686 ,1687 ,1688 ,1689 ,1690 ,1691 ,1692 ,1693 ,1694 ,1695 ,1696 ,1697 ,1698 ,1699 ,1700 ,1701 ,1702 ,1703 ,1704 ,1705 ,1706 ,1707 ,1708 ,1709 ,1710 ,1711 ,1712 ,1713 ,1714 ,1715 ,1716 ,1717 ,1718 ,1719 ,1720 ,1721 ,1722 ,1723 ,1724

In [11]:
# Perform decision tree for filtered
# Fits and scores one tree per row of `filtered`, printing each row index.
dtree_filtered_result = perform_decision_tree(filtered)
# Display the resulting accuracy table as the cell output
dtree_filtered_result

0 ,1 ,2 ,

Unnamed: 0,accuracy
0,0.730033
1,0.77475
2,0.783486


In [16]:
# Best test accuracy reached by each set of attribute selections
for _label, _frame in (
    ("original: ", dtree_original_result),
    ("pso: ", dtree_pso_result),
    ("ga: ", dtree_ga_result),
    ("filtered: ", dtree_filtered_result),
):
    print(_label, _frame['accuracy'].max())

original:  0.812603993344426
pso:  0.8053244592346089
ga:  0.752287853577371
filtered:  0.783485856905158


In [17]:
# Store results: one CSV per feature-selection method
for _frame, _path in (
    (dtree_pso_result, 'results/decision_tree_pso.csv'),
    (dtree_ga_result, 'results/decision_tree_ga.csv'),
    (dtree_filtered_result, 'results/decision_tree_filtered.csv'),
):
    _frame.to_csv(_path)