## Decision Tree with different hyperparameters

In this example, we build several decision trees, each with different hyperparameters. Hyperparameters are settings chosen before training that configure how the model is built, as opposed to values the model learns from the data.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

### Load dataset
Let's use the wholesale customer spending data (`Wholesale.csv`) from the UCI Machine Learning Repository.


In [2]:
# Read the wholesale data from a CSV file located in the working directory.
df_lg = pd.read_csv('Wholesale.csv')
# Preview the first rows to check which columns were loaded.
df_lg.head()

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
0,2,3,12669,9656,7561,214,2674,1338
1,2,3,7057,9810,9568,1762,3293,1776
2,2,3,6353,8808,7684,2405,3516,7844
3,1,3,13265,1196,4221,6404,507,1788
4,2,3,22615,5410,7198,3915,1777,5185


In [3]:
# Features: every column from position 2 onward; target: the 'Channel' column.
features = df_lg.iloc[:, 2:]
target = df_lg['Channel']

# A fixed random_state keeps the split reproducible across runs; the test
# proportion is left at train_test_split's default.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    features,
    target,
    random_state=42,
)

In [4]:
# Peek at the first rows of the training features to see which columns were kept.
X2_train.head()

Unnamed: 0,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
314,10617,1795,7647,1483,857,1233
3,13265,1196,4221,6404,507,1788
18,18601,6327,10099,2205,2767,3181
393,23632,6730,3842,8620,385,819
60,8590,3045,7854,96,4095,225


### Build a function to build a decision tree
This function accepts different hyperparameters, prints the resulting accuracy score on the test set, and returns it.

In [5]:
def decTreeScore2(crit = 'gini',  maxDepth = None, minSamples = 1, minSplit = 2):
    """Fit a decision tree with the given hyperparameters and report test accuracy.

    The tree is trained on the notebook-level ``X2_train`` / ``y2_train`` and
    scored on ``X2_test`` / ``y2_test``; those variables must already exist
    when this function is called.

    Parameters
    ----------
    crit : passed to ``DecisionTreeClassifier`` as ``criterion``.
    maxDepth : passed as ``max_depth``.
    minSamples : passed as ``min_samples_leaf``.
    minSplit : passed as ``min_samples_split``.

    Returns
    -------
    The value of ``accuracy_score`` on the test split. The same value is also
    printed, so a bare call at the end of a cell shows it twice.
    """
    # random_state is pinned so repeated calls with the same arguments agree.
    hyperparams = {
        'criterion': crit,
        'max_depth': maxDepth,
        'min_samples_leaf': minSamples,
        'min_samples_split': minSplit,
        'random_state': 42,
    }
    model = DecisionTreeClassifier(**hyperparams)
    model.fit(X2_train, y2_train)

    predictions = model.predict(X2_test)
    accuracy = accuracy_score(y2_test, predictions)
    print(accuracy)
    return accuracy

In [6]:
# Baseline: every hyperparameter is left at the function's default value.
# The score appears twice below because the function prints it and the cell
# also echoes the returned value.
decTreeScore2()
# the accuracy score for our model with default hyperparams is about 0.88

0.8818181818181818


0.8818181818181818

In [7]:
decTreeScore2(crit = 'entropy')
# if we use entropy (information gain) as the split criterion instead of the gini score,
# the accuracy drops (about 0.85 versus about 0.88 with gini)

0.8545454545454545


0.8545454545454545

In [8]:
# Sweep the maximum tree depth over 1..14, printing the test accuracy for each value.
depth_candidates = np.arange(1, 15)
for depth in depth_candidates:
    decTreeScore2(maxDepth=depth)

# Observation: the accuracy peaks (0.9) when the maximum depth is 2 or 5.

0.8818181818181818
0.9
0.8818181818181818
0.8909090909090909
0.9
0.8727272727272727
0.8636363636363636
0.8727272727272727
0.8818181818181818
0.8818181818181818
0.8818181818181818
0.8818181818181818
0.8818181818181818
0.8818181818181818


In [9]:
# Sweep the minimum number of samples per leaf over 1..14.
leaf_size_candidates = np.arange(1, 15)
for leaf_size in leaf_size_candidates:
    decTreeScore2(minSamples=leaf_size)

# Observation: once the minimum leaf size is greater than 8 (9 through 14 here),
# the accuracy reaches its highest value (0.9).

0.8818181818181818
0.8909090909090909
0.8909090909090909
0.8818181818181818
0.8727272727272727
0.8818181818181818
0.8818181818181818
0.8818181818181818
0.9
0.9
0.9
0.9
0.9
0.9


In [10]:
# Sweep the minimum number of samples required to split a node over 2..14.
split_size_candidates = np.arange(2, 15)
for split_size in split_size_candidates:
    decTreeScore2(minSplit=split_size)

# Observation: the minimum samples required to split barely affects accuracy;
# only the value 3 gives a slightly different score.

0.8818181818181818
0.8909090909090909
0.8818181818181818
0.8818181818181818
0.8818181818181818
0.8818181818181818
0.8818181818181818
0.8818181818181818
0.8818181818181818
0.8818181818181818
0.8818181818181818
0.8818181818181818
0.8818181818181818
