# Hyperparameters Tuning

In [5]:
import numpy as np
import pandas as pd

In [6]:
train = pd.read_csv('/home/frenk/Documenti/Artificial Intelligence/UCI_HAR_Dataset/train.csv')
test = pd.read_csv('/home/frenk/Documenti/Artificial Intelligence/UCI_HAR_Dataset/test.csv')

In [7]:
X_train = train.drop(['subject', 'Activity', 'ActivityName'], axis=1)
y_train = train.ActivityName

In [8]:
X_test = test.drop(['subject', 'Activity', 'ActivityName'], axis=1)
y_test = test.ActivityName

In [5]:
labels=['LAYING', 'SITTING','STANDING','WALKING','WALKING_DOWNSTAIRS','WALKING_UPSTAIRS']

In [6]:
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt

dt = DecisionTreeClassifier()

Analyzing best value for min_samples_leaf

In [7]:
accuracy_res = [0]*10
for j in range (0,10):
    for i in range (0,10):
        dt = DecisionTreeClassifier(min_samples_leaf = i+1)
        dt.fit(X_train, y_train)
        y_pred = dt.predict(X_test)
        accuracy_res[i] += metrics.accuracy_score(y_test, y_pred)

In [8]:
for i in range (0,10):
    accuracy_res[i] = accuracy_res[i]/10
print (accuracy_res)
max_accuracy = max(accuracy_res)
max_index = accuracy_res.index(max_accuracy)
print('Max Accuracy: ', max_accuracy, 'obtained with min_samples_leaf = ', max_index+1)

[0.8575500508992195, 0.8607736681370886, 0.8654903291482864, 0.8696640651510009, 0.8730912792670512, 0.8749915167967425, 0.8696301323379709, 0.8723447573803866, 0.8716321683067527, 0.8731252120800816]
Max Accuracy:  0.8749915167967425 obtained with min_samples_leaf =  6


Doing the same thing for max_depth starting from 4 to 13 as values for max depth

In [9]:
accuracy_md = [0]*10
for j in range (0,10):
    for i in range (0,10):
        dt = DecisionTreeClassifier(max_depth = i+4)
        dt.fit(X_train, y_train)
        y_pred = dt.predict(X_test)
        accuracy_md[i] += metrics.accuracy_score(y_test, y_pred)

In [11]:
for i in range (0,10):
    accuracy_md[i] = accuracy_md[i]/10
print (accuracy_md)
max_accuracy_md = max(accuracy_md)
max_index = accuracy_md.index(max_accuracy_md)
print('Max Accuracy: ', max_accuracy, 'obtained with max_depth = ', max_index+4)

[0.0809636918900577, 0.08388870037326095, 0.08547336274177128, 0.08640990838140483, 0.0872718018323719, 0.08731591448931117, 0.08668137088564641, 0.08667797760434341, 0.0864234815066169, 0.08625381744146589]
Max Accuracy:  0.8749915167967425 obtained with max_depth =  9


Comparing entropy and gini

In [12]:
accuracy_res_t = [0]*2
for i in range (0,10):
    dte = DecisionTreeClassifier(criterion = 'entropy')
    dtg = DecisionTreeClassifier(criterion = 'gini')
    dte.fit(X_train, y_train)
    dtg.fit(X_train, y_train)
    y_pred_e = dte.predict(X_test)
    y_pred_g = dtg.predict(X_test)
    accuracy_res_t[0] += metrics.accuracy_score(y_test, y_pred_e)
    accuracy_res_t[1] += metrics.accuracy_score(y_test, y_pred_g)

In [15]:
for i in range (0,2):
    accuracy_res_t[i] = accuracy_res_t[i]/10
print (accuracy_res_t)
max_accuracy_res_t = max(accuracy_res_t)
max_index = accuracy_res_t.index(max_accuracy_res_t)
print('Max Accuracy: ', max_accuracy, 'obtained in case = ', max_index)
print('\n 0 stands for entropy, 1 for gini')

[0.8546657617916524, 0.859653885307092]
Max Accuracy:  0.8749915167967425 obtained in case =  1

 0 stands for entropy, 1 for gini


Testing all together

In [16]:
dt = DecisionTreeClassifier(criterion = 'gini', min_samples_leaf = 6, max_depth = 9)
dt.fit(X_train,y_train)
y_pred = dt.predict(X_test)
accuracy = metrics.accuracy_score(y_test, y_pred)
print('Accuracy: ', accuracy)

Accuracy:  0.8771632168306752


## GridSearch CV

In [20]:
from sklearn.model_selection import GridSearchCV
from scipy.stats import randint
from sklearn.tree import DecisionTreeClassifier
depths = np.arange(4, 20, 1)
samples= np.arange(2, 10, 1)
parameters = {'criterion':['gini', 'entropy'], 'max_depth': depths,  'min_samples_leaf':samples, 'min_samples_split':samples}
dt = DecisionTreeClassifier()
dt_grid = GridSearchCV(dt,param_grid=parameters)
dt_grid.fit(X_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid={'criterion': ['gini', 'entropy'],
                

In [21]:
print("Tuned Decision Tree Parameters: {}".format(dt_grid.best_params_))
print("Best Score is {}".format(dt_grid.best_score_))

Tuned Decision Tree Parameters: {'criterion': 'entropy', 'max_depth': 16, 'min_samples_leaf': 6, 'min_samples_split': 7}
Best Score is 0.8722822643673377
