In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier as DTC
from config_store import targetDataSet, randomSeed
from utility_functions import dataSetSplit, doGridSearch, score

In [2]:
# Load the target data set from its CSV file and display the first rows
data = pd.read_csv(filepath_or_buffer = targetDataSet)
data.head()

Unnamed: 0,Website,Length,Protocol,Info
0,0,66,1,4
1,0,66,1,1
2,0,54,1,6
3,0,712,0,12
4,0,1490,0,8


In [3]:
# Train/test split: "Website" is passed as the target, every other column as input
featureMatrix = data.drop(columns = ["Website"]).values
labelVector = data["Website"].values
XTrain, XTest, yTrain, yTest = dataSetSplit(featureMatrix, labelVector)

In [4]:
# Hyperparameter optimization
# Candidate values to try: min_samples_split in 2..22, min_samples_leaf in 1..20
hyperparameters = {
    "min_samples_split": np.arange(2, 23),
    "min_samples_leaf": np.arange(1, 21),
}

# Hand a seeded decision tree and the candidate grid to the grid search helper,
# using only the training split
baseTree = DTC(random_state = randomSeed)
grid = doGridSearch(baseTree, hyperparameters, XTrain, yTrain)

In [5]:
# Tabulate the grid search results, keeping only the searched parameters
# together with the mean test score and its rank
reportColumns = [
    "param_min_samples_split",
    "param_min_samples_leaf",
    "mean_test_score",
    "rank_test_score",
]
gridDataframe = pd.DataFrame(grid.cv_results_)[reportColumns]
gridDataframe

Unnamed: 0,param_min_samples_split,param_min_samples_leaf,mean_test_score,rank_test_score
0,2,1,0.717537,378
1,3,1,0.718103,328
2,4,1,0.718005,331
3,5,1,0.717906,355
4,6,1,0.717783,356
...,...,...,...,...
415,18,20,0.717389,379
416,19,20,0.717389,379
417,20,20,0.717389,379
418,21,20,0.717389,379


In [6]:
# Keep only the top-ranked row(s) of the grid search table
isTopRanked = gridDataframe["rank_test_score"] <= 1
bestHyperSet = gridDataframe.loc[isTopRanked]
bestHyperSet

Unnamed: 0,param_min_samples_split,param_min_samples_leaf,mean_test_score,rank_test_score
31,12,2,0.719409,1


In [7]:
# Final decision tree model
# bestHyperSet may contain several rows when ranks tie; the first row is used.
# Use the positional indexer .iloc[0] (not the callable form .iloc(0)[0]) and cast
# to plain int so the values are always treated as sample counts, never as fractions.
bestMinSamplesSplit = int(bestHyperSet["param_min_samples_split"].iloc[0])
bestMinSamplesLeaf = int(bestHyperSet["param_min_samples_leaf"].iloc[0])

dtc = DTC(
    random_state = randomSeed,
    min_samples_split = bestMinSamplesSplit,
    min_samples_leaf = bestMinSamplesLeaf,
)
dtc.fit(XTrain, yTrain)

In [8]:
# Evaluate the fitted tree on both the training and the test split
score(
    dtc,
    XTrain, yTrain,
    XTest, yTest,
)

Training score: 0.7326354679802956
Testing score: 0.7213082454930548
