In [6]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier as DTC
from config_store import targetDataSet, randomSeed
from utility_functions import dataSetSplit, doGridSearch, score

In [7]:
# Load the configured data set into a DataFrame and display its first rows as a sanity check
data = pd.read_csv(filepath_or_buffer=targetDataSet)
data.head(n=5)

Unnamed: 0,Website,Time,Length,Protocol,Info
0,0,5.869593,66,1,4
1,0,5.881211,66,1,1
2,0,5.887175,54,1,6
3,0,5.887292,712,0,12
4,0,5.902131,1490,0,8


In [8]:
# Partition the data into train/test sets.
# Every column except "Website" is a feature; "Website" is the label to predict.
featureMatrix = data.drop(columns=["Website"]).values
labelVector = data["Website"].values
XTrain, XTest, yTrain, yTest = dataSetSplit(featureMatrix, labelVector)

In [9]:
# Hyperparameter optimization
# Candidate values to check: min_samples_split in 2..22 and min_samples_leaf in 1..20
hyperparameters = {
    "min_samples_split": np.arange(2, 23),
    "min_samples_leaf": np.arange(1, 21),
}

# Hand a seeded decision tree and the candidate grid to the project's grid search helper,
# using the training partition only
baseTree = DTC(random_state=randomSeed)
grid = doGridSearch(baseTree, hyperparameters, XTrain, yTrain)

In [10]:
# Inspect the grid search results, keeping only the tuned parameters,
# the mean test score, and the rank of each candidate
reportColumns = [
    "param_min_samples_split",
    "param_min_samples_leaf",
    "mean_test_score",
    "rank_test_score",
]
gridDataframe = pd.DataFrame(grid.cv_results_)[reportColumns]
gridDataframe

Unnamed: 0,param_min_samples_split,param_min_samples_leaf,mean_test_score,rank_test_score
0,2,1,0.974606,1
1,3,1,0.973153,2
2,4,1,0.972365,3
3,5,1,0.971946,4
4,6,1,0.970739,5
...,...,...,...,...
415,18,20,0.932118,400
416,19,20,0.932118,400
417,20,20,0.932118,400
418,21,20,0.932118,400


In [11]:
# Select the top-ranked hyperparameter combination(s); rank 1 is the best candidate
isTopRanked = gridDataframe["rank_test_score"] <= 1
bestHyperSet = gridDataframe.loc[isTopRanked]
bestHyperSet

Unnamed: 0,param_min_samples_split,param_min_samples_leaf,mean_test_score,rank_test_score
0,2,1,0.974606,1


In [12]:
# Final decision tree model
# Read the winning hyperparameters from the first top-ranked row (if several rows tie
# at rank 1, the earliest one in the table is used). `.iloc[0]` is the positional
# accessor; values are cast to plain ints because the table may hold NumPy scalars.
bestMinSamplesSplit = int(bestHyperSet["param_min_samples_split"].iloc[0])
bestMinSamplesLeaf = int(bestHyperSet["param_min_samples_leaf"].iloc[0])

# Refit on the training partition with the same seed used during the search
dtc = DTC(
    random_state = randomSeed,
    min_samples_split = bestMinSamplesSplit,
    min_samples_leaf = bestMinSamplesLeaf,
)
dtc.fit(XTrain, yTrain)

In [13]:
# Score model
# Reports the fitted tree's score on the training partition and on the held-out test partition
score(dtc, XTrain, yTrain, XTest, yTest)

Training score: 1.0
Testing score: 0.9751748596197419
