In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split as tts
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.tree import DecisionTreeClassifier as DTC

In [16]:
# Load the sanitized, label-mapped capture from CSV into a DataFrame and
# display its first five rows as the cell output.
data = pd.read_csv(
    filepath_or_buffer = "sanitized_data/sanitized_data_2024-04-17_10-31-46-218385_mapped.csv"
)
data.head(n = 5)

Unnamed: 0,Website,Time,Length,Protocol,Info
0,0,5.869593,66,1,4
1,0,5.881211,66,1,1
2,0,5.887175,54,1,6
3,0,5.887292,712,0,12
4,0,5.902131,1490,0,8


In [17]:
# Split into train and test partitions.
# Features are every column except 'Website'; the target is 'Website'.
# 20% of the rows are held out, shuffled with a fixed seed for repeatability.
XTrain, XTest, yTrain, yTest = tts(
    data.drop(columns = ['Website']).values,
    data['Website'].values,
    test_size = 0.20,
    shuffle = True,
    random_state = 201,
)

In [18]:
# Hyperparameter optimization via exhaustive grid search.
# Candidate values: min_samples_split in 2..22 and min_samples_leaf in 1..20
# (the upper bounds passed to np.arange are exclusive).
hyperparameters = {
    "min_samples_split": np.arange(2, 23),
    "min_samples_leaf": np.arange(1, 21),
}
# Shuffled 10-fold cross validation with a fixed seed.
crossValidation = KFold(n_splits = 10, shuffle = True, random_state = 146)
# Evaluate every combination of the grid on accuracy, fitting on the training split only.
grid = GridSearchCV(
    DTC(random_state = 146),
    param_grid = hyperparameters,
    cv = crossValidation,
    scoring = 'accuracy',
)
grid.fit(XTrain, yTrain)

In [22]:
# Tabulate the grid search results, keeping only the two searched parameters,
# the mean cross-validated score and the resulting rank for each candidate.
gridDataframe = pd.DataFrame(grid.cv_results_).loc[
    :,
    [
        "param_min_samples_split",
        "param_min_samples_leaf",
        "mean_test_score",
        "rank_test_score",
    ],
]
gridDataframe

Unnamed: 0,param_min_samples_split,param_min_samples_leaf,mean_test_score,rank_test_score
0,2,1,0.973202,1
1,3,1,0.972882,2
2,4,1,0.972315,3
3,5,1,0.971355,4
4,6,1,0.969754,5
...,...,...,...,...
415,18,20,0.931798,400
416,19,20,0.931798,400
417,20,20,0.931798,400
418,21,20,0.931798,400


In [27]:
# Show the top-ranked hyperparameter combination(s); rank 1 is the best mean score.
gridDataframe.query("rank_test_score <= 1")

Unnamed: 0,param_min_samples_split,param_min_samples_leaf,mean_test_score,rank_test_score
0,2,1,0.973202,1


In [29]:
# Final decision tree model
# Take the winning hyperparameters straight from the fitted grid search rather
# than retyping them from the table output, so this cell stays correct if the
# data, the parameter grid or the seeds change. grid.best_params_ holds the
# min_samples_split / min_samples_leaf values of the rank-1 candidate.
dtc = DTC(random_state = 146, **grid.best_params_)
dtc.fit(XTrain, yTrain)

In [30]:
# Mean accuracy of the final tree on the data it was fitted on.
# This is a training score, not an estimate of generalization.
dtc.score(X = XTrain, y = yTrain)

1.0

In [31]:
# Mean accuracy of the final tree on the held-out test split.
dtc.score(X = XTest, y = yTest)

0.9753718845433947