In [33]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier as RFC
from config_store import targetDataSet, randomSeed
from utility_functions import dataSetSplit, doGridSearch, score

In [34]:
# Load the target data set from the configured CSV path and display its
# first five rows as a quick sanity check
data = pd.read_csv(filepath_or_buffer=targetDataSet)
data.head(n=5)

Unnamed: 0,Website,Time,Length,Protocol,Info
0,0,5.869593,66,1,4
1,0,5.881211,66,1,1
2,0,5.887175,54,1,6
3,0,5.887292,712,0,12
4,0,5.902131,1490,0,8


In [35]:
# Split into train and test partitions: "Website" is the label column and
# every remaining column is used as a feature. The split itself is delegated
# to the project helper dataSetSplit.
XTrain, XTest, yTrain, yTest = dataSetSplit(
    data.drop(columns="Website").values,
    data["Website"].values,
)

In [36]:
# Hyperparameter optimization: candidate values explored by the grid search
#   n_estimators      -> 100, 200, ..., 500
#   min_samples_split -> 2 .. 10
#   min_samples_leaf  -> 1 .. 10
hyperparameters = {
    "n_estimators": np.arange(100, 501, 100),
    "min_samples_split": np.arange(2, 11),
    "min_samples_leaf": np.arange(1, 11),
}

# Run the grid search over the candidate hyperparameters on the training
# split, using a seeded random forest as the base estimator
grid = doGridSearch(
    RFC(random_state=randomSeed),
    hyperparameters,
    XTrain,
    yTrain,
)

In [37]:
# Tabulate the grid search results, keeping only the searched parameters,
# the mean test score and the rank of each parameter combination
gridDataframe = pd.DataFrame(grid.cv_results_).loc[
    :,
    [
        "param_n_estimators",
        "param_min_samples_split",
        "param_min_samples_leaf",
        "mean_test_score",
        "rank_test_score",
    ],
]
gridDataframe

Unnamed: 0,param_n_estimators,param_min_samples_split,param_min_samples_leaf,mean_test_score,rank_test_score
0,100,2,1,0.965049,9
1,200,2,1,0.965443,5
2,300,2,1,0.965345,6
3,400,2,1,0.965567,4
4,500,2,1,0.965320,7
...,...,...,...,...,...
445,100,10,10,0.925616,442
446,200,10,10,0.925813,433
447,300,10,10,0.926700,415
448,400,10,10,0.926675,424


In [38]:
# Keep only the top-ranked parameter combination(s); several rows can be
# returned if multiple combinations are tied at rank 1
bestHyperSet = gridDataframe.loc[gridDataframe["rank_test_score"].le(1)]
bestHyperSet

Unnamed: 0,param_n_estimators,param_min_samples_split,param_min_samples_leaf,mean_test_score,rank_test_score
8,400,3,1,0.965936,1


In [39]:
# Final random forest model, built from the best hyperparameters found by the
# grid search and fitted on the training split.
# bestHyperSet may hold several rows when combinations tie at rank 1, so the
# first row is taken via the idiomatic positional lookup `.iloc[0]`.
# Values are cast to plain Python ints because entries read back from the
# results table are NumPy scalars rather than built-in ints.
rfc = RFC(
    random_state=randomSeed,
    n_estimators=int(bestHyperSet["param_n_estimators"].iloc[0]),
    min_samples_split=int(bestHyperSet["param_min_samples_split"].iloc[0]),
    min_samples_leaf=int(bestHyperSet["param_min_samples_leaf"].iloc[0]),
)
rfc.fit(XTrain, yTrain)

In [40]:
# Score the fitted model on both partitions; the project helper `score`
# reports a training score and a testing score (see the cell output)
score(rfc, XTrain, yTrain, XTest, yTest)

Training score: 0.9991871921182266
Testing score: 0.9692641119101566
