In [9]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier as RFC
from config_store import targetDataSet, randomSeed
from utility_functions import dataSetSplit, doGridSearch, score

In [10]:
# Load the data set named by the project config and show its first rows
data = pd.read_csv(filepath_or_buffer = targetDataSet)
data.head(n = 5)

Unnamed: 0,Website,Length,Protocol,Info
0,0,66,1,4
1,0,66,1,1
2,0,54,1,6
3,0,712,0,12
4,0,1490,0,8


In [11]:
# Split the data into train/test partitions.
# "Website" is the label column; every remaining column is used as a feature.
features = data.drop(columns = ["Website"]).values
labels = data["Website"].values
XTrain, XTest, yTrain, yTest = dataSetSplit(features, labels)

In [12]:
# Perform hyperparameter optimization
# Candidate values for each random forest hyperparameter (upper bounds inclusive):
#   n_estimators:      100, 200, ..., 500
#   min_samples_split: 2 .. 20
#   min_samples_leaf:  1 .. 20
treeCounts = np.arange(100, 501, 100)
splitSizes = np.arange(2, 21)
leafSizes = np.arange(1, 21)
hyperparameters = {
    "n_estimators": treeCounts,
    "min_samples_split": splitSizes,
    "min_samples_leaf": leafSizes,
}

# Hand a seeded classifier and the candidate grid to the project's search helper
baseForest = RFC(random_state = randomSeed)
grid = doGridSearch(baseForest, hyperparameters, XTrain, yTrain)

In [13]:
# Check grid search results
# Keep only the searched parameters plus the mean score and rank columns
resultColumns = [
    "param_n_estimators",
    "param_min_samples_split",
    "param_min_samples_leaf",
    "mean_test_score",
    "rank_test_score",
]
gridDataframe = pd.DataFrame(grid.cv_results_).loc[:, resultColumns]
gridDataframe

Unnamed: 0,param_n_estimators,param_min_samples_split,param_min_samples_leaf,mean_test_score,rank_test_score
0,100,2,1,0.717635,453
1,200,2,1,0.717931,446
2,300,2,1,0.717783,450
3,400,2,1,0.718276,430
4,500,2,1,0.718300,429
...,...,...,...,...,...
1895,100,20,20,0.702562,1806
1896,200,20,20,0.701921,1882
1897,300,20,20,0.702340,1825
1898,400,20,20,0.702635,1749


In [14]:
# Show best hyperparameter set
# Select the top-ranked row(s); tied candidates would all be listed here
isTopRanked = gridDataframe["rank_test_score"] <= 1
bestHyperSet = gridDataframe.loc[isTopRanked]
bestHyperSet

Unnamed: 0,param_n_estimators,param_min_samples_split,param_min_samples_leaf,mean_test_score,rank_test_score
225,100,9,3,0.721552,1


In [15]:
# Final random forest model
# Read the hyperparameters from the first top-ranked row of the grid search
# results (if several candidates tie for rank 1, the first one is used).
# The values are converted to plain Python ints so the estimator does not
# receive elements of an object-dtype results column.
bestRow = bestHyperSet.iloc[0]
rfc = RFC(
    random_state = randomSeed,
    n_estimators = int(bestRow["param_n_estimators"]),
    min_samples_split = int(bestRow["param_min_samples_split"]),
    min_samples_leaf = int(bestRow["param_min_samples_leaf"]),
)
rfc.fit(XTrain, yTrain)

In [16]:
# Evaluate the fitted forest on both the training and the held-out test partitions
score(
    rfc,
    XTrain, yTrain,
    XTest, yTest,
)

Training score: 0.7330049261083744
Testing score: 0.7243621318096739
