In [6]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import StandardScaler as SS
from joblib import dump
from config_store import targetDataSet, randomSeed
from utility_functions import dataSetSplit, doGridSearch, score

In [7]:
# Load the target data set from the configured path and show its first rows
data = pd.read_csv(filepath_or_buffer = targetDataSet)
data.head(n = 5)

Unnamed: 0,Website,Packets,Bytes,Total Packets,Percent Filtered,Packets A → B,Bytes A → B,Packets B → A,Bytes B → A,Duration,Bits/s A → B,Bits/s B → A
0,0,1,712,1256,0.08,1,712,0,0,16.763719,339,0
1,0,1,718,45,2.22,1,718,0,0,10.784664,532,0
2,0,1,889,28,3.57,1,889,0,0,4.927569,1443,0
3,0,1,712,1098,0.09,1,712,0,0,13.479684,422,0
4,0,1,718,52,1.92,1,718,0,0,12.706807,452,0


In [8]:
# Split into features (every column except "Website") and labels ("Website"),
# then into train/test partitions via the project helper
features = data.drop(columns = ["Website"]).values
labels = data["Website"].values
XTrain, XTest, yTrain, yTest = dataSetSplit(features, labels)

# Standardize features: the scaler is fitted on the training partition only,
# and the same fitted scaler is then applied to the test partition
ss = SS()
XTrainScaled = ss.fit_transform(XTrain)
XTestScaled = ss.transform(XTest)

In [9]:
# Base estimators that the voting ensembles in this notebook combine
lr = LR()
knn = KNN(n_neighbors = 6)
dtc = DTC(min_samples_split = 6, random_state = randomSeed)

# Voting - Hard

In [10]:
# Perform hyperparameter optimization
# Candidate vote weights: every integer triple (dtc, knn, lr) with each weight in 1..10.
# Row order after the transpose/reshape: third weight varies slowest, then the first,
# and the second varies fastest.
weightRange = range(1, 11)
weightMesh = np.array(np.meshgrid(weightRange, weightRange, weightRange))
weightTriples = weightMesh.T.reshape(-1, 3)
hyperparameters = {"weights": list(weightTriples)}

# Grid search over the weight triples for a hard-voting ensemble of the three
# base estimators, using the scaled training partition
hardVoter = VotingClassifier(
    estimators = [("dtc", dtc), ("knn", knn), ("lr", lr)],
    voting = "hard",
    n_jobs = -1,
)
grid = doGridSearch(hardVoter, hyperparameters, XTrainScaled, yTrain)

In [11]:
# Tabulate the grid search results, keeping only the weights, mean score and rank
resultColumns = ["param_weights", "mean_test_score", "rank_test_score"]
gridDataframe = pd.DataFrame(grid.cv_results_)[resultColumns]
gridDataframe

Unnamed: 0,param_weights,mean_test_score,rank_test_score
0,"[1, 1, 1]",0.764756,376
1,"[1, 2, 1]",0.710366,671
2,"[1, 3, 1]",0.695610,716
3,"[1, 4, 1]",0.695610,716
4,"[1, 5, 1]",0.695610,716
...,...,...,...
995,"[10, 6, 10]",0.767317,331
996,"[10, 7, 10]",0.767317,331
997,"[10, 8, 10]",0.767317,331
998,"[10, 9, 10]",0.767317,331


In [12]:
# Keep every weight triple that ranked first (ties all share rank 1)
isTopRanked = gridDataframe["rank_test_score"] <= 1
bestHyperSet = gridDataframe[isTopRanked]
bestHyperSet

Unnamed: 0,param_weights,mean_test_score,rank_test_score
20,"[3, 1, 1]",0.886159,1
30,"[4, 1, 1]",0.886159,1
31,"[4, 2, 1]",0.886159,1
40,"[5, 1, 1]",0.886159,1
41,"[5, 2, 1]",0.886159,1
...,...,...,...
592,"[10, 3, 6]",0.886159,1
680,"[9, 1, 7]",0.886159,1
690,"[10, 1, 7]",0.886159,1
691,"[10, 2, 7]",0.886159,1


In [13]:
# Final hard-voting model, fitted on the scaled training partition.
# Several weight triples can tie for rank 1; the first rank-1 row (in grid order)
# is the one used. `.iloc[0]` is the positional lookup for that first row.
bestHardWeights = bestHyperSet["param_weights"].iloc[0]
votingHard = VotingClassifier(
    estimators = [("dtc", dtc), ("knn", knn), ("lr", lr)],
    voting = "hard",
    weights = bestHardWeights,
    n_jobs = -1,
)
votingHard.fit(XTrainScaled, yTrain)

In [14]:
# Report training and testing scores for the hard-voting model via the project helper
score(
    votingHard,
    XTrainScaled, yTrain,
    XTestScaled, yTest,
)

Training score: 0.9678217821782178
Testing score: 0.8333333333333334


In [15]:
# Persist the fitted hard-voting ensemble with joblib
# NOTE(review): only the classifier is written in this cell; it was trained on
# features scaled by `ss`, which is not saved here — confirm the scaler is
# persisted elsewhere if the model is loaded outside this notebook.
hardModelPath = "models/voting_hard.joblib"
dump(votingHard, hardModelPath)

['models/voting_hard.joblib']

# Voting - Soft

In [16]:
# Perform hyperparameter optimization
# Candidate vote weights: every integer triple (dtc, knn, lr) with each weight in 1..10.
# Row order after the transpose/reshape: third weight varies slowest, then the first,
# and the second varies fastest.
weightRange = range(1, 11)
weightMesh = np.array(np.meshgrid(weightRange, weightRange, weightRange))
weightTriples = weightMesh.T.reshape(-1, 3)
hyperparameters = {"weights": list(weightTriples)}

# Grid search over the weight triples for a soft-voting ensemble of the three
# base estimators, using the scaled training partition
softVoter = VotingClassifier(
    estimators = [("dtc", dtc), ("knn", knn), ("lr", lr)],
    voting = "soft",
    n_jobs = -1,
)
grid = doGridSearch(softVoter, hyperparameters, XTrainScaled, yTrain)

In [17]:
# Tabulate the grid search results, keeping only the weights, mean score and rank
resultColumns = ["param_weights", "mean_test_score", "rank_test_score"]
gridDataframe = pd.DataFrame(grid.cv_results_)[resultColumns]
gridDataframe

Unnamed: 0,param_weights,mean_test_score,rank_test_score
0,"[1, 1, 1]",0.866463,479
1,"[1, 2, 1]",0.819512,745
2,"[1, 3, 1]",0.792256,833
3,"[1, 4, 1]",0.772378,879
4,"[1, 5, 1]",0.767439,892
...,...,...,...
995,"[10, 6, 10]",0.883720,168
996,"[10, 7, 10]",0.876280,329
997,"[10, 8, 10]",0.878780,249
998,"[10, 9, 10]",0.871341,446


In [18]:
# Keep every weight triple that ranked first (ties all share rank 1)
isTopRanked = gridDataframe["rank_test_score"] <= 1
bestHyperSet = gridDataframe[isTopRanked]
bestHyperSet

Unnamed: 0,param_weights,mean_test_score,rank_test_score
274,"[8, 5, 3]",0.88872,1


In [19]:
# Final soft-voting model, fitted on the scaled training partition.
# The weights come from the first rank-1 row of the grid search results;
# `.iloc[0]` is the positional lookup for that first row.
bestSoftWeights = bestHyperSet["param_weights"].iloc[0]
votingSoft = VotingClassifier(
    estimators = [("dtc", dtc), ("knn", knn), ("lr", lr)],
    voting = "soft",
    weights = bestSoftWeights,
    n_jobs = -1,
)
votingSoft.fit(XTrainScaled, yTrain)

In [20]:
# Report training and testing scores for the soft-voting model via the project helper
score(
    votingSoft,
    XTrainScaled, yTrain,
    XTestScaled, yTest,
)

Training score: 0.9777227722772277
Testing score: 0.8431372549019608


In [21]:
# Persist the fitted soft-voting ensemble with joblib
# NOTE(review): only the classifier is written in this cell; it was trained on
# features scaled by `ss`, which is not saved here — confirm the scaler is
# persisted elsewhere if the model is loaded outside this notebook.
softModelPath = "models/voting_soft.joblib"
dump(votingSoft, softModelPath)

['models/voting_soft.joblib']