In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from joblib import dump
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression as LR
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.preprocessing import StandardScaler as SS
from sklearn.tree import DecisionTreeClassifier as DTC

from config_store import targetDataSet, randomSeed
from utility_functions import dataSetSplit, doGridSearch, score

In [2]:
# Read the CSV named by the project config into a DataFrame and show its first rows
data = pd.read_csv(filepath_or_buffer = targetDataSet)
data.head(n = 5)

Unnamed: 0,Website,Packets,Bytes,Packets A → B,Bytes A → B,Packets B → A,Bytes B → A,Duration,Bits/s A → B,Bits/s B → A
0,0,1,712,1,712,0,0,16.763719,339,0
1,0,1,718,1,718,0,0,10.784664,532,0
2,0,1,889,1,889,0,0,4.927569,1443,0
3,0,1,712,1,712,0,0,13.479684,422,0
4,0,1,718,1,718,0,0,12.706807,452,0


In [3]:
# Features are every column except "Website"; "Website" is the label.
# The project's split helper partitions both into train and test parts.
XTrain, XTest, yTrain, yTest = dataSetSplit(
    data.drop(columns = ["Website"]).values,
    data["Website"].values,
)

# Standardise the features: the scaler is fitted on the training split only,
# then the same fitted scaler transforms both splits
ss = SS().fit(XTrain)
XTrainScaled = ss.transform(XTrain)
XTestScaled = ss.transform(XTest)

In [4]:
# The top 3 models, used as base estimators by both voting ensembles below
# (logistic regression is left on its library defaults)
lr = LR()
knn = KNN(n_neighbors = 6)
dtc = DTC(min_samples_split = 6, random_state = randomSeed)

# Voting - Hard

In [5]:
# Hyperparameter optimization for the hard-voting ensemble
# Candidate weights: every (dtc, knn, lr) triple with each weight in 1..10,
# built as a 1000 x 3 integer array and passed as a list of its rows
weightAxes = np.meshgrid(range(1, 11), range(1, 11), range(1, 11))
weightTriples = np.array(weightAxes).T.reshape(-1, 3)
hyperparameters = {"weights": list(weightTriples)}

# Build the ensemble and hand it to the project's grid search helper
hardVoter = VotingClassifier(
    estimators = [("dtc", dtc), ("knn", knn), ("lr", lr)],
    voting = "hard",
    n_jobs = -1,
)
grid = doGridSearch(hardVoter, hyperparameters, XTrainScaled, yTrain)

In [6]:
# Tabulate the grid search results, keeping only the weights, the mean test
# score and the rank of each candidate
resultColumns = ["param_weights", "mean_test_score", "rank_test_score"]
gridDataframe = pd.DataFrame(grid.cv_results_)[resultColumns]
gridDataframe

Unnamed: 0,param_weights,mean_test_score,rank_test_score
0,"[1, 1, 1]",0.685549,376
1,"[1, 2, 1]",0.623659,671
2,"[1, 3, 1]",0.606159,761
3,"[1, 4, 1]",0.606159,761
4,"[1, 5, 1]",0.606159,761
...,...,...,...
995,"[10, 6, 10]",0.685610,331
996,"[10, 7, 10]",0.685610,331
997,"[10, 8, 10]",0.685610,331
998,"[10, 9, 10]",0.685610,331


In [7]:
# Keep only the candidates ranked first (tied candidates share rank 1,
# so more than one row can remain)
isTopRanked = gridDataframe["rank_test_score"].le(1)
bestHyperSet = gridDataframe.loc[isTopRanked]
bestHyperSet

Unnamed: 0,param_weights,mean_test_score,rank_test_score
20,"[3, 1, 1]",0.806707,1
30,"[4, 1, 1]",0.806707,1
31,"[4, 2, 1]",0.806707,1
40,"[5, 1, 1]",0.806707,1
41,"[5, 2, 1]",0.806707,1
...,...,...,...
592,"[10, 3, 6]",0.806707,1
680,"[9, 1, 7]",0.806707,1
690,"[10, 1, 7]",0.806707,1
691,"[10, 2, 7]",0.806707,1


In [8]:
# Final hard-voting model
# Several weight sets can tie for rank 1; the first one listed is used.
# `.iloc[0]` is the documented positional lookup for the first row.
bestWeights = bestHyperSet["param_weights"].iloc[0]
votingHard = VotingClassifier(
    n_jobs = -1,
    voting = "hard",
    estimators = [("dtc", dtc), ("knn", knn), ("lr", lr)],
    weights = bestWeights,
)
# Fit on the scaled training split; the fitted estimator is the cell's output
votingHard.fit(XTrainScaled, yTrain)

In [9]:
# Report the hard-voting model's training and testing scores
# using the project's scoring helper
score(
    votingHard,
    XTrainScaled, yTrain,
    XTestScaled, yTest,
)

Training score: 0.943069306930693
Testing score: 0.7254901960784313


In [10]:
# Save model
# Create the output directory first so the dump does not fail when
# "models/" does not exist yet (e.g. on a fresh checkout)
Path("models").mkdir(parents = True, exist_ok = True)
# NOTE(review): only the classifier is persisted. It was fitted on features
# scaled by `ss`, so the same scaling presumably has to be applied before
# predicting with the reloaded model — confirm the scaler is stored elsewhere.
dump(votingHard, "models/voting_hard.joblib")

['models/voting_hard.joblib']

# Voting - Soft

In [11]:
# Hyperparameter optimization for the soft-voting ensemble
# Candidate weights: every (dtc, knn, lr) triple with each weight in 1..10,
# built as a 1000 x 3 integer array and passed as a list of its rows
weightAxes = np.meshgrid(range(1, 11), range(1, 11), range(1, 11))
weightTriples = np.array(weightAxes).T.reshape(-1, 3)
hyperparameters = {"weights": list(weightTriples)}

# Build the ensemble and hand it to the project's grid search helper
softVoter = VotingClassifier(
    estimators = [("dtc", dtc), ("knn", knn), ("lr", lr)],
    voting = "soft",
    n_jobs = -1,
)
grid = doGridSearch(softVoter, hyperparameters, XTrainScaled, yTrain)

In [12]:
# Tabulate the soft-voting grid search results, keeping only the weights,
# the mean test score and the rank of each candidate
resultColumns = ["param_weights", "mean_test_score", "rank_test_score"]
gridDataframe = pd.DataFrame(grid.cv_results_)[resultColumns]
gridDataframe

Unnamed: 0,param_weights,mean_test_score,rank_test_score
0,"[1, 1, 1]",0.816829,64
1,"[1, 2, 1]",0.762195,726
2,"[1, 3, 1]",0.722500,814
3,"[1, 4, 1]",0.702744,865
4,"[1, 5, 1]",0.697744,885
...,...,...,...
995,"[10, 6, 10]",0.819146,14
996,"[10, 7, 10]",0.819146,14
997,"[10, 8, 10]",0.814268,240
998,"[10, 9, 10]",0.814268,250


In [13]:
# Keep only the candidates ranked first (tied candidates share rank 1,
# so more than one row can remain)
isTopRanked = gridDataframe["rank_test_score"].le(1)
bestHyperSet = gridDataframe.loc[isTopRanked]
bestHyperSet

Unnamed: 0,param_weights,mean_test_score,rank_test_score
653,"[6, 4, 7]",0.821646,1
974,"[8, 5, 10]",0.821646,1


In [14]:
# Final soft-voting model
# Several weight sets can tie for rank 1; the first one listed is used.
# `.iloc[0]` is the documented positional lookup for the first row.
bestWeights = bestHyperSet["param_weights"].iloc[0]
votingSoft = VotingClassifier(
    n_jobs = -1,
    voting = "soft",
    estimators = [("dtc", dtc), ("knn", knn), ("lr", lr)],
    weights = bestWeights,
)
# Fit on the scaled training split; the fitted estimator is the cell's output
votingSoft.fit(XTrainScaled, yTrain)

In [15]:
# Report the soft-voting model's training and testing scores
# using the project's scoring helper
score(
    votingSoft,
    XTrainScaled, yTrain,
    XTestScaled, yTest,
)

Training score: 0.9455445544554455
Testing score: 0.7549019607843137


In [16]:
# Save model
# Create the output directory first so the dump does not fail when
# "models/" does not exist yet (e.g. on a fresh checkout)
Path("models").mkdir(parents = True, exist_ok = True)
# NOTE(review): only the classifier is persisted. It was fitted on features
# scaled by `ss`, so the same scaling presumably has to be applied before
# predicting with the reloaded model — confirm the scaler is stored elsewhere.
dump(votingSoft, "models/voting_soft.joblib")

['models/voting_soft.joblib']