In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import BaggingClassifier
from sklearn.preprocessing import StandardScaler as SS
from joblib import dump
from config_store import targetDataSet, randomSeed
from utility_functions import dataSetSplit, doGridSearch, score

In [2]:
# Import and preview data set
data = pd.read_csv(targetDataSet)
data.head()

Unnamed: 0,Website,Packets,Bytes,Packets A → B,Bytes A → B,Packets B → A,Bytes B → A,Duration,Bits/s A → B,Bits/s B → A
0,0,1,712,1,712,0,0,16.763719,339,0
1,0,1,718,1,718,0,0,10.784664,532,0
2,0,1,889,1,889,0,0,4.927569,1443,0
3,0,1,712,1,712,0,0,13.479684,422,0
4,0,1,718,1,718,0,0,12.706807,452,0


In [3]:
# Separate data set using test/train split
XTrain, XTest, yTrain, yTest = dataSetSplit(data.drop(columns = ["Website"]).values, data["Website"].values)

# Scale data
ss = SS()
XTrainScaled = ss.fit_transform(XTrain)
XTestScaled = ss.transform(XTest)

In [4]:
# Define top 3 models to use in bagging
dtc = DTC(random_state = randomSeed, min_samples_split = 6)
knn = KNN(n_neighbors = 6)
lr = LR()

# Bagging - DTC

In [5]:
# Perform hyperparameter optimization
# Define hyperparameters to check
hyperparameters = dict(max_features = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1], max_samples = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1], n_estimators = np.array(range(10, 110, 10)))

# Set up and perform grid search
grid = doGridSearch(BaggingClassifier(estimator = dtc, n_jobs = -1, random_state = randomSeed), hyperparameters, XTrain, yTrain)

In [6]:
# Check grid search results
gridDataframe = pd.DataFrame(grid.cv_results_)[["param_max_features", "param_max_samples", "param_n_estimators", "mean_test_score", "rank_test_score"]]
gridDataframe

Unnamed: 0,param_max_features,param_max_samples,param_n_estimators,mean_test_score,rank_test_score
0,0.1,0.1,10,0.517256,898
1,0.1,0.1,20,0.554268,895
2,0.1,0.1,30,0.574146,892
3,0.1,0.1,40,0.648110,858
4,0.1,0.1,50,0.687866,793
...,...,...,...,...,...
995,1.0,1.0,60,0.183354,911
996,1.0,1.0,70,0.178354,941
997,1.0,1.0,80,0.183354,911
998,1.0,1.0,90,0.170854,971


In [7]:
# Show best hyperparameter set
bestHyperSet = gridDataframe[gridDataframe["rank_test_score"] <= 1]
bestHyperSet

Unnamed: 0,param_max_features,param_max_samples,param_n_estimators,mean_test_score,rank_test_score
466,0.5,0.7,70,0.861098,1
479,0.5,0.8,100,0.861098,1


In [8]:
# Final model
bagDTC = BaggingClassifier(estimator = dtc, n_jobs = -1, random_state = randomSeed, max_features = bestHyperSet["param_max_features"].iloc(0)[0], max_samples = bestHyperSet["param_max_samples"].iloc(0)[0], n_estimators = bestHyperSet["param_n_estimators"].iloc(0)[0])
bagDTC.fit(XTrain, yTrain)

In [9]:
# Score model
score(bagDTC, XTrain, yTrain, XTest, yTest)

Training score: 0.9306930693069307
Testing score: 0.8333333333333334


In [10]:
# Save model
dump(bagDTC, "models/bagging_dtc.joblib")

['models/bagging_dtc.joblib']

In [8]:
from sklearn.metrics import recall_score, precision_score, f1_score
from joblib import load

model = load("models/bagging_dtc.joblib")
print(recall_score(yTest, model.predict(XTest), average="macro"))
print(precision_score(yTest, model.predict(XTest), average="macro"))
print(f1_score(yTest, model.predict(XTest), average="macro"))

0.8347954333248451
0.8346087842251524
0.8325076670313946


# Bagging - KNN

In [11]:
# Perform hyperparameter optimization
# Set up and perform grid search
grid = doGridSearch(BaggingClassifier(estimator = knn, n_jobs = -1, random_state = randomSeed), hyperparameters, XTrainScaled, yTrain)

 0.59109756 0.62823171 0.63341463 0.63829268 0.56170732 0.57920732
 0.61676829 0.67530488 0.71       0.6802439  0.67786585 0.7
 0.70006098 0.70993902 0.60621951 0.66054878 0.68359756 0.74237805
 0.7570122  0.70493902 0.7097561  0.7172561  0.7175     0.71993902
 0.62115854 0.68786585 0.67359756 0.7225     0.7425     0.72487805
 0.72481707 0.73719512 0.7325     0.73237805 0.60878049 0.66792683
 0.67335366 0.75237805 0.75981707 0.74463415 0.7345122  0.7470122
 0.75213415 0.75463415 0.65323171 0.69030488 0.68060976 0.7525
 0.76457317 0.7570122  0.75682927 0.7645122  0.75207317 0.7472561
 0.63567073 0.68768293 0.69304878 0.74256098 0.76207317 0.7520122
 0.75926829 0.75457317 0.7497561  0.76689024 0.65323171 0.70286585
 0.66829268 0.73993902 0.75963415 0.75932927 0.76445122 0.74969512
 0.7595122  0.75945122 0.65030488 0.67536585 0.65804878 0.73481707
 0.75445122 0.75189024 0.75189024 0.74457317 0.75689024 0.75945122
        nan        nan        nan        nan        nan        nan
        n

In [12]:
# Check grid search results
gridDataframe = pd.DataFrame(grid.cv_results_)[["param_max_features", "param_max_samples", "param_n_estimators", "mean_test_score", "rank_test_score"]]
gridDataframe

Unnamed: 0,param_max_features,param_max_samples,param_n_estimators,mean_test_score,rank_test_score
0,0.1,0.1,10,0.483354,888
1,0.1,0.1,20,0.517378,871
2,0.1,0.1,30,0.564146,843
3,0.1,0.1,40,0.623537,736
4,0.1,0.1,50,0.640793,677
...,...,...,...,...,...
995,1.0,1.0,60,,901
996,1.0,1.0,70,,901
997,1.0,1.0,80,,901
998,1.0,1.0,90,,901


In [13]:
# Show best hyperparameter set
bestHyperSet = gridDataframe[gridDataframe["rank_test_score"] <= 1]
bestHyperSet

Unnamed: 0,param_max_features,param_max_samples,param_n_estimators,mean_test_score,rank_test_score
384,0.4,0.9,50,0.82378,1


In [14]:
# Final model
bagKNN = BaggingClassifier(estimator = knn, n_jobs = -1, random_state = randomSeed, max_features = bestHyperSet["param_max_features"].iloc(0)[0], max_samples = bestHyperSet["param_max_samples"].iloc(0)[0], n_estimators = bestHyperSet["param_n_estimators"].iloc(0)[0])
bagKNN.fit(XTrainScaled, yTrain)

In [15]:
# Score model
score(bagKNN, XTrainScaled, yTrain, XTestScaled, yTest)

Training score: 0.8836633663366337
Testing score: 0.7843137254901961


In [16]:
# Save model
dump(bagKNN, "models/bagging_knn.joblib")

['models/bagging_knn.joblib']

In [9]:
from sklearn.metrics import recall_score, precision_score, f1_score
from joblib import load

model = load("models/bagging_knn.joblib")
print(recall_score(yTest, model.predict(XTestScaled), average="macro"))
print(precision_score(yTest, model.predict(XTestScaled), average="macro"))
print(f1_score(yTest, model.predict(XTestScaled), average="macro"))

0.7923117731941262
0.7940562456866804
0.7818316460120203


# Bagging - Log Reg

In [17]:
# Perform hyperparameter optimization
# Set up and perform grid search
grid = doGridSearch(BaggingClassifier(estimator = lr, n_jobs = -1, random_state = randomSeed), hyperparameters, XTrainScaled, yTrain)

In [18]:
# Check grid search results
gridDataframe = pd.DataFrame(grid.cv_results_)[["param_max_features", "param_max_samples", "param_n_estimators", "mean_test_score", "rank_test_score"]]
gridDataframe

Unnamed: 0,param_max_features,param_max_samples,param_n_estimators,mean_test_score,rank_test_score
0,0.1,0.1,10,0.344268,625
1,0.1,0.1,20,0.289695,823
2,0.1,0.1,30,0.269695,886
3,0.1,0.1,40,0.296646,787
4,0.1,0.1,50,0.318963,675
...,...,...,...,...,...
995,1.0,1.0,60,0.183354,927
996,1.0,1.0,70,0.178354,947
997,1.0,1.0,80,0.165976,981
998,1.0,1.0,90,0.170854,968


In [19]:
# Show best hyperparameter set
bestHyperSet = gridDataframe[gridDataframe["rank_test_score"] <= 1]
bestHyperSet

Unnamed: 0,param_max_features,param_max_samples,param_n_estimators,mean_test_score,rank_test_score
438,0.5,0.4,90,0.490122,1


In [20]:
# Final model
bagLR = BaggingClassifier(estimator = lr, n_jobs = -1, random_state = randomSeed, max_features = bestHyperSet["param_max_features"].iloc(0)[0], max_samples = bestHyperSet["param_max_samples"].iloc(0)[0], n_estimators = bestHyperSet["param_n_estimators"].iloc(0)[0])
bagLR.fit(XTrainScaled, yTrain)

In [21]:
# Score model
score(bagLR, XTrainScaled, yTrain, XTestScaled, yTest)

Training score: 0.48514851485148514
Testing score: 0.49019607843137253


In [22]:
# Save model
dump(bagLR, "models/bagging_log_reg.joblib")

['models/bagging_log_reg.joblib']

In [10]:
from sklearn.metrics import recall_score, precision_score, f1_score
from joblib import load

model = load("models/bagging_log_reg.joblib")
print(recall_score(yTest, model.predict(XTestScaled), average="macro"))
print(precision_score(yTest, model.predict(XTestScaled), average="macro"))
print(f1_score(yTest, model.predict(XTestScaled), average="macro"))

0.46106866989219936
0.40701754385964917
0.40227646614201235
