In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import BaggingClassifier
from sklearn.preprocessing import StandardScaler as SS
from joblib import dump
from config_store import targetDataSet, randomSeed
from utility_functions import dataSetSplit, doGridSearch, score

In [2]:
# Import and preview data set
data = pd.read_csv(targetDataSet)
data.head()

Unnamed: 0,Website,Packets,Bytes,Total Packets,Percent Filtered,Packets A → B,Bytes A → B,Packets B → A,Bytes B → A,Duration,Bits/s A → B,Bits/s B → A
0,0,1,712,1256,0.08,1,712,0,0,16.763719,339,0
1,0,1,718,45,2.22,1,718,0,0,10.784664,532,0
2,0,1,889,28,3.57,1,889,0,0,4.927569,1443,0
3,0,1,712,1098,0.09,1,712,0,0,13.479684,422,0
4,0,1,718,52,1.92,1,718,0,0,12.706807,452,0


In [3]:
# Separate data set using test/train split
XTrain, XTest, yTrain, yTest = dataSetSplit(data.drop(columns = ["Website"]).values, data["Website"].values)

# Scale data
ss = SS()
XTrainScaled = ss.fit_transform(XTrain)
XTestScaled = ss.transform(XTest)

In [4]:
# Define top 3 models to use in bagging
dtc = DTC(random_state = randomSeed, min_samples_split = 6)
knn = KNN(n_neighbors = 6)
lr = LR()

# Bagging - DTC

In [5]:
# Perform hyperparameter optimization
# Define hyperparameters to check
hyperparameters = dict(max_features = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1], max_samples = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1], n_estimators = np.array(range(10, 110, 10)))

# Set up and perform grid search
grid = doGridSearch(BaggingClassifier(estimator = dtc, n_jobs = -1, random_state = randomSeed), hyperparameters, XTrain, yTrain)

In [6]:
# Check grid search results
gridDataframe = pd.DataFrame(grid.cv_results_)[["param_max_features", "param_max_samples", "param_n_estimators", "mean_test_score", "rank_test_score"]]
gridDataframe

Unnamed: 0,param_max_features,param_max_samples,param_n_estimators,mean_test_score,rank_test_score
0,0.1,0.1,10,0.532500,899
1,0.1,0.1,20,0.697561,875
2,0.1,0.1,30,0.692683,877
3,0.1,0.1,40,0.717317,869
4,0.1,0.1,50,0.742073,859
...,...,...,...,...,...
995,1,1,60,0.175488,991
996,1,1,70,0.210488,931
997,1,1,80,0.210671,901
998,1,1,90,0.210671,901


In [7]:
# Show best hyperparameter set
bestHyperSet = gridDataframe[gridDataframe["rank_test_score"] <= 1]
bestHyperSet

Unnamed: 0,param_max_features,param_max_samples,param_n_estimators,mean_test_score,rank_test_score
385,0.4,0.9,60,0.913171,1


In [8]:
# Final model
bagDTC = BaggingClassifier(estimator = dtc, n_jobs = -1, random_state = randomSeed, max_features = bestHyperSet["param_max_features"].iloc(0)[0], max_samples = bestHyperSet["param_max_samples"].iloc(0)[0], n_estimators = bestHyperSet["param_n_estimators"].iloc(0)[0])
bagDTC.fit(XTrain, yTrain)

In [9]:
# Score model
score(bagDTC, XTrain, yTrain, XTest, yTest)

Training score: 0.9876237623762376
Testing score: 0.8725490196078431


In [10]:
# Save model
dump(bagDTC, "models/bagging_dtc.joblib")

['models/bagging_dtc.joblib']

# Bagging - KNN

In [11]:
# Perform hyperparameter optimization
# Set up and perform grid search
grid = doGridSearch(BaggingClassifier(estimator = knn, n_jobs = -1, random_state = randomSeed), hyperparameters, XTrainScaled, yTrain)

 0.67804878 0.68573171 0.68304878 0.6804878  0.62121951 0.67731707
 0.67243902 0.71731707 0.71987805 0.72237805 0.73493902 0.74237805
 0.74256098 0.74518293 0.64628049 0.71457317 0.70481707 0.75219512
 0.76957317 0.76469512 0.75518293 0.77219512 0.77981707 0.77262195
 0.66109756 0.73689024 0.73445122 0.76670732 0.77926829 0.78676829
 0.78219512 0.79207317 0.7945122  0.7995122  0.67597561 0.76426829
 0.75682927 0.77926829 0.78408537 0.80164634 0.78707317 0.79439024
 0.81176829 0.8045122  0.68323171 0.74957317 0.74189024 0.78414634
 0.78182927 0.80402439 0.80189024 0.80682927 0.8045122  0.80445122
 0.6854878  0.7570122  0.76176829 0.77926829 0.78170732 0.79908537
 0.79670732 0.79676829 0.80182927 0.80445122 0.68786585 0.75426829
 0.75158537 0.77670732 0.79158537 0.79152439 0.80670732 0.79695122
 0.80195122 0.8045122  0.68780488 0.75432927 0.76164634 0.78182927
 0.78408537 0.79902439 0.79670732 0.79676829 0.81408537 0.81426829
        nan        nan        nan        nan        nan       

In [12]:
# Check grid search results
gridDataframe = pd.DataFrame(grid.cv_results_)[["param_max_features", "param_max_samples", "param_n_estimators", "mean_test_score", "rank_test_score"]]
gridDataframe

Unnamed: 0,param_max_features,param_max_samples,param_n_estimators,mean_test_score,rank_test_score
0,0.1,0.1,10,0.576829,830
1,0.1,0.1,20,0.596220,811
2,0.1,0.1,30,0.578902,827
3,0.1,0.1,40,0.645854,766
4,0.1,0.1,50,0.655671,749
...,...,...,...,...,...
995,1,1,60,,901
996,1,1,70,,901
997,1,1,80,,901
998,1,1,90,,901


In [13]:
# Show best hyperparameter set
bestHyperSet = gridDataframe[gridDataframe["rank_test_score"] <= 1]
bestHyperSet

Unnamed: 0,param_max_features,param_max_samples,param_n_estimators,mean_test_score,rank_test_score
162,0.2,0.7,30,0.861402,1


In [14]:
# Final model
bagKNN = BaggingClassifier(estimator = knn, n_jobs = -1, random_state = randomSeed, max_features = bestHyperSet["param_max_features"].iloc(0)[0], max_samples = bestHyperSet["param_max_samples"].iloc(0)[0], n_estimators = bestHyperSet["param_n_estimators"].iloc(0)[0])
bagKNN.fit(XTrainScaled, yTrain)

In [15]:
# Score model
score(bagKNN, XTrainScaled, yTrain, XTestScaled, yTest)

Training score: 0.9331683168316832
Testing score: 0.8529411764705882


In [16]:
# Save model
dump(bagKNN, "models/bagging_knn.joblib")

['models/bagging_knn.joblib']

# Bagging - Log Reg

In [17]:
# Perform hyperparameter optimization
# Set up and perform grid search
grid = doGridSearch(BaggingClassifier(estimator = lr, n_jobs = -1, random_state = randomSeed), hyperparameters, XTrainScaled, yTrain)

In [18]:
# Check grid search results
gridDataframe = pd.DataFrame(grid.cv_results_)[["param_max_features", "param_max_samples", "param_n_estimators", "mean_test_score", "rank_test_score"]]
gridDataframe

Unnamed: 0,param_max_features,param_max_samples,param_n_estimators,mean_test_score,rank_test_score
0,0.1,0.1,10,0.334390,741
1,0.1,0.1,20,0.304695,781
2,0.1,0.1,30,0.287195,809
3,0.1,0.1,40,0.294329,797
4,0.1,0.1,50,0.301829,789
...,...,...,...,...,...
995,1,1,60,0.182988,984
996,1,1,70,0.202927,923
997,1,1,80,0.210671,906
998,1,1,90,0.210671,906


In [19]:
# Show best hyperparameter set
bestHyperSet = gridDataframe[gridDataframe["rank_test_score"] <= 1]
bestHyperSet

Unnamed: 0,param_max_features,param_max_samples,param_n_estimators,mean_test_score,rank_test_score
881,0.9,0.9,20,0.547073,1


In [20]:
# Final model
bagLR = BaggingClassifier(estimator = lr, n_jobs = -1, random_state = randomSeed, max_features = bestHyperSet["param_max_features"].iloc(0)[0], max_samples = bestHyperSet["param_max_samples"].iloc(0)[0], n_estimators = bestHyperSet["param_n_estimators"].iloc(0)[0])
bagLR.fit(XTrainScaled, yTrain)

In [21]:
# Score model
score(bagLR, XTrainScaled, yTrain, XTestScaled, yTest)

Training score: 0.5693069306930693
Testing score: 0.6078431372549019


In [22]:
# Save model
dump(bagLR, "models/bagging_log_reg.joblib")

['models/bagging_log_reg.joblib']