In [5]:
from pathlib import Path

import pandas as pd
from joblib import dump
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression as LR
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.preprocessing import StandardScaler as SS
from sklearn.tree import DecisionTreeClassifier as DTC

from config_store import targetDataSet, randomSeed
from utility_functions import dataSetSplit, score

In [6]:
# Load the target data set from the configured path and show its first rows
data = pd.read_csv(filepath_or_buffer = targetDataSet)
data.head(n = 5)

Unnamed: 0,Website,Packets,Bytes,Total Packets,Percent Filtered,Packets A → B,Bytes A → B,Packets B → A,Bytes B → A,Duration,Bits/s A → B,Bits/s B → A
0,0,1,712,1256,0.08,1,712,0,0,16.763719,339,0
1,0,1,718,45,2.22,1,718,0,0,10.784664,532,0
2,0,1,889,28,3.57,1,889,0,0,4.927569,1443,0
3,0,1,712,1098,0.09,1,712,0,0,13.479684,422,0
4,0,1,718,52,1.92,1,718,0,0,12.706807,452,0


In [7]:
# Pull the feature matrix (every column except "Website") and the label
# vector ("Website") out of the frame, then partition them into
# train and test subsets
features = data.drop(columns = ["Website"]).values
labels = data["Website"].values
XTrain, XTest, yTrain, yTest = dataSetSplit(features, labels)

# Standardize the features: the scaling statistics are learned from the
# training rows only and then applied unchanged to the test rows
ss = SS().fit(XTrain)
XTrainScaled = ss.transform(XTrain)
XTestScaled = ss.transform(XTest)

In [8]:
# Instantiate the three estimators that make up the stacking ensemble;
# the seeded ones share the project-wide random seed for reproducibility
knn = KNN(n_neighbors = 6)
dtc = DTC(min_samples_split = 6, random_state = randomSeed)
lr = LR(n_jobs = -1, random_state = randomSeed)

In [9]:
# Assemble the stacked ensemble: the decision tree and k-NN models act as
# base estimators and logistic regression combines their predictions.
# cv = 10 sets the cross-validation used to generate the base-model
# predictions the final estimator is trained on; n_jobs = -1 uses all cores.
baseEstimators = [("dtc", dtc), ("knn", knn)]
stacking = StackingClassifier(
    estimators = baseEstimators,
    final_estimator = lr,
    cv = 10,
    n_jobs = -1,
)
stacking.fit(XTrainScaled, yTrain)

In [10]:
# Score model
# Evaluate the fitted stacking ensemble on the scaled training split and the
# scaled test split; the cell output reports a training and a testing score.
# NOTE(review): `score` is imported from utility_functions and its metric is
# not visible here — presumably accuracy; confirm against the helper.
score(stacking, XTrainScaled, yTrain, XTestScaled, yTest)

Training score: 0.9752475247524752
Testing score: 0.8431372549019608


In [11]:
# Save model
# Make sure the output directory exists first: joblib's dump() does not
# create missing parent directories, so on a fresh checkout (where an empty
# "models" folder is typically absent) the save would otherwise fail with
# FileNotFoundError.
Path("models").mkdir(parents = True, exist_ok = True)

# NOTE(review): the classifier was fitted on StandardScaler-transformed
# features, but the fitted scaler (`ss`) is not persisted here — anything
# that loads this file must apply equivalent scaling before predicting.
dump(stacking, "models/stacking.joblib")

['models/stacking.joblib']