In [98]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import StandardScaler as SS
from joblib import dump
from config_store import targetDataSet, randomSeed
from utility_functions import dataSetSplit, doGridSearch, score

In [99]:
# Import and preview data set
data = pd.read_csv(targetDataSet)
data.head()

Unnamed: 0,Website,Packets,Bytes,Total Packets,Percent Filtered,Packets A → B,Bytes A → B,Packets B → A,Bytes B → A,Duration,Bits/s A → B,Bits/s B → A
0,0,1,712,1256,0.08,1,712,0,0,16.763719,339,0
1,0,1,718,45,2.22,1,718,0,0,10.784664,532,0
2,0,1,889,28,3.57,1,889,0,0,4.927569,1443,0
3,0,1,712,1098,0.09,1,712,0,0,13.479684,422,0
4,0,1,718,52,1.92,1,718,0,0,12.706807,452,0


In [100]:
# Separate data set using test/train split
XTrain, XTest, yTrain, yTest = dataSetSplit(data.drop(columns = ["Website"]).values, data["Website"].values)

# Scale data
ss = SS()
XTrainScaled = ss.fit_transform(XTrain)
XTestScaled = ss.transform(XTest)

In [101]:
# Define top 3 models to use in ada
dtc = DTC(random_state = randomSeed, min_samples_split = 6)
lr = LR(random_state = randomSeed, n_jobs = -1)
sgd = SGDClassifier(random_state = randomSeed, n_jobs = -1)

# Ada - DTC

In [102]:
# Perform hyperparameter optimization
# Define hyperparameters to check
hyperparameters = dict(n_estimators = np.array(range(50, 160, 10)), learning_rate = np.array(range(0, 10)))

# Set up and perform grid search
grid = doGridSearch(AdaBoostClassifier(estimator = dtc, random_state = randomSeed), hyperparameters, XTrain, yTrain)

110 fits failed out of a total of 1100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
110 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\okisn\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\okisn\anaconda3\Lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "c:\Users\okisn\anaconda3\Lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\okisn\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParam

In [103]:
# Check grid search results
gridDataframe = pd.DataFrame(grid.cv_results_)[["param_n_estimators", "param_learning_rate", "mean_test_score", "rank_test_score"]]
gridDataframe

Unnamed: 0,param_n_estimators,param_learning_rate,mean_test_score,rank_test_score
0,50,0,,100
1,60,0,,100
2,70,0,,100
3,80,0,,100
4,90,0,,100
...,...,...,...,...
105,110,9,0.888476,71
106,120,9,0.888476,71
107,130,9,0.888476,71
108,140,9,0.888476,71


In [104]:
# Show best hyperparameter set
bestHyperSet = gridDataframe[gridDataframe["rank_test_score"] <= 1]
bestHyperSet

Unnamed: 0,param_n_estimators,param_learning_rate,mean_test_score,rank_test_score
17,110,1,0.920671,1


In [105]:
# Final model
adaDTC = AdaBoostClassifier(estimator = dtc, random_state = randomSeed, n_estimators = bestHyperSet["param_n_estimators"].iloc(0)[0], learning_rate = bestHyperSet["param_learning_rate"].iloc(0)[0])
adaDTC.fit(XTrain, yTrain)

In [106]:
# Score model
score(adaDTC, XTrain, yTrain, XTest, yTest)

Training score: 1.0
Testing score: 0.8921568627450981


In [107]:
# Save model
dump(adaDTC, "models/ada_dtc.joblib")

['models/ada_dtc.joblib']

# Ada - Log Reg

In [108]:
# Perform hyperparameter optimization
# Set up and perform grid search
grid = doGridSearch(AdaBoostClassifier(estimator = lr, random_state = randomSeed), hyperparameters, XTrainScaled, yTrain)

110 fits failed out of a total of 1100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
110 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\okisn\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\okisn\anaconda3\Lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "c:\Users\okisn\anaconda3\Lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\okisn\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParam

In [109]:
# Check grid search results
gridDataframe = pd.DataFrame(grid.cv_results_)[["param_n_estimators", "param_learning_rate", "mean_test_score", "rank_test_score"]]
gridDataframe

Unnamed: 0,param_n_estimators,param_learning_rate,mean_test_score,rank_test_score
0,50,0,,100
1,60,0,,100
2,70,0,,100
3,80,0,,100
4,90,0,,100
...,...,...,...,...
105,110,9,0.272744,46
106,120,9,0.272744,46
107,130,9,0.277622,40
108,140,9,0.272744,46


In [110]:
# Show best hyperparameter set
bestHyperSet = gridDataframe[gridDataframe["rank_test_score"] <= 1]
bestHyperSet

Unnamed: 0,param_n_estimators,param_learning_rate,mean_test_score,rank_test_score
14,80,1,0.517134,1


In [111]:
# Final model
adaLR = AdaBoostClassifier(estimator = lr, random_state = randomSeed, n_estimators = bestHyperSet["param_n_estimators"].iloc(0)[0], learning_rate = bestHyperSet["param_learning_rate"].iloc(0)[0])
adaLR.fit(XTrainScaled, yTrain)

In [112]:
# Score model
score(adaLR, XTrainScaled, yTrain, XTestScaled, yTest)

Training score: 0.5272277227722773
Testing score: 0.5098039215686274


In [113]:
# Save model
dump(adaLR, "models/ada_log_reg.joblib")

['models/ada_log_reg.joblib']

# Ada - SGD

In [114]:
# Perform hyperparameter optimization
# Set up and perform grid search
grid = doGridSearch(AdaBoostClassifier(estimator = sgd, random_state = randomSeed, algorithm = "SAMME"), hyperparameters, XTrainScaled, yTrain)

110 fits failed out of a total of 1100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
110 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\okisn\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\okisn\anaconda3\Lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "c:\Users\okisn\anaconda3\Lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\okisn\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParam

In [115]:
# Check grid search results
gridDataframe = pd.DataFrame(grid.cv_results_)[["param_n_estimators", "param_learning_rate", "mean_test_score", "rank_test_score"]]
gridDataframe

Unnamed: 0,param_n_estimators,param_learning_rate,mean_test_score,rank_test_score
0,50,0,,100
1,60,0,,100
2,70,0,,100
3,80,0,,100
4,90,0,,100
...,...,...,...,...
105,110,9,0.229817,90
106,120,9,0.224756,95
107,130,9,0.224878,94
108,140,9,0.219817,98


In [116]:
# Show best hyperparameter set
bestHyperSet = gridDataframe[gridDataframe["rank_test_score"] <= 1]
bestHyperSet

Unnamed: 0,param_n_estimators,param_learning_rate,mean_test_score,rank_test_score
11,50,1,0.507744,1
12,60,1,0.507744,1
13,70,1,0.507744,1
14,80,1,0.507744,1
15,90,1,0.507744,1
16,100,1,0.507744,1
17,110,1,0.507744,1
18,120,1,0.507744,1
19,130,1,0.507744,1
20,140,1,0.507744,1


In [118]:
# Final model
adaSGD = AdaBoostClassifier(estimator = sgd, random_state = randomSeed, algorithm = "SAMME", n_estimators = bestHyperSet["param_n_estimators"].iloc(0)[0], learning_rate = bestHyperSet["param_learning_rate"].iloc(0)[0])
adaSGD.fit(XTrainScaled, yTrain)

In [119]:
# Score model
score(adaSGD, XTrainScaled, yTrain, XTestScaled, yTest)

Training score: 0.5618811881188119
Testing score: 0.5882352941176471


In [120]:
# Save model
dump(adaSGD, "models/ada_sgd.joblib")

['models/ada_sgd.joblib']