In [31]:
import pandas as pd
import numpy as np

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

import pickle

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder

from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    accuracy_score
)

from time import time

from sklearn.metrics import classification_report

from os import listdir
from os.path import isfile, join
import os

from sklearn.base import BaseEstimator, TransformerMixin

In [32]:
"""
This is a custom transformer that allows us to reduce the feature sets for each classifier appropriately.

Necessary since we're making them all part of a StackingClassifier and each one uses a different feature set.
"""
class FeatureReducer(BaseEstimator, TransformerMixin):
    """Pipeline step that narrows the input to a chosen subset of columns.

    Each classifier is meant to become part of a StackingClassifier while
    using its own feature set, so every pipeline starts with one of these
    to pick the columns that classifier should see.

    Parameters
    ----------
    feature_list : list of column labels, optional
        Columns to keep. ``None`` (the default) passes the input through
        untouched.
    """

    def __init__(self, feature_list=None):
        self.feature_list = feature_list

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self``."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` itself when no feature list is set, else only the listed columns.

        Columns are selected with ``X.loc``, so ``X`` has to support
        label-based column indexing (e.g. a pandas DataFrame).
        """
        selected = self.feature_list
        if selected is None:
            return X
        return X.loc[:, selected]

In [28]:
# Name of the label column; the leading space is part of the header used in the CSV files.
LABEL_COLUMN = " Label"

multiclass_train = pd.read_csv("../multiclass_train.csv")
multiclass_test = pd.read_csv("../multiclass_test.csv")
binary_train = pd.read_csv("../binary_train.csv")
binary_test = pd.read_csv("../binary_test.csv")

# Split every frame into its label column (y) and the remaining columns (x).
multi_train_y = multiclass_train[LABEL_COLUMN].copy()
multi_train_x = multiclass_train.drop(columns=[LABEL_COLUMN])

multi_test_y = multiclass_test[LABEL_COLUMN].copy()
multi_test_x = multiclass_test.drop(columns=[LABEL_COLUMN])

bin_train_y = binary_train[LABEL_COLUMN].copy()
bin_train_x = binary_train.drop(columns=[LABEL_COLUMN])

bin_test_y = binary_test[LABEL_COLUMN].copy()
bin_test_x = binary_test.drop(columns=[LABEL_COLUMN])

# Encoding: BENIGN -> 0, anything else -> 1 (matches the order of binary_labels below).
bin_train_y = [0 if label == "BENIGN" else 1 for label in bin_train_y.values]
bin_test_y = [0 if label == "BENIGN" else 1 for label in bin_test_y.values]

# Encode attack labels to int and save as array to be used later.
# Note the naming: multi_train_y / multi_test_y keep the raw label column,
# multi_y_train / multi_y_test hold the integer-encoded version.
le = LabelEncoder()

multi_y_train = le.fit_transform(multi_train_y.values)
multi_y_test = le.transform(multi_test_y.values)

# le.classes_ is already ordered by encoded value (index i <-> code i). Reading it
# directly avoids sizing the list from the classes that happen to occur in the
# test split, which would truncate the list if a class were missing there.
multiclass_labels = le.classes_.tolist()
print("\nMulticlass Label Encodings (in order of digits 0 -> n): ")
print(multiclass_labels)

binary_labels = ["BENIGN", "ATTACK"]


Multiclass Label Encodings (in order of digits 0 -> n): 
['DNS', 'LDAP', 'MSSQL', 'NTP', 'NetBIOS', 'Portmap', 'SNMP', 'SSDP', 'Syn', 'TFTP', 'UDP', 'UDP-lag']


In [17]:
# For every pickle file in the working directory, print the entry with the
# highest score. Each stored entry is indexed as [0], [1], [2]; judging by the
# recorded output these are (description, score, parameters) -- TODO confirm.
cwd = os.getcwd()

# listdir() order is filesystem-dependent; sort so the report order is stable.
fileList = sorted(
    join(cwd, f)
    for f in listdir(cwd)
    if isfile(join(cwd, f)) and "pickle" in f
)

for f in fileList:
    fileName = os.path.basename(f)

    print(fileName)

    # NOTE: unpickling can execute arbitrary code -- only run this on files you created yourself.
    # The context manager closes the handle even if loading fails.
    with open(f, 'rb') as pickle_file:
        params = pickle.load(pickle_file)

    # np.argmax raises on an empty sequence; report the empty file and move on.
    if len(params) == 0:
        print("(no parameter sets stored)")
        print("\n")
        continue

    score_list = [entry[1] for entry in params]

    ind = np.argmax(score_list)

    print(f"{params[ind][0]}, {params[ind][1]}: {params[ind][2]}")
    print("\n")


binary_params_DT.pickle
Binary Decision Tree All accuracy Fold 3, 0.998786996595658: OrderedDict([('criterion', 'entropy'), ('max_depth', 25), ('max_features', 'sqrt'), ('min_samples_leaf', 1), ('min_samples_split', 2)])


binary_params_KNN.pickle
Binary KNN Mutual Information accuracy Fold 1, 0.9914011334012937: OrderedDict([('algorithm', 'ball_tree'), ('n_neighbors', 4), ('weights', 'distance')])


binary_params_LR.pickle
Binary Logistic Regression All accuracy Fold 3, 0.9932879788100721: OrderedDict([('logisticregression__C', 0.2568671912905853), ('logisticregression__max_iter', 500)])


binary_params_NB.pickle
Binary Naive Bayes All accuracy Fold 3, 0.6620260531578819: OrderedDict([('var_smoothing', 1e-09)])


binary_params_RF.pickle
Binary Random Forest All accuracy Fold 2, 0.999460887302596: OrderedDict([('criterion', 'entropy'), ('max_depth', 100), ('max_features', 'sqrt'), ('min_samples_leaf', 1), ('min_samples_split', 7), ('n_estimators', 344)])


multiclass_params_DT.pickle
M

In [18]:
# Load the stored feature sets. The context manager closes the file handle
# (the bare open() call left it to the garbage collector).
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open("../feature_sets.pickle", 'rb') as feature_sets_file:
    feature_sets = pickle.load(feature_sets_file)

In [33]:
# One pipeline per (model, task). Every pipeline starts with a FeatureReducer:
# with no feature_list it passes all columns through, otherwise it keeps only
# the named feature set.

# --- Decision Tree ---
dt_bin = make_pipeline(
    FeatureReducer(),
    DecisionTreeClassifier(
        random_state=42,
        criterion='entropy',
        max_depth=25,
        max_features='sqrt',
        min_samples_leaf=1,
        min_samples_split=2,
    ),
)
dt_mult = make_pipeline(
    FeatureReducer(feature_list=feature_sets["Multiclass"]["RFE Sets"]["Decision Tree"]),
    DecisionTreeClassifier(
        random_state=42,
        criterion='entropy',
        max_depth=18,
        max_features='sqrt',
        min_samples_leaf=1,
        min_samples_split=10,
    ),
)

# --- K-Nearest Neighbours ---
knn_bin = make_pipeline(
    FeatureReducer(feature_list=feature_sets["Binary"]["Mutual Information"]),
    KNeighborsClassifier(
        algorithm='ball_tree',
        n_neighbors=4,
        weights='distance',
    ),
)
knn_mult = make_pipeline(
    FeatureReducer(feature_list=feature_sets["Multiclass"]["Mutual Information"]),
    KNeighborsClassifier(
        algorithm='ball_tree',
        n_neighbors=7,
        weights='distance',
    ),
)

# --- Logistic Regression (inputs standardised before the model) ---
lr_bin = make_pipeline(
    FeatureReducer(),
    StandardScaler(),
    LogisticRegression(
        C=0.2568671912905853,
        max_iter=500,
        random_state=42,
    ),
)
lr_mult = make_pipeline(
    FeatureReducer(feature_list=feature_sets["Multiclass"]["RFE Sets"]["Logistic Regression"]),
    StandardScaler(),
    LogisticRegression(
        C=9972.476141278883,
        max_iter=483,
        random_state=42,
    ),
)

# --- Gaussian Naive Bayes ---
nb_bin = make_pipeline(
    FeatureReducer(),
    GaussianNB(var_smoothing=1e-09),
)
nb_mult = make_pipeline(
    FeatureReducer(feature_list=feature_sets["Multiclass"]["Correlation"]),
    GaussianNB(var_smoothing=9.99835896078805e-08),
)

# --- Random Forest ---
rf_bin = make_pipeline(
    FeatureReducer(),
    RandomForestClassifier(
        random_state=42,
        criterion='entropy',
        max_depth=100,
        max_features='sqrt',
        min_samples_leaf=1,
        min_samples_split=7,
        n_estimators=344,
    ),
)
# NOTE(review): hyper-parameters are identical to rf_bin -- confirm these are
# the intended multiclass values and not a copy of the binary ones.
rf_mult = make_pipeline(
    FeatureReducer(),
    RandomForestClassifier(
        random_state=42,
        criterion='entropy',
        max_depth=100,
        max_features='sqrt',
        min_samples_leaf=1,
        min_samples_split=7,
        n_estimators=344,
    ),
)

# (pipeline, display name) pairs, grouped by task.
binary_classifiers = [
    (dt_bin, "Decision Tree"),
    (knn_bin, "KNN"),
    (lr_bin, "Logistic Regression"),
    (nb_bin, "Naive Bayes"),
    (rf_bin, "Random Forest"),
]
multiclass_classifiers = [
    (dt_mult, "Decision Tree"),
    (knn_mult, "KNN"),
    (lr_mult, "Logistic Regression"),
    (nb_mult, "Naive Bayes"),
    (rf_mult, "Random Forest"),
]


In [39]:
# Fit every pipeline on its training split and print a per-class report for the
# test split. Both tasks go through one loop so the evaluation logic cannot
# drift apart between them.
# Each run: (heading, (pipeline, name) pairs, train X, train y, test X, test y, class names).
# NOTE(review): the multiclass run is fit and scored on multi_train_y / multi_test_y
# as passed here; target_names relies on multiclass_labels being listed in the
# same sorted order the report uses for those labels -- confirm.
evaluation_runs = [
    ("Binary Classifiers:", binary_classifiers,
     bin_train_x, bin_train_y, bin_test_x, bin_test_y, binary_labels),
    ("Multiclass Classifiers:", multiclass_classifiers,
     multi_train_x, multi_train_y, multi_test_x, multi_test_y, multiclass_labels),
]

for eval_heading, eval_models, fit_x, fit_y, holdout_x, holdout_y, report_labels in evaluation_runs:
    print(eval_heading)
    for model, model_name in eval_models:
        print(model_name)

        model.fit(fit_x, fit_y)

        y_pred = model.predict(holdout_x)

        # zero_division=0 keeps the 0.0 that is reported for classes a model never
        # predicts, but states it explicitly instead of raising an
        # UndefinedMetricWarning for each such class.
        print(classification_report(
            holdout_y,
            y_pred,
            digits=6,
            target_names=report_labels,
            zero_division=0,
        ))

Binary Classifiers:
Decision Tree
              precision    recall  f1-score   support

      BENIGN   0.998167  0.998329  0.998248     18548
      ATTACK   0.998329  0.998167  0.998248     18550

    accuracy                       0.998248     37098
   macro avg   0.998248  0.998248  0.998248     37098
weighted avg   0.998248  0.998248  0.998248     37098

KNN
              precision    recall  f1-score   support

      BENIGN   0.993135  0.990565  0.991848     18548
      ATTACK   0.990590  0.993154  0.991870     18550

    accuracy                       0.991859     37098
   macro avg   0.991863  0.991859  0.991859     37098
weighted avg   0.991863  0.991859  0.991859     37098

Logistic Regression
              precision    recall  f1-score   support

      BENIGN   0.989930  0.996442  0.993175     18548
      ATTACK   0.996418  0.989865  0.993131     18550

    accuracy                       0.993153     37098
   macro avg   0.993174  0.993153  0.993153     37098
weighted avg   0

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


              precision    recall  f1-score   support

         DNS   0.364865  0.104820  0.162855      3091
        LDAP   0.563423  0.715626  0.630469      3091
       MSSQL   0.552775  0.850534  0.670065      3091
         NTP   0.898577  0.959897  0.928225      3092
     NetBIOS   0.798246  0.382600  0.517272      3092
     Portmap   0.587833  0.806535  0.680033      3091
        SNMP   0.652872  0.683700  0.667930      3092
        SSDP   0.447510  0.671410  0.537059      3092
         Syn   0.985244  0.928826  0.956203      3091
        TFTP   0.987055  0.986736  0.986895      3091
         UDP   0.434381  0.304109  0.357755      3091
     UDP-lag   0.905945  0.719599  0.802091      3092

    accuracy                       0.676200     37097
   macro avg   0.681560  0.676199  0.658071     37097
weighted avg   0.681568  0.676200  0.658075     37097

Naive Bayes


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         DNS   0.159827  0.023940  0.041643      3091
        LDAP   0.124065  0.064380  0.084771      3091
       MSSQL   0.000000  0.000000  0.000000      3091
         NTP   0.000000  0.000000  0.000000      3092
     NetBIOS   0.949367  0.242561  0.386399      3092
     Portmap   1.000000  0.002265  0.004519      3091
        SNMP   0.330399  0.655563  0.439363      3092
        SSDP   0.061321  0.033635  0.043442      3092
         Syn   0.540964  0.892915  0.673746      3091
        TFTP   0.891704  0.956325  0.922885      3091
         UDP   0.162024  0.942737  0.276523      3091
     UDP-lag   0.000000  0.000000  0.000000      3092

    accuracy                       0.317842     37097
   macro avg   0.351639  0.317860  0.239441     37097
weighted avg   0.351628  0.317842  0.239432     37097

Random Forest
              precision    recall  f1-score   support

         DNS   0.729989  0.616629  0.668537      3091
        LD