In [1]:
"""
The code for testing the classifiers. Uses binary.csv and multiclass.csv

Loads feature lists and tuning ranges from pickles in CWD.

Change model types in models to tune on specific ones.

Writes output files.

Author: Wesley
"""

# Accelerates tuning of some classifiers
from sklearnex import patch_sklearn
patch_sklearn()

import pandas as pd
import numpy as np

from skopt import BayesSearchCV
from skopt.space import Categorical, Integer, Real

from sklearn.feature_selection import RFECV

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

import pickle

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder

from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    accuracy_score
)

from time import time

from sklearn.metrics import classification_report

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
multiclass = pd.read_csv("multiclass_train.csv")
multiclass_test = pd.read_csv("multiclass_test.csv")

Preprocessing (make labels numeric)

In [3]:
# Encode attack labels to int and save as array to be used later.
le = LabelEncoder()
multiclass[" Label"] = le.fit_transform(multiclass[" Label"].values)
multiclass_test[" Label"] = le.transform(multiclass_test[" Label"].values)

multiclass_labels = []
print("\nMulticlass Label Encodings (in order of digits 0 -> n): ")
for i in range(0, len(list(set(list(multiclass[' Label']))))):
    multiclass_labels.append(le.inverse_transform([i])[0])

print(multiclass_labels)


Multiclass Label Encodings (in order of digits 0 -> n): 
['DNS', 'LDAP', 'MSSQL', 'NTP', 'NetBIOS', 'Portmap', 'SNMP', 'SSDP', 'Syn', 'TFTP', 'UDP', 'UDP-lag']


Load feature sets and search spaces and enumerate their contents.

In [4]:
feature_sets = pickle.load(open("feature_sets.pickle", 'rb'))
search_spaces = pickle.load(open("hyperparameter_search_spaces.pickle", 'rb'))

print(f"Available Tuning Ranges: {search_spaces.keys()}")

print("Feature Sets for Binary Dataset:")
for key, value in feature_sets["Binary"].items():
    if key == "RFE Sets":
        print(value.keys())

    elif key == "PCA":
        print(f"{key}, suggested variance threshold is {value}")
        
    else:
        print(key)

print("Feature Sets for Multiclass Dataset:")
for key, value in feature_sets["Multiclass"].items():
    if key == "RFE Sets":
        print(value.keys())

    elif key == "PCA":
        print(f"{key}, suggested variance threshold is {value}")

    else:
        print(key)

Available Tuning Ranges: dict_keys(['XGBoost', 'Bagging SVM', 'SVC (RBF)', 'SVC (Poly)', 'Logistic Regression', 'Random Forest', 'KNN', 'Linear SVC', 'Naive Bayes', 'Decision Tree'])
Feature Sets for Binary Dataset:
All
Correlation
Mutual Information
dict_keys(['Decision Tree', 'Random Forest', 'XGBoost', 'Linear SVC', 'Logistic Regression'])
PCA, suggested variance threshold is 0.95
Feature Sets for Multiclass Dataset:
All
Correlation
Mutual Information
dict_keys(['Decision Tree', 'Random Forest', 'XGBoost', 'Logistic Regression'])
PCA, suggested variance threshold is 0.95


In [5]:
"""
This is a helper method to place our performance results in a DataFrame for future analysis.
"""
def format_results_multiclass(y_test, predicted_values, fold_index, fitTime):
    # get scores
    accuracy = accuracy_score(y_test,predicted_values)
    recall_pos = recall_score(y_test, predicted_values, average='macro')
    precision_pos = precision_score(y_test,predicted_values, average='macro')
    f1 = f1_score(y_test,predicted_values, average='macro')

    cols = ["Fitting Time", "accuracy", "Precision", "Recall", "F1 Score"]
    results = [fitTime, accuracy, precision_pos, recall_pos, f1]

    outFrame = pd.DataFrame([results], columns=cols, index=[fold_index])

    return outFrame

This is for tuning on the multiclass set

In [6]:
models = {
    #"Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    #"XGBoost": xgb.XGBClassifier(random_state=42, num_class=12, objective='multi:softmax'),
    #"Linear SVC": make_pipeline(StandardScaler(), LinearSVC(random_state=42)),
    #"Logistic Regression": make_pipeline(StandardScaler(), LogisticRegression(random_state=42)),
    #"KNN": KNeighborsClassifier(),
    #"Naive Bayes": GaussianNB(),
}

score_methods = ['accuracy']

feature_set = feature_sets["Multiclass"]

y = multiclass[" Label"].copy()
X = multiclass.drop([" Label"], axis=1)

y_test_f = multiclass[" Label"].copy()
X_test_f = multiclass.drop([" Label"], axis=1)

In [7]:
params = []

# This will hold all of our results.
runFrame = None

for name, model in models.items():
        for feature_key, feature_val in feature_set.items():

            # If we're on the RFE sets, check if we have one for this classifier. If not, skip it.
            if feature_key == "RFE Sets":
                if name in feature_val.keys():
                    feature_val = feature_val[name]
                else:
                    continue
            
            for score_method in score_methods:
                opt = BayesSearchCV(estimator=model,search_spaces=search_spaces[name],n_iter=50,scoring=score_method,cv=5,n_jobs=5)
                kf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

                counter = 0

                # Used to hold data for a single run (performance metric)
                perfFrame = None

                for train_index, test_index in kf.split(X, y):

                    counter += 1

                    # PCA requires different logic to create X.
                    if feature_key != "PCA":
                        current_X = X.loc[:, feature_val]
                        X_train, X_test = current_X.iloc[train_index,:], current_X.iloc[test_index,:]
                        Y_train, Y_test = y.iloc[train_index], y.iloc[test_index]

                        current_X_f = X_test_f.loc[:, feature_val]
                        
                    else:
                        pca_trans = PCA(n_components=feature_val, random_state=42)
                        current_X = X.loc[:, X.columns]
                        X_train, X_test = current_X.iloc[train_index,:], current_X.iloc[test_index,:]
                        Y_train, Y_test = y.iloc[train_index], y.iloc[test_index]

                        # Apply PCA to training set and use it to transform test set.
                        X_train = pca_trans.fit_transform(X_train)
                        X_test = pca_trans.transform(X_test)
                        current_X_f = pca_trans.transform(X_test_f)

                        # Convert back to DataFrames
                        pca_cols = ["PC"+str(i) for i in list(range(1, len(X_train[0])+1))]
                        X_train = pd.DataFrame(data=X_train, columns=pca_cols)
                        X_test = pd.DataFrame(data=X_test, columns=pca_cols)
                        current_X_f = pd.DataFrame(data=current_X_f, columns=pca_cols)

                    startTime = time()

                    opt.fit(X_train,Y_train)

                    endTime = time()
                    fitTime = endTime - startTime

                    predicted_values = opt.predict(X_test)

                    # get metrics for this fold.
                    foldFrame = format_results_multiclass(Y_test, predicted_values, counter, fitTime)

                    # Add them to our lists of metric.
                    if perfFrame is None:
                        perfFrame = foldFrame
                    else:
                        perfFrame = pd.concat([perfFrame, foldFrame])

                    # Print a classification report on the testing results.
                    print("Validation Results: ")
                    print(classification_report(Y_test, predicted_values, target_names=multiclass_labels, digits=6))

                    print("Testing Results: ")
                    # Print a classification report on the testing results.
                    pred_test = opt.predict(current_X_f)
                    print(classification_report(y_test_f, pred_test, target_names=multiclass_labels, digits=6))

                    # Add tuple with the best params as well as the related model/config
                    params.append((f"Multiclass {name} {feature_key} {score_method} Fold {counter}", opt.best_params_))

                # Create a new line in the results table that averages all the folds
                perfFrame.loc["fold average"] = perfFrame.mean()

                # Mark the results table with the chosen classifier and the current performance metric.
                perfFrame['metric'] = [score_method for j in range(0,6)]
                perfFrame['Classifier'] = [name for j in range(0,6)]
                perfFrame['Feature Set'] = [feature_key for j in range(0,6)]
                perfFrame['Dataset'] = ["multiclass" for j in range(0,6)]
                print(f"{name} with {feature_key} and {score_method} completed.")

                # Add this run to the table with all runs.
                if runFrame is None:
                    runFrame = perfFrame
                else:
                    runFrame = pd.concat([runFrame, perfFrame])

# Write output file, best parameters, and best models to be used later.
runFrame.to_csv(f"multiclass_results_RF_{time()}.csv")
pickle.dump(params, open(f"multiclass_params_RF_{time()}.pickle", "wb"))

Validation Results: 
              precision    recall  f1-score   support

         DNS   0.766125  0.512186  0.613933      2667
        LDAP   0.615200  0.673791  0.643164      2667
       MSSQL   0.930997  0.956489  0.943571      2666
         NTP   0.989199  0.996249  0.992712      2666
     NetBIOS   0.755312  0.439985  0.556056      2666
     Portmap   0.611197  0.892723  0.725610      2666
        SNMP   0.695980  0.830896  0.757477      2667
        SSDP   0.545283  0.650169  0.593125      2667
         Syn   0.954341  0.556430  0.702984      2667
        TFTP   0.998869  0.993251  0.996052      2667
         UDP   0.564273  0.477315  0.517164      2667
     UDP-lag   0.635808  0.857839  0.730321      2666

    accuracy                       0.736429     31999
   macro avg   0.755215  0.736444  0.731014     31999
weighted avg   0.755211  0.736429  0.731005     31999

Testing Results: 
              precision    recall  f1-score   support

         DNS   0.786026  0.513013  0.62



Validation Results: 
              precision    recall  f1-score   support

         DNS   0.762388  0.478995  0.588344      2666
        LDAP   0.598347  0.678665  0.635980      2667
       MSSQL   0.924570  0.946757  0.935532      2667
         NTP   0.986999  0.996624  0.991788      2666
     NetBIOS   0.754821  0.410949  0.532168      2667
     Portmap   0.603103  0.903637  0.723398      2667
        SNMP   0.700156  0.841710  0.764435      2666
        SSDP   0.545948  0.679670  0.605514      2666
         Syn   0.983531  0.559805  0.713501      2667
        TFTP   0.996989  0.993251  0.995116      2667
         UDP   0.585018  0.477494  0.525816      2666
     UDP-lag   0.650155  0.866142  0.742765      2667

    accuracy                       0.736148     31999
   macro avg   0.757669  0.736142  0.729530     31999
weighted avg   0.757675  0.736148  0.729535     31999

Testing Results: 
              precision    recall  f1-score   support

         DNS   0.775230  0.486837  0.59



Validation Results: 
              precision    recall  f1-score   support

         DNS   0.766803  0.491939  0.599360      2667
        LDAP   0.604855  0.672918  0.637074      2666
       MSSQL   0.917326  0.956880  0.936686      2667
         NTP   0.992173  0.998125  0.995140      2667
     NetBIOS   0.772849  0.431196  0.553550      2667
     Portmap   0.609880  0.898350  0.726528      2666
        SNMP   0.707538  0.848462  0.771619      2666
        SSDP   0.539708  0.637284  0.584451      2666
         Syn   0.948222  0.570143  0.712111      2666
        TFTP   0.996602  0.990248  0.993415      2666
         UDP   0.562500  0.475816  0.515539      2667
     UDP-lag   0.640747  0.862017  0.735092      2667

    accuracy                       0.736109     31998
   macro avg   0.754934  0.736115  0.730047     31998
weighted avg   0.754938  0.736109  0.730046     31998

Testing Results: 
              precision    recall  f1-score   support

         DNS   0.777062  0.506638  0.61



Validation Results: 
              precision    recall  f1-score   support

         DNS   0.604545  0.598650  0.601583      2666
        LDAP   0.621139  0.482565  0.543153      2667
       MSSQL   0.884316  0.894263  0.889262      2667
         NTP   0.859413  0.933233  0.894803      2666
     NetBIOS   0.753492  0.404574  0.526470      2667
     Portmap   0.601003  0.899138  0.720445      2667
        SNMP   0.699871  0.814329  0.752774      2666
        SSDP   0.532082  0.659415  0.588945      2666
         Syn   0.965259  0.531309  0.685369      2667
        TFTP   0.986517  0.987627  0.987071      2667
         UDP   0.582345  0.460240  0.514142      2666
     UDP-lag   0.635560  0.863142  0.732072      2667

    accuracy                       0.710710     31999
   macro avg   0.727128  0.710707  0.703007     31999
weighted avg   0.727140  0.710710  0.703012     31999

Testing Results: 
              precision    recall  f1-score   support

         DNS   0.622105  0.612540  0.61

In [8]:
best = pickle.load(open("multiclass_params_RF_1670691547.2840455.pickle", 'rb'))

In [12]:
final_feature_set = None
for key, value in feature_sets["Multiclass"].items():
    if key == "RFE Sets":
        final_feature_set = value['Random Forest']
        break

print(final_feature_set)

X_fin = X.loc[:, final_feature_set]
x_test_fin = X_test_f.loc[:, final_feature_set]

[' Protocol', ' Flow Duration', ' Total Fwd Packets', ' Total Backward Packets', 'Total Length of Fwd Packets', ' Total Length of Bwd Packets', ' Fwd Packet Length Max', ' Fwd Packet Length Min', ' Fwd Packet Length Mean', ' Fwd Packet Length Std', 'Bwd Packet Length Max', ' Bwd Packet Length Mean', 'Flow Bytes/s', ' Flow Packets/s', ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min', 'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max', ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean', ' Bwd IAT Std', ' Bwd IAT Max', ' Bwd IAT Min', ' Fwd Header Length', ' Bwd Header Length', 'Fwd Packets/s', ' Bwd Packets/s', ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean', ' Packet Length Std', ' Packet Length Variance', ' ACK Flag Count', ' URG Flag Count', ' CWE Flag Count', ' Down/Up Ratio', ' Average Packet Size', ' Avg Fwd Segment Size', ' Avg Bwd Segment Size', ' Fwd Header Length.1', 'Subflow Fwd Packets', ' Subflow Fwd Bytes', ' Subflow Bwd Pack

In [10]:
for i in range(15, 20):
    print(best[i])

('Multiclass Random Forest RFE Sets accuracy Fold 1', OrderedDict([('criterion', 'entropy'), ('max_depth', 14), ('max_features', 'log2'), ('min_samples_leaf', 3), ('min_samples_split', 12), ('n_estimators', 231)]))
('Multiclass Random Forest RFE Sets accuracy Fold 2', OrderedDict([('criterion', 'entropy'), ('max_depth', 16), ('max_features', 'sqrt'), ('min_samples_leaf', 1), ('min_samples_split', 20), ('n_estimators', 500)]))
('Multiclass Random Forest RFE Sets accuracy Fold 3', OrderedDict([('criterion', 'entropy'), ('max_depth', 15), ('max_features', 'sqrt'), ('min_samples_leaf', 1), ('min_samples_split', 20), ('n_estimators', 500)]))
('Multiclass Random Forest RFE Sets accuracy Fold 4', OrderedDict([('criterion', 'entropy'), ('max_depth', 68), ('max_features', 'sqrt'), ('min_samples_leaf', 3), ('min_samples_split', 13), ('n_estimators', 83)]))
('Multiclass Random Forest RFE Sets accuracy Fold 5', OrderedDict([('criterion', 'entropy'), ('max_depth', 65), ('max_features', 'sqrt'), ('m

In [14]:
clf = RandomForestClassifier(criterion = 'entropy', max_depth = 65, max_features = 'sqrt', min_samples_leaf = 2, min_samples_split = 18, n_estimators = 478, random_state=42)
clf.fit(X_fin, y)
pred_test = clf.predict(x_test_fin)
print(classification_report(y_test_f, pred_test, target_names=multiclass_labels, digits=6))

              precision    recall  f1-score   support

         DNS   0.780530  0.512938  0.619054     13333
        LDAP   0.617160  0.687842  0.650587     13333
       MSSQL   0.934808  0.957174  0.945859     13333
         NTP   0.991958  0.999250  0.995591     13332
     NetBIOS   0.781006  0.427436  0.552496     13333
     Portmap   0.611472  0.911498  0.731932     13333
        SNMP   0.710599  0.850885  0.774440     13332
        SSDP   0.663402  0.782328  0.717973     13332
         Syn   0.979028  0.574214  0.723869     13333
        TFTP   0.997362  0.992425  0.994887     13333
         UDP   0.727899  0.618165  0.668559     13333
     UDP-lag   0.656845  0.887797  0.755055     13333

    accuracy                       0.766827    159993
   macro avg   0.787672  0.766829  0.760859    159993
weighted avg   0.787672  0.766827  0.760857    159993

