In [46]:
"""
The code for testing the classifiers. Uses binary.csv and multiclass.csv

Loads feature lists and tuning ranges from pickles in CWD.

Change model types in models to tune on specific ones.

Writes output files.

Author: Wesley
"""

# Accelerates tuning of some classifiers
from sklearnex import patch_sklearn
patch_sklearn()

import pandas as pd
import numpy as np

from skopt import BayesSearchCV
from skopt.space import Categorical, Integer, Real

from sklearn.feature_selection import RFECV

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

import pickle

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder

from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    accuracy_score
)

from time import time

from sklearn.metrics import classification_report

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [47]:
multiclass = pd.read_csv("multiclass_train.csv")
multiclass_test = pd.read_csv("multiclass_test.csv")

Preprocessing (make labels numeric)

In [48]:
# Encode attack labels to int and save as array to be used later.
le = LabelEncoder()
multiclass[" Label"] = le.fit_transform(multiclass[" Label"].values)
multiclass_test[" Label"] = le.transform(multiclass_test[" Label"].values)

multiclass_labels = []
print("\nMulticlass Label Encodings (in order of digits 0 -> n): ")
for i in range(0, len(list(set(list(multiclass[' Label']))))):
    multiclass_labels.append(le.inverse_transform([i])[0])

print(multiclass_labels)


Multiclass Label Encodings (in order of digits 0 -> n): 
['DNS', 'LDAP', 'MSSQL', 'NTP', 'NetBIOS', 'Portmap', 'SNMP', 'SSDP', 'Syn', 'TFTP', 'UDP', 'UDP-lag']


Load feature sets and search spaces and enumerate their contents.

In [49]:
feature_sets = pickle.load(open("feature_sets.pickle", 'rb'))
search_spaces = pickle.load(open("hyperparameter_search_spaces.pickle", 'rb'))

print(f"Available Tuning Ranges: {search_spaces.keys()}")

print("Feature Sets for Binary Dataset:")
for key, value in feature_sets["Binary"].items():
    if key == "RFE Sets":
        print(value.keys())

    elif key == "PCA":
        print(f"{key}, suggested variance threshold is {value}")
        
    else:
        print(key)

print("Feature Sets for Multiclass Dataset:")
for key, value in feature_sets["Multiclass"].items():
    if key == "RFE Sets":
        print(value.keys())

    elif key == "PCA":
        print(f"{key}, suggested variance threshold is {value}")

    else:
        print(key)

Available Tuning Ranges: dict_keys(['XGBoost', 'Bagging SVM', 'SVC (RBF)', 'SVC (Poly)', 'Logistic Regression', 'Random Forest', 'KNN', 'Linear SVC', 'Naive Bayes', 'Decision Tree'])
Feature Sets for Binary Dataset:
All
Correlation
Mutual Information
dict_keys(['Decision Tree', 'Random Forest', 'XGBoost', 'Linear SVC', 'Logistic Regression'])
PCA, suggested variance threshold is 0.95
Feature Sets for Multiclass Dataset:
All
Correlation
Mutual Information
dict_keys(['Decision Tree', 'Random Forest', 'XGBoost', 'Logistic Regression'])
PCA, suggested variance threshold is 0.95


In [51]:
"""
This is a helper method to place our performance results in a DataFrame for future analysis.
"""
def format_results_multiclass(y_test, predicted_values, fold_index, fitTime):
    # get scores
    accuracy = accuracy_score(y_test,predicted_values)
    recall_pos = recall_score(y_test, predicted_values, average='macro')
    precision_pos = precision_score(y_test,predicted_values, average='macro')
    f1 = f1_score(y_test,predicted_values, average='macro')

    cols = ["Fitting Time", "accuracy", "Precision", "Recall", "F1 Score"]
    results = [fitTime, accuracy, precision_pos, recall_pos, f1]

    outFrame = pd.DataFrame([results], columns=cols, index=[fold_index])

    return outFrame

This is for tuning on the multiclass set

In [52]:
models = {
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    #"Random Forest": RandomForestClassifier(random_state=42),
    #"XGBoost": xgb.XGBClassifier(random_state=42, num_class=11, objective='multi:softmax'),
    #"Linear SVC": make_pipeline(StandardScaler(), LinearSVC(random_state=42)),
    #"Logistic Regression": make_pipeline(StandardScaler(), LogisticRegression(random_state=42)),
    #"KNN": KNeighborsClassifier(),
    #"Naive Bayes": GaussianNB(),
}

score_methods = ['accuracy']

feature_set = feature_sets["Multiclass"]

y = multiclass[" Label"].copy()
X = multiclass.drop([" Label"], axis=1)

y_test_f = multiclass[" Label"].copy()
X_test_f = multiclass.drop([" Label"], axis=1)

In [22]:
params = []

# This will hold all of our results.
runFrame = None

for name, model in models.items():
        for feature_key, feature_val in feature_set.items():

            # If we're on the RFE sets, check if we have one for this classifier. If not, skip it.
            if feature_key == "RFE Sets":
                if name in feature_val.keys():
                    feature_val = feature_val[name]
                else:
                    continue
            
            for score_method in score_methods:
                opt = BayesSearchCV(estimator=model,search_spaces=search_spaces[name],n_iter=50,scoring=score_method,cv=5,n_jobs=5)
                kf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

                counter = 0

                # Used to hold data for a single run (performance metric)
                perfFrame = None

                for train_index, test_index in kf.split(X, y):

                    counter += 1

                    # PCA requires different logic to create X.
                    if feature_key != "PCA":
                        current_X = X.loc[:, feature_val]
                        X_train, X_test = current_X.iloc[train_index,:], current_X.iloc[test_index,:]
                        Y_train, Y_test = y.iloc[train_index], y.iloc[test_index]

                        current_X_f = X_test_f.loc[:, feature_val]
                        
                    else:
                        pca_trans = PCA(n_components=feature_val, random_state=42)
                        current_X = X.loc[:, X.columns]
                        X_train, X_test = current_X.iloc[train_index,:], current_X.iloc[test_index,:]
                        Y_train, Y_test = y.iloc[train_index], y.iloc[test_index]

                        # Apply PCA to training set and use it to transform test set.
                        X_train = pca_trans.fit_transform(X_train)
                        X_test = pca_trans.transform(X_test)
                        current_X_f = pca_trans.transform(X_test_f)

                        # Convert back to DataFrames
                        pca_cols = ["PC"+str(i) for i in list(range(1, len(X_train[0])+1))]
                        X_train = pd.DataFrame(data=X_train, columns=pca_cols)
                        X_test = pd.DataFrame(data=X_test, columns=pca_cols)
                        current_X_f = pd.DataFrame(data=current_X_f, columns=pca_cols)

                    startTime = time()

                    opt.fit(X_train,Y_train)

                    endTime = time()
                    fitTime = endTime - startTime

                    predicted_values = opt.predict(X_test)

                    # get metrics for this fold.
                    foldFrame = format_results_multiclass(Y_test, predicted_values, counter, fitTime)

                    # Add them to our lists of metric.
                    if perfFrame is None:
                        perfFrame = foldFrame
                    else:
                        perfFrame = pd.concat([perfFrame, foldFrame])

                    # Print a classification report on the testing results.
                    print("Validation Results: ")
                    print(classification_report(Y_test, predicted_values, target_names=multiclass_labels, digits=6))

                    print("Testing Results: ")
                    # Print a classification report on the testing results.
                    pred_test = opt.predict(current_X_f)
                    print(classification_report(y_test_f, pred_test, target_names=multiclass_labels, digits=6))

                    # Add tuple with the best params as well as the related model/config
                    params.append((f"Multiclass {name} {feature_key} {score_method} Fold {counter}", opt.best_params_))

                # Create a new line in the results table that averages all the folds
                perfFrame.loc["fold average"] = perfFrame.mean()

                # Mark the results table with the chosen classifier and the current performance metric.
                perfFrame['metric'] = [score_method for j in range(0,6)]
                perfFrame['Classifier'] = [name for j in range(0,6)]
                perfFrame['Feature Set'] = [feature_key for j in range(0,6)]
                perfFrame['Dataset'] = ["multiclass" for j in range(0,6)]
                print(f"{name} with {feature_key} and {score_method} completed.")

                # Add this run to the table with all runs.
                if runFrame is None:
                    runFrame = perfFrame
                else:
                    runFrame = pd.concat([runFrame, perfFrame])

# Write output file, best parameters, and best models to be used later.
runFrame.to_csv(f"multiclass_results_DT_{time()}.csv")
pickle.dump(params, open(f"multiclass_params_DT_{time()}.pickle", "wb"))

Validation Results: 
              precision    recall  f1-score   support

         DNS   0.747622  0.500937  0.599910      2667
        LDAP   0.609806  0.671541  0.639186      2667
       MSSQL   0.925695  0.948612  0.937014      2666
         NTP   0.989548  0.994374  0.991955      2666
     NetBIOS   0.749513  0.433233  0.549085      2666
     Portmap   0.607853  0.894224  0.723740      2666
        SNMP   0.692718  0.827522  0.754143      2667
        SSDP   0.505972  0.794151  0.618123      2667
         Syn   0.990358  0.539183  0.698228      2667
        TFTP   0.998491  0.992126  0.995298      2667
         UDP   0.558689  0.274841  0.368434      2667
     UDP-lag   0.639281  0.866842  0.735870      2666

    accuracy                       0.728117     31999
   macro avg   0.751296  0.728132  0.717582     31999
weighted avg   0.751291  0.728117  0.717571     31999

Testing Results: 
              precision    recall  f1-score   support

         DNS   0.766974  0.499887  0.60



Validation Results: 
              precision    recall  f1-score   support

         DNS   0.731462  0.517810  0.606367      2667
        LDAP   0.614125  0.675169  0.643202      2666
       MSSQL   0.930894  0.944507  0.937651      2667
         NTP   0.991773  0.994376  0.993072      2667
     NetBIOS   0.755659  0.425572  0.544495      2667
     Portmap   0.605190  0.892013  0.721128      2667
        SNMP   0.704328  0.830083  0.762052      2666
        SSDP   0.540970  0.740435  0.625178      2666
         Syn   0.975317  0.548387  0.702041      2666
        TFTP   0.994737  0.992498  0.993616      2666
         UDP   0.588846  0.403976  0.479199      2666
     UDP-lag   0.642877  0.864642  0.737448      2667

    accuracy                       0.735796     31998
   macro avg   0.756348  0.735789  0.728787     31998
weighted avg   0.756352  0.735796  0.728793     31998

Testing Results: 
              precision    recall  f1-score   support

         DNS   0.766496  0.517513  0.61



Validation Results: 
              precision    recall  f1-score   support

         DNS   0.744880  0.477494  0.581943      2666
        LDAP   0.596154  0.674166  0.632764      2667
       MSSQL   0.922455  0.941132  0.931700      2667
         NTP   0.991035  0.995124  0.993075      2666
     NetBIOS   0.772478  0.370454  0.500760      2667
     Portmap   0.591935  0.924634  0.721791      2667
        SNMP   0.694789  0.840210  0.760611      2666
        SSDP   0.535466  0.722056  0.614918      2666
         Syn   0.962001  0.569554  0.715497      2667
        TFTP   0.996608  0.991376  0.993985      2667
         UDP   0.587107  0.413353  0.485142      2666
     UDP-lag   0.649443  0.853018  0.737439      2667

    accuracy                       0.731054     31999
   macro avg   0.753696  0.731048  0.722469     31999
weighted avg   0.753703  0.731054  0.722474     31999

Testing Results: 
              precision    recall  f1-score   support

         DNS   0.764953  0.492087  0.59



Validation Results: 
              precision    recall  f1-score   support

         DNS   0.734670  0.467367  0.571298      2666
        LDAP   0.590464  0.677915  0.631175      2667
       MSSQL   0.928413  0.943736  0.936012      2666
         NTP   0.992129  0.992873  0.992501      2666
     NetBIOS   0.595632  0.756939  0.666667      2666
     Portmap   0.682678  0.527559  0.595178      2667
        SNMP   0.706068  0.837645  0.766249      2667
        SSDP   0.529972  0.709411  0.606702      2667
         Syn   0.951125  0.554556  0.700616      2667
        TFTP   0.996985  0.991751  0.994361      2667
         UDP   0.577839  0.391076  0.466458      2667
     UDP-lag   0.626398  0.861590  0.725407      2666

    accuracy                       0.726023     31999
   macro avg   0.742698  0.726035  0.721052     31999
weighted avg   0.742693  0.726023  0.721043     31999

Testing Results: 
              precision    recall  f1-score   support

         DNS   0.758056  0.509938  0.60

In [25]:
best = pickle.load(open("multiclass_params_DT_1670610708.3138227.pickle", 'rb'))

In [34]:
for i in range(6, 10):
    print(best[i])

('Multiclass Decision Tree Correlation accuracy Fold 2', OrderedDict([('criterion', 'gini'), ('max_depth', 17), ('max_features', 'sqrt'), ('min_samples_leaf', 1), ('min_samples_split', 2)]))
('Multiclass Decision Tree Correlation accuracy Fold 3', OrderedDict([('criterion', 'gini'), ('max_depth', 15), ('max_features', 'sqrt'), ('min_samples_leaf', 1), ('min_samples_split', 2)]))
('Multiclass Decision Tree Correlation accuracy Fold 4', OrderedDict([('criterion', 'gini'), ('max_depth', 18), ('max_features', 'sqrt'), ('min_samples_leaf', 1), ('min_samples_split', 10)]))
('Multiclass Decision Tree Correlation accuracy Fold 5', OrderedDict([('criterion', 'gini'), ('max_depth', 17), ('max_features', 'log2'), ('min_samples_leaf', 1), ('min_samples_split', 2)]))


In [54]:
final_feature_set = None
for key, value in feature_sets["Multiclass"].items():
    if key == "Correlation":
        final_feature_set = value
        break

print(final_feature_set)

X_fin = X.loc[:, final_feature_set]
x_test_fin = X_test_f.loc[:, final_feature_set]



[' Protocol', ' Flow Duration', 'Total Length of Fwd Packets', ' Fwd Packet Length Max', ' Fwd Packet Length Min', ' Fwd Packet Length Mean', ' Fwd Packet Length Std', 'Bwd Packet Length Max', ' Bwd Packet Length Min', ' Bwd Packet Length Mean', ' Bwd Packet Length Std', 'Flow Bytes/s', ' Flow Packets/s', ' Flow IAT Std', ' Flow IAT Max', 'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max', 'Bwd IAT Total', ' Bwd IAT Min', 'Fwd PSH Flags', 'Fwd Packets/s', ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean', ' Packet Length Std', ' Packet Length Variance', ' RST Flag Count', ' ACK Flag Count', ' URG Flag Count', ' CWE Flag Count', ' Down/Up Ratio', ' Average Packet Size', ' Avg Fwd Segment Size', ' Avg Bwd Segment Size', ' Subflow Fwd Bytes', 'Init_Win_bytes_forward', 'Idle Mean', ' Idle Max', ' Idle Min', ' Inbound', ' Fwd Header Length', ' Fwd Header Length.1', ' min_seg_size_forward', ' Total Fwd Packets', 'Subflow Fwd Packets', ' act_data_pkt_fwd']


In [67]:
clf = DecisionTreeClassifier(random_state=42, criterion = 'gini', max_depth = 19, max_features = 'sqrt', min_samples_leaf = 1, min_samples_split = 3)
clf.fit(X_fin, y)
pred_test = clf.predict(x_test_fin)
print(classification_report(y_test_f, pred_test, target_names=multiclass_labels, digits=6))

              precision    recall  f1-score   support

         DNS   0.783970  0.546539  0.644069     13333
        LDAP   0.623122  0.696917  0.657957     13333
       MSSQL   0.955331  0.962424  0.958864     13333
         NTP   0.996632  0.998875  0.997752     13332
     NetBIOS   0.754892  0.457136  0.569440     13333
     Portmap   0.616356  0.885772  0.726903     13333
        SNMP   0.726267  0.846985  0.781994     13332
        SSDP   0.593582  0.803330  0.682709     13332
         Syn   0.991576  0.556214  0.712666     13333
        TFTP   0.997514  0.993175  0.995340     13333
         UDP   0.699824  0.478062  0.568067     13333
     UDP-lag   0.649585  0.891772  0.751652     13333

    accuracy                       0.759764    159993
   macro avg   0.782388  0.759767  0.753951    159993
weighted avg   0.782388  0.759764  0.753950    159993

