In [14]:
import pandas as pd
from sklearn.feature_selection import RFECV

# G-Mean
from imblearn.metrics import  geometric_mean_score
from sklearn.metrics import make_scorer

# Balanced Bagging
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.preprocessing import LabelEncoder

import numpy as np
from sklearn.model_selection import StratifiedKFold

from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsOneClassifier

from sklearn.metrics import multilabel_confusion_matrix

from sklearn.metrics import classification_report

import xgboost as xgb

from skopt import BayesSearchCV
from skopt.space import Categorical, Integer, Real

In [3]:
# Load the raw flow records from the CSV sitting next to the notebook.
df = pd.read_csv("test_poc.csv")

# Shuffle the rows. The seed is fixed because the StratifiedKFold used later
# does not shuffle on its own, so the row order produced here decides which
# samples land in which fold; without a seed every run yields different folds.
df = df.sample(frac=1, random_state=42)

# Class distribution of the raw data (before any cleaning).
df[' Label'].value_counts()

  df = pd.read_csv("test_poc.csv")


MSSQL      7000
Portmap    7000
UDP        7000
NetBIOS    7000
LDAP       7000
Syn        7000
BENIGN     7000
UDPLag     1873
Name:  Label, dtype: int64

In [4]:
# Preview the first rows of the loaded frame.
df.head()
# Debug aid: uncomment the next line to list every column name.
#print(df.columns)

Unnamed: 0.9,Unnamed: 0.8,Unnamed: 0,Flow ID,Source IP,Source Port,Destination IP,Destination Port,Protocol,Timestamp,Flow Duration,...,SimillarHTTP,Inbound,Label,Unnamed: 0.7,Unnamed: 0.6,Unnamed: 0.5,Unnamed: 0.4,Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1
37994,37994,44522,172.16.0.5-192.168.50.4-61850-507-17,172.16.0.5,61850,192.168.50.4,507,17,2018-11-03 10:34:02.755114,1,...,0,1,MSSQL,35121.0,27121.0,20121.0,19121.0,11121.0,3121.0,
26479,26479,191606,172.16.0.5-192.168.50.4-648-34797-17,172.16.0.5,648,192.168.50.4,34797,17,2018-11-03 10:01:26.538009,0,...,0,1,Portmap,23606.0,15606.0,8606.0,7606.0,,,
7190,7190,75139,172.16.0.5-192.168.50.4-35719-43454-17,172.16.0.5,35719,192.168.50.4,43454,17,2018-11-03 10:52:44.663251,3,...,0,1,UDP,4317.0,,,,,,
28678,28678,320541,172.16.0.5-192.168.50.4-589-58680-17,172.16.0.5,589,192.168.50.4,58680,17,2018-11-03 10:01:49.021367,1,...,0,1,NetBIOS,25805.0,17805.0,10805.0,9805.0,1805.0,,
29300,29300,116384,172.16.0.5-192.168.50.4-663-41523-17,172.16.0.5,663,192.168.50.4,41523,17,2018-11-03 10:01:49.090571,1,...,0,1,NetBIOS,26427.0,18427.0,11427.0,10427.0,2427.0,,


In [5]:
# Columns removed before modelling: the index-like "Unnamed: ..." columns and
# the per-flow identifiers (flow id, endpoints, ports, timestamp).
# NOTE(review): the preview of the raw frame also shows an "Unnamed: 0.9"
# column that is not listed here -- confirm whether it should be dropped too.
drop_cols = [
    "Unnamed: 0.8",
    "Unnamed: 0.7",
    "Unnamed: 0.6",
    "Unnamed: 0.5",
    "Unnamed: 0.4",
    "Unnamed: 0.3",
    "Unnamed: 0.2",
    "Unnamed: 0.1",
    "Unnamed: 0",
    "Flow ID",
    " Source IP",
    " Destination IP",
    " Timestamp",
    " Source Port",
    " Destination Port"
]

df = df.drop(columns=drop_cols)

# Coerce every feature column to a numeric dtype first; values that cannot be
# parsed become NaN. The label column stays as text.
feature_cols = [col for col in df.columns if col != " Label"]
for col in feature_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Mask +/-inf only AFTER the coercion: pd.to_numeric parses strings such as
# "Infinity" into inf, so masking beforehand would let those values survive
# into the feature matrix. A single dropna then removes rows with missing,
# unparseable or infinite values.
df = df.replace([np.inf, -np.inf], np.nan).dropna(axis=0)

In [6]:
# Preview the frame after the column drop and row cleaning above.
df.head()

Unnamed: 0,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,SimillarHTTP,Inbound,Label
37994,17,1,2,0,862.0,0.0,431.0,431.0,431.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,MSSQL
7190,17,3,2,0,750.0,0.0,375.0,375.0,375.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,UDP
28678,17,1,2,0,422.0,0.0,211.0,211.0,211.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,NetBIOS
29300,17,1,2,0,458.0,0.0,229.0,229.0,229.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,NetBIOS
29798,17,1,2,0,458.0,0.0,229.0,229.0,229.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,NetBIOS


In [7]:
# Scorer built from imblearn's geometric mean score; it is passed as the
# `scoring` argument of the feature-selection and hyper-parameter searches.
gmean_scorer = make_scorer(geometric_mean_score)

# Separate the feature matrix from the target column.
X = df.drop(columns=" Label")
y = df[" Label"].copy()

In [8]:
# Class counts of the target after cleaning.
y.value_counts()

UDP        6992
LDAP       6985
MSSQL      6745
BENIGN     6728
Portmap    6708
NetBIOS    6683
Syn        6599
UDPLag     1873
Name:  Label, dtype: int64

In [9]:
# Learn the class-name -> integer mapping, then apply it to the target.
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

# Distinct integer codes present in the encoded target.
np.unique(y_encoded)

array([0, 1, 2, 3, 4, 5, 6, 7])

In [10]:
# Recursive feature elimination with cross-validation, driven by an XGBoost
# classifier and scored with the geometric-mean scorer.
rfecv = RFECV(
    estimator=xgb.XGBClassifier(objective="multi:softmax", num_class=8),
    scoring=gmean_scorer,
    n_jobs=5,
)

rfecv.fit(X, y_encoded)

# Pull features and relevant metrics.
# `grid_scores_` was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer `cv_results_["mean_test_score"]` and only fall back to the old
# attribute on releases that do not expose `cv_results_`.
if hasattr(rfecv, "cv_results_"):
    scores = np.asarray(rfecv.cv_results_["mean_test_score"]).tolist()
else:
    scores = rfecv.grid_scores_.tolist()
features = X.columns[rfecv.support_].tolist()
ranks = rfecv.ranking_.tolist()

# Assemble results into dictionary.
# NOTE(review): the "feature importances" entry actually holds the
# cross-validation scores per candidate feature count, not per-feature
# importances; the key is kept unchanged so existing lookups keep working.
outDict = {
    "features": features,
    "feature importances": scores,
    "ranking": ranks,
}



In [11]:
# Number of selected features on the first line, their names on the second.
print(len(outDict['features']), outDict['features'], sep="\n")

57
[' Protocol', ' Flow Duration', ' Total Fwd Packets', ' Total Backward Packets', 'Total Length of Fwd Packets', ' Total Length of Bwd Packets', ' Fwd Packet Length Max', ' Fwd Packet Length Min', ' Fwd Packet Length Mean', ' Fwd Packet Length Std', 'Bwd Packet Length Max', ' Bwd Packet Length Min', ' Bwd Packet Length Mean', ' Bwd Packet Length Std', 'Flow Bytes/s', ' Flow Packets/s', ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min', 'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max', ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean', ' Bwd IAT Std', ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags', ' Fwd Header Length', ' Bwd Header Length', 'Fwd Packets/s', ' Bwd Packets/s', ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean', ' Packet Length Std', ' SYN Flag Count', ' ACK Flag Count', ' URG Flag Count', ' CWE Flag Count', ' Down/Up Ratio', ' Average Packet Size', 'Init_Win_bytes_forward', ' Init_Win_bytes_backward', ' act_data_pkt_fwd'

In [12]:
# Restrict the feature matrix to the columns kept by the feature selection.
current_X = X[outDict["features"]]
# Wrap the encoded target in a Series so folds can be sliced with .iloc.
y_encoded = pd.Series(y_encoded)

In [18]:

    # Base estimator and the hyper-parameter space explored by the Bayesian search.
    # NOTE(review): "lambda" and "alpha" are the native XGBoost names; the
    # scikit-learn wrapper documents them as reg_lambda / reg_alpha -- confirm
    # the installed xgboost version honours these aliases.
    model = {
        "estimator": xgb.XGBClassifier(objective="multi:softmax", num_class=8),
        "search_spaces": {
            "max_depth": Integer(1, 10),
            "gamma": Real(0.1, 10, prior="log-uniform"),
            "subsample": Real(0.5, 1, prior="log-uniform"),
            "min_child_weight": Integer(1, 10),
            "colsample_bytree": Real(0.5, 1, prior="log-uniform"),
            "learning_rate": Real(0.1, 1, prior="log-uniform"),
            "max_delta_step": Integer(0, 10),
            "lambda": Integer(1, 3),
            "alpha": Integer(0, 2),
        },
    }

    # 50 search iterations, each scored by 5-fold CV with the geometric-mean
    # scorer. random_state pins the optimizer's sampling so repeated runs
    # explore the same candidate sequence.
    bayes_search_cv = BayesSearchCV(
        estimator=model["estimator"],
        search_spaces=model["search_spaces"],
        n_iter=50,
        scoring=gmean_scorer,
        cv=5,
        n_jobs=5,
        random_state=42,
    )

In [22]:
# Class names in the order of their integer codes 0..7.
list(le.inverse_transform(list(range(8))))

['BENIGN', 'LDAP', 'MSSQL', 'NetBIOS', 'Portmap', 'Syn', 'UDP', 'UDPLag']

In [23]:
# Outer stratified 5-fold evaluation: for every fold the Bayesian search is
# re-fitted on the training part and evaluated on the held-out part.
# NOTE(review): current_X contains features selected by an RFECV that was
# fitted on the full data set, so the held-out folds already influenced the
# feature selection -- confirm this is acceptable for the reported scores.
k = 5
kf = StratifiedKFold(n_splits=k, random_state=None)

counter = 0

for counter, (train_index, test_index) in enumerate(kf.split(current_X, y_encoded), start=1):
    # Positional slicing: the split yields integer positions, not index labels.
    X_train = current_X.iloc[train_index, :]
    X_test = current_X.iloc[test_index, :]
    y_train = y_encoded.iloc[train_index]
    y_test = y_encoded.iloc[test_index]

    bayes_search_cv.fit(X_train, y_train)
    predicted_values = bayes_search_cv.predict(X_test)

    # One 2x2 confusion matrix per class.
    cm = multilabel_confusion_matrix(y_test, predicted_values)

    print(f"Fold {counter}:\n")
    print(cm)
    print(classification_report(y_test, predicted_values))

Fold 1:

[[[8510    8]
  [   0 1345]]

 [[8437   29]
  [  14 1383]]

 [[8486   28]
  [  63 1286]]

 [[8527    0]
  [   0 1336]]

 [[8479   42]
  [  25 1317]]

 [[8543    0]
  [   0 1320]]

 [[8397   67]
  [  39 1360]]

 [[9452   36]
  [  69  306]]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      1345
           1       0.98      0.99      0.98      1397
           2       0.98      0.95      0.97      1349
           3       1.00      1.00      1.00      1336
           4       0.97      0.98      0.98      1342
           5       1.00      1.00      1.00      1320
           6       0.95      0.97      0.96      1399
           7       0.89      0.82      0.85       375

    accuracy                           0.98      9863
   macro avg       0.97      0.96      0.97      9863
weighted avg       0.98      0.98      0.98      9863

Fold 2:

[[[8512    6]
  [   0 1345]]

 [[8442   24]
  [  13 1384]]

 [[8489   25]
  [  68 1281]]

 