In [2]:
import pandas as pd
from tqdm import tqdm
from pyarrow import csv
import pyarrow as pa

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import matplotlib.pyplot as plt
import plotly.graph_objects as go

from catboost import CatBoostClassifier
from sklearn.manifold import TSNE
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import QuantileTransformer, MinMaxScaler, StandardScaler
from sklearn.metrics import auc, accuracy_score, precision_score, recall_score, roc_curve, precision_recall_curve

In [3]:
datasets = {
    'NF-UNSW-NB15-V2' : './NF-UNSW-NB15-V2.parquet',
}

features_to_remove = ['L4_SRC_PORT', 'L4_DST_PORT', 'Attack', 'Label']

scaler = MinMaxScaler()

x_train = {}
x_val = {}
x_test = {}

In [4]:
for key, value in datasets.items():
    print(f'Processing {key}')
    print('='*20 + '\n')
    df = pd.read_parquet(value)
    Y = df.Label
    df = df[['IN_PKTS', 'IN_BYTES', 'TCP_FLAGS', 'PROTOCOL','L4_SRC_PORT', 'L4_DST_PORT', 'Attack', 'Label']]
    X_train, X_test, y_train, y_val = train_test_split(df, Y, test_size=0.3,stratify=df.Attack, random_state=42)
    del df
    del Y
    gc.collect()
    X_val, X_test, y_val, y_test = train_test_split(X_test, X_test.Label, test_size=0.15, stratify=X_test.Attack, random_state=42)
    X_train = X_train[X_train.Label==0].drop(columns=features_to_remove, axis=1)
    X_val.drop(columns=features_to_remove, axis=1, inplace=True)
    X_test.drop(columns=features_to_remove, axis=1, inplace=True)
    print(X_val)
    x_train[key] = scaler.fit_transform(X_train)
    x_val[key] = (scaler.transform(X_val), y_val)
    x_test[key] = (scaler.transform(X_test), y_test)
    del X_train
    del X_val
    del X_test
    gc.collect()
print()
print('Finished processing data sources.\n')

Processing NF-UNSW-NB15-V2

         IN_PKTS  IN_BYTES  TCP_FLAGS  PROTOCOL
839624        45      2487         25         6
1745550        6       320         27         6
1423092       48      2974         27         6
736359        54      3302         27         6
497973         6       320         27         6
...          ...       ...        ...       ...
68583         23      1251         24         6
253644        44      2750         27         6
32452         23      1251         24         6
1376770       48      2958         27         6
178848        44      2750         27         6

[506620 rows x 4 columns]

Finished processing data sources.



In [5]:
x_val['NF-UNSW-NB15-V2'][0].shape

(506620, 4)

In [6]:
from catboost import CatBoostClassifier

In [7]:
y_val

839624     0
1745550    0
1423092    0
736359     0
497973     0
          ..
68583      0
253644     0
32452      0
1376770    0
178848     0
Name: Label, Length: 506620, dtype: int8

In [8]:
x_test['NF-UNSW-NB15-V2'][1]

267949     0
1390945    0
1575231    0
1699375    0
975300     0
          ..
1010601    0
1313539    0
1427466    0
533840     0
665744     0
Name: Label, Length: 89404, dtype: int8

In [9]:
clf = CatBoostClassifier(random_seed=42, depth=4, custom_metric=['AUC'], iterations=100, learning_rate=0.2, subsample=0.95, bootstrap_type='Bernoulli')
clf.fit(x_val['NF-UNSW-NB15-V2'][0], x_val['NF-UNSW-NB15-V2'][1], verbose = False, early_stopping_rounds=100) 

<catboost.core.CatBoostClassifier at 0x7f1fad0c4eb0>

In [10]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [11]:
tn, fp, fn, tp = confusion_matrix(x_test['NF-UNSW-NB15-V2'][1], clf.predict(x_test['NF-UNSW-NB15-V2'][0])).ravel()

In [12]:
tn, fp, fn, tp

(85551, 474, 471, 2908)

In [82]:
accuracy_score(x_test['NF-UNSW-NB15-V2'][1], clf.predict(x_test['NF-UNSW-NB15-V2'][0]))

0.9925059281463917

In [14]:
clf.predict(x_test['NF-UNSW-NB15-V2'][0])[clf.predict(x_test['NF-UNSW-NB15-V2'][0]) == 1].shape

(3382,)

In [87]:
test_df = pd.read_csv('gathered_newer.csv')[['IN_PACKETS', 'IN_OCTETS', 'TCP_FLAGS', 'PROTO']]

In [88]:
test = scaler.transform(test_df.rename(columns={'IN_PACKETS': 'IN_PKTS', 'IN_OCTETS': 'IN_BYTES', 'TCP_FLAGS': 'TCP_FLAGS', 'PROTO': 'PROTOCOL'}))

In [17]:
x_val['NF-UNSW-NB15-V2'][0]

array([[8.15872427e-03, 3.44502217e-04, 8.06451613e-01, 3.81679389e-02],
       [9.27127758e-04, 4.42060367e-05, 8.70967742e-01, 3.81679389e-02],
       [8.71500093e-03, 4.11989176e-04, 8.70967742e-01, 3.81679389e-02],
       ...,
       [4.07936214e-03, 1.73221147e-04, 7.74193548e-01, 3.81679389e-02],
       [8.71500093e-03, 4.09771945e-04, 8.70967742e-01, 3.81679389e-02],
       [7.97329872e-03, 3.80947946e-04, 8.70967742e-01, 3.81679389e-02]])

In [18]:
df = pd.read_parquet(value)

In [19]:
test

array([[1.85425552e-04, 7.41386509e-05, 0.00000000e+00, 1.22137405e-01],
       [1.85425552e-04, 1.08505726e-04, 0.00000000e+00, 1.22137405e-01],
       [3.70851103e-04, 1.85554493e-04, 0.00000000e+00, 1.22137405e-01],
       ...,
       [1.48340441e-03, 5.67333901e-04, 6.45161290e-02, 3.81679389e-02],
       [1.48340441e-03, 5.67333901e-04, 6.45161290e-02, 3.81679389e-02],
       [5.56276655e-04, 3.74157678e-05, 7.74193548e-01, 3.81679389e-02]])

In [20]:
df['TCP_FLAGS']

0          24
1          24
2          24
3          24
4          24
           ..
1986740    27
1986741    27
1986742    26
1986743    27
1986744    26
Name: TCP_FLAGS, Length: 1986745, dtype: int8

In [21]:
df.columns

Index(['L4_SRC_PORT', 'L4_DST_PORT', 'PROTOCOL', 'L7_PROTO', 'IN_BYTES',
       'IN_PKTS', 'OUT_BYTES', 'OUT_PKTS', 'TCP_FLAGS', 'CLIENT_TCP_FLAGS',
       'SERVER_TCP_FLAGS', 'FLOW_DURATION_MILLISECONDS', 'DURATION_IN',
       'DURATION_OUT', 'MIN_TTL', 'MAX_TTL', 'LONGEST_FLOW_PKT',
       'SHORTEST_FLOW_PKT', 'MIN_IP_PKT_LEN', 'MAX_IP_PKT_LEN',
       'SRC_TO_DST_SECOND_BYTES', 'DST_TO_SRC_SECOND_BYTES',
       'RETRANSMITTED_IN_BYTES', 'RETRANSMITTED_IN_PKTS',
       'RETRANSMITTED_OUT_BYTES', 'RETRANSMITTED_OUT_PKTS',
       'SRC_TO_DST_AVG_THROUGHPUT', 'DST_TO_SRC_AVG_THROUGHPUT',
       'NUM_PKTS_UP_TO_128_BYTES', 'NUM_PKTS_128_TO_256_BYTES',
       'NUM_PKTS_256_TO_512_BYTES', 'NUM_PKTS_512_TO_1024_BYTES',
       'NUM_PKTS_1024_TO_1514_BYTES', 'TCP_WIN_MAX_IN', 'TCP_WIN_MAX_OUT',
       'ICMP_TYPE', 'ICMP_IPV4_TYPE', 'DNS_QUERY_ID', 'DNS_QUERY_TYPE',
       'DNS_TTL_ANSWER', 'FTP_COMMAND_RET_CODE', 'Label', 'Attack'],
      dtype='object')

In [22]:
# IN_PACKETS -> IN_PKTS
# IN_OCTETS -> IN_BYTES
# TCP_FLAGS -> TCP_FLAGS
# PROTO -> PROTOCOL

In [23]:
df[['IN_PKTS', 'IN_BYTES', 'TCP_FLAGS', 'PROTOCOL']]

Unnamed: 0,IN_PKTS,IN_BYTES,TCP_FLAGS,PROTOCOL
0,1,9,24,6
1,5,261,24,6
2,9,481,24,6
3,13,701,24,6
4,19,1031,24,6
...,...,...,...,...
1986740,12,1064,27,6
1986741,12,1064,27,6
1986742,10,994,26,6
1986743,68,4014,27,6


In [24]:
x_val

{'NF-UNSW-NB15-V2': (array([[8.15872427e-03, 3.44502217e-04, 8.06451613e-01, 3.81679389e-02],
         [9.27127758e-04, 4.42060367e-05, 8.70967742e-01, 3.81679389e-02],
         [8.71500093e-03, 4.11989176e-04, 8.70967742e-01, 3.81679389e-02],
         ...,
         [4.07936214e-03, 1.73221147e-04, 7.74193548e-01, 3.81679389e-02],
         [8.71500093e-03, 4.09771945e-04, 8.70967742e-01, 3.81679389e-02],
         [7.97329872e-03, 3.80947946e-04, 8.70967742e-01, 3.81679389e-02]]),
  839624     0
  1745550    0
  1423092    0
  736359     0
  497973     0
            ..
  68583      0
  253644     0
  32452      0
  1376770    0
  178848     0
  Name: Label, Length: 506620, dtype: int8)}

In [114]:
sum(clf.predict(test) == 0)

1706

In [115]:
sum(clf.predict(test) == 1)

18

In [27]:
x_val['NF-UNSW-NB15-V2'][0]

array([[8.15872427e-03, 3.44502217e-04, 8.06451613e-01, 3.81679389e-02],
       [9.27127758e-04, 4.42060367e-05, 8.70967742e-01, 3.81679389e-02],
       [8.71500093e-03, 4.11989176e-04, 8.70967742e-01, 3.81679389e-02],
       ...,
       [4.07936214e-03, 1.73221147e-04, 7.74193548e-01, 3.81679389e-02],
       [8.71500093e-03, 4.09771945e-04, 8.70967742e-01, 3.81679389e-02],
       [7.97329872e-03, 3.80947946e-04, 8.70967742e-01, 3.81679389e-02]])

In [28]:
test

array([[1.85425552e-04, 7.41386509e-05, 0.00000000e+00, 1.22137405e-01],
       [1.85425552e-04, 1.08505726e-04, 0.00000000e+00, 1.22137405e-01],
       [3.70851103e-04, 1.85554493e-04, 0.00000000e+00, 1.22137405e-01],
       ...,
       [1.48340441e-03, 5.67333901e-04, 6.45161290e-02, 3.81679389e-02],
       [1.48340441e-03, 5.67333901e-04, 6.45161290e-02, 3.81679389e-02],
       [5.56276655e-04, 3.74157678e-05, 7.74193548e-01, 3.81679389e-02]])

In [29]:
x_val['NF-UNSW-NB15-V2'][1]

839624     0
1745550    0
1423092    0
736359     0
497973     0
          ..
68583      0
253644     0
32452      0
1376770    0
178848     0
Name: Label, Length: 506620, dtype: int8

In [60]:
val_added = np.concatenate((x_val['NF-UNSW-NB15-V2'][0], test), axis=0)

In [66]:
val_added = np.concatenate((val_added, test), axis=0)

In [67]:
add_0 = np.zeros(val_added.shape[0]-x_val['NF-UNSW-NB15-V2'][1].shape[0])

In [61]:
add_1 = np.ones(val_added.shape[0]-x_val['NF-UNSW-NB15-V2'][1].shape[0])

In [32]:
x_val['NF-UNSW-NB15-V2'][1]

839624     0
1745550    0
1423092    0
736359     0
497973     0
          ..
68583      0
253644     0
32452      0
1376770    0
178848     0
Name: Label, Length: 506620, dtype: int8

In [62]:
val_Y_add = np.concatenate((x_val['NF-UNSW-NB15-V2'][1], add_1), axis=0)

In [68]:
val_Y_add = np.concatenate((val_Y_add, add_0), axis=0)

In [69]:
val_Y_add.shape

(519376,)

In [70]:
clf = CatBoostClassifier(random_seed=42, depth=4, custom_metric=['AUC'], iterations=1000, learning_rate=0.2, subsample=0.95, bootstrap_type='Bernoulli')
clf.fit(val_added, val_Y_add, verbose = False, early_stopping_rounds=100) 

CatBoostError: Length of label=519376 and length of data=518640 is different.

In [36]:
val_added

array([[8.15872427e-03, 3.44502217e-04, 8.06451613e-01, 3.81679389e-02],
       [9.27127758e-04, 4.42060367e-05, 8.70967742e-01, 3.81679389e-02],
       [8.71500093e-03, 4.11989176e-04, 8.70967742e-01, 3.81679389e-02],
       ...,
       [1.48340441e-03, 5.67333901e-04, 6.45161290e-02, 3.81679389e-02],
       [1.48340441e-03, 5.67333901e-04, 6.45161290e-02, 3.81679389e-02],
       [5.56276655e-04, 3.74157678e-05, 7.74193548e-01, 3.81679389e-02]])

In [37]:
val_Y_add

array([0., 0., 0., ..., 0., 0., 0.])

In [71]:
benign = pd.read_csv('benign.csv')[['IN_PACKETS', 'IN_OCTETS', 'TCP_FLAGS', 'PROTO']]
benign = scaler.transform(benign.rename(columns={'IN_PACKETS': 'IN_PKTS', 'IN_OCTETS': 'IN_BYTES', 'TCP_FLAGS': 'TCP_FLAGS', 'PROTO': 'PROTOCOL'}))

ddos = pd.read_csv('hping3-2.csv')[['IN_PACKETS', 'IN_OCTETS', 'TCP_FLAGS', 'PROTO']]
ddos = scaler.transform(ddos.rename(columns={'IN_PACKETS': 'IN_PKTS', 'IN_OCTETS': 'IN_BYTES', 'TCP_FLAGS': 'TCP_FLAGS', 'PROTO': 'PROTOCOL'}))

In [72]:
val_added = np.concatenate((x_val['NF-UNSW-NB15-V2'][0], benign), axis=0)
val_added = np.concatenate((val_added, ddos), axis=0)

In [77]:
add_0 = np.zeros(benign.shape[0])
add_1 = np.ones(ddos.shape[0])
val_Y_add = np.concatenate((x_val['NF-UNSW-NB15-V2'][1], add_0), axis=0)
val_Y_add = np.concatenate((val_Y_add, add_1), axis=0)

In [98]:
val_added.shape, val_Y_add.shape

((518640, 4), (518640,))

In [112]:
clf = CatBoostClassifier(random_seed=42, depth=4, custom_metric=['AUC'], iterations=1000, learning_rate=0.2, subsample=0.95, bootstrap_type='Bernoulli')
clf.fit(val_added, val_Y_add, verbose = False, early_stopping_rounds=100) 

<catboost.core.CatBoostClassifier at 0x7f1fa4030b80>

In [113]:
accuracy_score(x_test['NF-UNSW-NB15-V2'][1], clf.predict(x_test['NF-UNSW-NB15-V2'][0]))

0.9925059281463917

In [104]:
clf = CatBoostClassifier(random_seed=42, depth=4, custom_metric=['AUC'], iterations=1000, learning_rate=0.2, subsample=0.95, bootstrap_type='Bernoulli')
clf.fit(x_val['NF-UNSW-NB15-V2'][0], x_val['NF-UNSW-NB15-V2'][1], verbose = False, early_stopping_rounds=100) 

<catboost.core.CatBoostClassifier at 0x7f1fa4030250>

In [109]:
accuracy_score(x_test['NF-UNSW-NB15-V2'][1], clf.predict(x_test['NF-UNSW-NB15-V2'][0]))

0.9613887521811104

In [99]:
gathered = np.concatenate((benign, ddos), axis=0)

In [100]:
gathered_y = np.concatenate((add_0, add_1), axis=0)

In [108]:
clf = CatBoostClassifier(random_seed=42, depth=4, custom_metric=['AUC'], iterations=1000, learning_rate=0.2, subsample=0.95, bootstrap_type='Bernoulli')
clf.fit(gathered, gathered_y, verbose = False, early_stopping_rounds=100) 

<catboost.core.CatBoostClassifier at 0x7f1fa40301c0>

In [140]:
half_nfq_mask = np.concatenate((x_val['NF-UNSW-NB15-V2'][0], np.zeros(x_val['NF-UNSW-NB15-V2'][0].shape[0]).reshape((x_val['NF-UNSW-NB15-V2'][0].shape[0], 1))), axis=1) 

In [141]:
half_self_mask = np.concatenate((benign, ddos), axis=0)

In [143]:
half_self_mask

array([[1.85425552e-04, 7.41386509e-05, 0.00000000e+00, 1.22137405e-01],
       [1.85425552e-04, 1.08505726e-04, 0.00000000e+00, 1.22137405e-01],
       [3.70851103e-04, 1.85554493e-04, 0.00000000e+00, 1.22137405e-01],
       ...,
       [0.00000000e+00, 9.14607656e-06, 0.00000000e+00, 1.22137405e-01],
       [0.00000000e+00, 9.14607656e-06, 0.00000000e+00, 1.22137405e-01],
       [0.00000000e+00, 9.14607656e-06, 0.00000000e+00, 1.22137405e-01]])

In [145]:
half_self_mask = np.concatenate((half_self_mask, np.ones(half_self_mask.shape[0]).reshape((half_self_mask.shape[0], 1))), axis=1) 

In [146]:
half_self_mask

array([[1.85425552e-04, 7.41386509e-05, 0.00000000e+00, 1.22137405e-01,
        1.00000000e+00],
       [1.85425552e-04, 1.08505726e-04, 0.00000000e+00, 1.22137405e-01,
        1.00000000e+00],
       [3.70851103e-04, 1.85554493e-04, 0.00000000e+00, 1.22137405e-01,
        1.00000000e+00],
       ...,
       [0.00000000e+00, 9.14607656e-06, 0.00000000e+00, 1.22137405e-01,
        1.00000000e+00],
       [0.00000000e+00, 9.14607656e-06, 0.00000000e+00, 1.22137405e-01,
        1.00000000e+00],
       [0.00000000e+00, 9.14607656e-06, 0.00000000e+00, 1.22137405e-01,
        1.00000000e+00]])

In [148]:
masked_data = np.concatenate((half_nfq_mask, half_self_mask), axis=0)

In [149]:
masked_data.shape

(518640, 5)

In [150]:
val_Y_add.shape

(518640,)

In [155]:
clf = CatBoostClassifier(random_seed=42, depth=4, custom_metric=['AUC'], iterations=1000, learning_rate=0.2, subsample=0.95, bootstrap_type='Bernoulli')
clf.fit(masked_data, val_Y_add, verbose = False, early_stopping_rounds=100) 

<catboost.core.CatBoostClassifier at 0x7f1fa40302e0>

In [156]:
test_nfq_masked = np.concatenate((x_test['NF-UNSW-NB15-V2'][0], np.zeros(x_test['NF-UNSW-NB15-V2'][0].shape[0]).reshape((x_test['NF-UNSW-NB15-V2'][0].shape[0], 1))), axis=1) 

In [157]:
accuracy_score(x_test['NF-UNSW-NB15-V2'][1], clf.predict(test_nfq_masked))

0.9929421502393629

In [154]:
half_nfq_mask.shape

(506620, 5)

In [158]:
test_df = pd.read_csv('gathered_newer.csv')[['IN_PACKETS', 'IN_OCTETS', 'TCP_FLAGS', 'PROTO']]

In [159]:
test_df = scaler.transform(test_df.rename(columns={'IN_PACKETS': 'IN_PKTS', 'IN_OCTETS': 'IN_BYTES', 'TCP_FLAGS': 'TCP_FLAGS', 'PROTO': 'PROTOCOL'}))

In [161]:
test_df_masked = np.concatenate((test_df, np.ones(test_df.shape[0]).reshape((test_df.shape[0], 1))), axis=1) 

In [163]:
sum(clf.predict(test_df_masked) == 0)

1717

In [164]:
sum(clf.predict(test_df_masked) == 1)

7