In [1]:
import pandas as pd
from tqdm import tqdm
from pyarrow import csv
import pyarrow as pa

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import matplotlib.pyplot as plt
import plotly.graph_objects as go

from catboost import CatBoostClassifier
from sklearn.manifold import TSNE
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import QuantileTransformer, MinMaxScaler, StandardScaler
from sklearn.metrics import auc, accuracy_score, precision_score, recall_score, roc_curve, precision_recall_curve

In [2]:
# Dataset name -> parquet file that the processing loop reads.
datasets = {'NF-UNSW-NB15-V2': './NF-UNSW-NB15-V2.parquet'}

# Columns dropped from every split before scaling. 'Attack' and 'Label' are
# still read from the raw frame for stratification and for the labels.
features_to_remove = [
    'L4_SRC_PORT',
    'L4_DST_PORT',
    'Attack',
    'Label',
]

# Single scaler instance shared by the processing loop and by prep_data's callers.
scaler = MinMaxScaler()

# Per-dataset results: x_train[key] is a scaled matrix, while x_val[key] and
# x_test[key] are (scaled matrix, labels) pairs.
x_train, x_val, x_test = {}, {}, {}

In [3]:
# Build the scaled train / validation / test matrices for every configured dataset.
# NOTE: `scaler` is re-fitted on each iteration, so after the loop it reflects the
# last dataset processed.
for key, value in datasets.items():
    print(f'Processing {key}')
    print('='*20 + '\n')
    df = pd.read_parquet(value)

    # First split: 70% training pool / 30% hold-out, stratified on the attack
    # category. Only the frames are needed here; the labels stay inside them.
    X_train, X_holdout = train_test_split(
        df, test_size=0.3, stratify=df.Attack, random_state=42
    )
    del df
    gc.collect()

    # Second split of the hold-out: 85% becomes the validation set and 15% the
    # test set (test_size applies to the hold-out, not to the full dataset).
    X_val, X_test, y_val, y_test = train_test_split(
        X_holdout,
        X_holdout.Label,
        test_size=0.15,
        stratify=X_holdout.Attack,
        random_state=42,
    )
    del X_holdout

    # The training matrix keeps only rows with Label == 0; the validation and
    # test sets keep every row. Dropping returns new frames instead of mutating
    # the objects handed back by train_test_split.
    X_train = X_train[X_train.Label == 0].drop(columns=features_to_remove)
    X_val = X_val.drop(columns=features_to_remove)
    X_test = X_test.drop(columns=features_to_remove)

    # The scaler's min/max come from the Label == 0 training rows only, so
    # validation/test values are not guaranteed to land inside [0, 1].
    x_train[key] = scaler.fit_transform(X_train)
    x_val[key] = (scaler.transform(X_val), y_val)
    x_test[key] = (scaler.transform(X_test), y_test)

    del X_train, X_val, X_test
    gc.collect()
print()
print('Finished processing data sources.\n')

Processing NF-UNSW-NB15-V2


Finished processing data sources.



In [4]:
# Release the scaled training matrices: the cells visible below only read
# x_val and x_test. Re-running this cell on its own raises NameError because
# the name is already gone.
del x_train

In [5]:
# The classifiers below are trained on the *validation* split (x_val) and
# scored on the test split (x_test).
nf_key = 'NF-UNSW-NB15-V2'
(train_nf_x, train_nf_y), (test_nf_x, test_nf_y) = x_val[nf_key], x_test[nf_key]

In [6]:
# Show the scaled test matrix together with its label Series.
test_nf_x, test_nf_y

(array([[3.81679389e-02, 0.00000000e+00, 5.86180361e-05, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        [3.81679389e-02, 0.00000000e+00, 3.66535947e-04, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        [3.81679389e-02, 0.00000000e+00, 4.59105328e-04, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        ...,
        [3.81679389e-02, 2.85714286e-02, 2.54551940e-03, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        [3.81679389e-02, 0.00000000e+00, 1.43427110e-04, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        [3.81679389e-02, 0.00000000e+00, 4.14206407e-04, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00]]),
 267949     0
 1390945    0
 1575231    0
 1699375    0
 975300     0
           ..
 1010601    0
 1313539    0
 1427466    0
 533840     0
 665744     0
 Name: Label, Length: 89404, dtype: int8)

In [7]:
def mask(df, value):
    """Append a constant column holding ``value`` to the right of ``df``.

    Parameters
    ----------
    df : array-like of shape (n_rows, n_cols)
        Two-dimensional feature matrix (NumPy array or DataFrame).
    value : scalar
        Constant written into the new last column of every row.

    Returns
    -------
    numpy.ndarray of shape (n_rows, n_cols + 1)
        ``df`` with the extra column appended.
    """
    features = np.asarray(df)
    # np.full allocates the (n_rows, 1) column in one call and stays 2-D when
    # n_rows == 0; a Python list of per-row arrays would collapse to a 1-D
    # empty array there and make the concatenate below fail.
    flag_column = np.full((features.shape[0], 1), value)
    return np.concatenate((features, flag_column), axis=1)

def prep_data(df, value, pad_col, scaler):
    """Zero-pad ``df`` to ``pad_col`` columns, scale it, then append a flag column.

    Parameters
    ----------
    df : array-like of shape (n_rows, n_cols)
        Feature matrix with at most ``pad_col`` columns.
    value : scalar
        Constant written into the extra last column of every row.
    pad_col : int
        Column count expected by ``scaler``; zero columns are appended on the
        right until ``df`` reaches this width.
    scaler : object
        Anything exposing ``transform(X)``; it is applied to the padded matrix
        before the flag column is added, so the flag itself is not scaled.

    Returns
    -------
    numpy.ndarray of shape (n_rows, pad_col + 1)

    Raises
    ------
    ValueError
        If ``df`` has more than ``pad_col`` columns.
    """
    features = np.asarray(df)
    n_rows, n_cols = features.shape
    if n_cols > pad_col:
        # Previously a negative pad width silently produced no padding and the
        # mismatch only surfaced (if at all) inside scaler.transform.
        raise ValueError(
            f'df has {n_cols} columns but pad_col is {pad_col}; cannot pad to a smaller width'
        )

    # NOTE(review): padding is purely positional -- the columns of `df` occupy
    # the first n_cols positions seen by the scaler. Confirm that this matches
    # the order of the features the scaler was fitted on.
    padding = np.zeros((n_rows, pad_col - n_cols), dtype=int)
    padded = scaler.transform(np.concatenate((features, padding), axis=1))

    flag_column = np.full((n_rows, 1), value)
    return np.concatenate((padded, flag_column), axis=1)

In [8]:
def _load_gathered(csv_path):
    """Read one gathered-flow CSV and run it through prep_data with flag value 1."""
    frame = pd.read_csv(csv_path)[['IN_PACKETS', 'IN_OCTETS', 'TCP_FLAGS', 'PROTO']]
    frame = frame.rename(
        columns={
            'IN_PACKETS': 'IN_PKTS',
            'IN_OCTETS': 'IN_BYTES',
            'TCP_FLAGS': 'TCP_FLAGS',
            'PROTO': 'PROTOCOL',
        }
    )
    # Pad to the width of the NF feature matrix so the shared scaler accepts it.
    return prep_data(frame, 1, test_nf_x.shape[1], scaler)


benign = _load_gathered('benign.csv')
ddos = _load_gathered('hping3-2.csv')



In [9]:
# Stack the prepared benign rows on top of the prepared ddos rows.
gathered = np.vstack((benign, ddos))

In [10]:
# Labels in the same row order as `gathered`: 0.0 for each benign row, then
# 1.0 for each ddos row.
gathered_y = np.repeat([0.0, 1.0], [benign.shape[0], ddos.shape[0]])

In [11]:
# NF rows get flag 0 in the extra last column; the gathered rows already carry
# the flag value that prep_data appended.
nf_train_flagged = mask(train_nf_x, 0)
data_train_x = np.vstack((nf_train_flagged, gathered))

# Labels follow the same order: NF labels first, then the gathered labels.
data_train_y = np.concatenate((np.asarray(train_nf_y), gathered_y))

In [12]:
# (rows, columns) of the combined training matrix; the last column is the
# flag added by mask / prep_data.
data_train_x.shape

(518640, 40)

In [13]:
# Label vector length; it should equal the row count of data_train_x.
data_train_y.shape

(518640,)

In [14]:
# Classifier trained on the NF rows plus the gathered rows (with the flag column).
catboost_params = dict(
    random_seed=42,
    depth=4,
    custom_metric=['AUC'],
    iterations=1000,
    learning_rate=0.2,
    subsample=0.95,
    bootstrap_type='Bernoulli',
)
clf = CatBoostClassifier(**catboost_params)
# NOTE(review): no eval_set is passed, so early_stopping_rounds presumably has
# nothing to monitor -- confirm against the CatBoost documentation.
clf.fit(data_train_x, data_train_y, verbose=False, early_stopping_rounds=100)

<catboost.core.CatBoostClassifier at 0x7f5f72acbb50>

In [15]:
# Accuracy on the NF test split; it needs the same flag column (0) that the NF
# training rows were given.
nf_test_flagged = mask(test_nf_x, 0)
accuracy_score(y_true=test_nf_y, y_pred=clf.predict(nf_test_flagged))

0.9966108898930697

In [16]:
# Load the held-out gathered capture: same four columns and same prep_data
# call (flag value 1) as the gathered training files.
test_benign = prep_data(
    pd.read_csv('gathered_newer.csv')
    .loc[:, ['IN_PACKETS', 'IN_OCTETS', 'TCP_FLAGS', 'PROTO']]
    .rename(columns={'IN_PACKETS': 'IN_PKTS', 'IN_OCTETS': 'IN_BYTES', 'TCP_FLAGS': 'TCP_FLAGS', 'PROTO': 'PROTOCOL'}),
    1,
    test_nf_x.shape[1],
    scaler,
)



In [17]:
# Number of test_benign rows predicted as class 0.
test_benign_is_zero = clf.predict(test_benign) == 0
test_benign_is_zero.sum()

1689

In [18]:
# Number of test_benign rows predicted as class 1.
test_benign_is_one = clf.predict(test_benign) == 1
test_benign_is_one.sum()

35

In [19]:
# Load the held-out hping3 capture: same four columns and same prep_data call
# (flag value 1) as the gathered training files.
test_ddos = prep_data(
    pd.read_csv('hping3-1.csv')
    .loc[:, ['IN_PACKETS', 'IN_OCTETS', 'TCP_FLAGS', 'PROTO']]
    .rename(columns={'IN_PACKETS': 'IN_PKTS', 'IN_OCTETS': 'IN_BYTES', 'TCP_FLAGS': 'TCP_FLAGS', 'PROTO': 'PROTOCOL'}),
    1,
    test_nf_x.shape[1],
    scaler,
)



In [20]:
# Number of test_ddos rows predicted as class 0.
test_ddos_is_zero = clf.predict(test_ddos) == 0
test_ddos_is_zero.sum()

109

In [21]:
# Number of test_ddos rows predicted as class 1.
test_ddos_is_one = clf.predict(test_ddos) == 1
test_ddos_is_one.sum()

833

In [22]:
# Baseline with identical hyper-parameters, trained on the NF features only
# (no flag column, no gathered rows).
# NOTE: this rebinds `clf`, replacing the model fitted on data_train_x.
clf = CatBoostClassifier(
    random_seed=42,
    depth=4,
    custom_metric=['AUC'],
    iterations=1000,
    learning_rate=0.2,
    subsample=0.95,
    bootstrap_type='Bernoulli',
)
# NOTE(review): no eval_set is passed, so early_stopping_rounds presumably has
# nothing to monitor -- confirm against the CatBoost documentation.
clf.fit(
    train_nf_x,
    train_nf_y,
    verbose=False,
    early_stopping_rounds=100,
)

<catboost.core.CatBoostClassifier at 0x7f5e700b85e0>

In [23]:
# Accuracy of the NF-only baseline on the unflagged NF test split.
baseline_pred = clf.predict(test_nf_x)
accuracy_score(y_true=test_nf_y, y_pred=baseline_pred)

0.9967003713480381