# Technical Exploraton 4
Build proposed architecture in [Network Intrusion Detection System using Deep Learning](https://www.sciencedirect.com/science/article/pii/S1877050921011078)

In [None]:
# Setup
%matplotlib inline

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import  GridSearchCV

import tensorflow as tf
from keras import datasets, layers, models

## Import and explore data
We're going to be using the [UNSW-NB15 dataset](https://research.unsw.edu.au/projects/unsw-nb15-dataset). These papers elaborate its creation
- Moustafa, Nour, and Jill Slay. "UNSW-NB15: a comprehensive data set for network intrusion detection
systems (UNSW-NB15 network data set)."Military Communications and Information Systems Conference
(MilCIS), 2015. IEEE, 2015.
- Moustafa, Nour, and Jill Slay. "The evaluation of Network Anomaly Detection Systems: Statistical analysis
of the UNSW-NB15 data set and the comparison with the KDD99 data set." Information Security Journal:
A Global Perspective (2016): 1-14.

In [77]:
root = "./UNSW-NB15"

def read_UNSW_NB15(filename):
    return pd.read_csv(
        os.path.join(root, filename)
    )

Raw packet dumps. We're not going to use this for much, but it's useful to have around

In [78]:
nb15 = []

for x in range(1, 5):
    nb15.append(
        read_UNSW_NB15(f"UNSW-NB15_{x}.csv")
    )

print(nb15)

  return pd.read_csv(
  return pd.read_csv(


[        59.166.0.0   1390  149.171.126.6     53  udp  CON  0.001055    132  \
0       59.166.0.0  33661  149.171.126.9   1024  udp  CON  0.036133    528   
1       59.166.0.6   1464  149.171.126.7     53  udp  CON  0.001119    146   
2       59.166.0.5   3593  149.171.126.5     53  udp  CON  0.001209    132   
3       59.166.0.3  49664  149.171.126.0     53  udp  CON  0.001169    146   
4       59.166.0.0  32119  149.171.126.9    111  udp  CON  0.078339    568   
...            ...    ...            ...    ...  ...  ...       ...    ...   
699995  59.166.0.8  12520  149.171.126.6  31010  tcp  FIN  0.020383    320   
699996  59.166.0.0  18895  149.171.126.9     80  tcp  FIN  1.402957  19410   
699997  59.166.0.0  30103  149.171.126.5   5190  tcp  FIN  0.007108   2158   
699998  59.166.0.6  30388  149.171.126.5    111  udp  CON  0.004435    568   
699999  59.166.0.0   6055  149.171.126.5  54145  tcp  FIN  0.072974   4238   

            164  31  ...  0.17   3   7  1  3.1  1.1  1.2  1.3 

Metadata files

In [79]:
list_events = read_UNSW_NB15("UNSW-NB15_LIST_EVENTS.csv")
features = pd.read_csv(os.path.join(root, "NUSW-NB15_features.csv"), encoding='cp1252')

Finally, our training sets

In [80]:
train = read_UNSW_NB15("UNSW_NB15_training-set.csv")
test = read_UNSW_NB15("UNSW_NB15_testing-set.csv")

Exploration based on [UNSW_NB15 notebook on kaggle](https://www.kaggle.com/code/melekbadreddine/unsw-nb15)

In [81]:
train.head()

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1,1.1e-05,udp,-,INT,2,0,496,0,90909.0902,...,1,2,0,0,0,1,2,0,Normal,0
1,2,8e-06,udp,-,INT,2,0,1762,0,125000.0003,...,1,2,0,0,0,1,2,0,Normal,0
2,3,5e-06,udp,-,INT,2,0,1068,0,200000.0051,...,1,3,0,0,0,1,3,0,Normal,0
3,4,6e-06,udp,-,INT,2,0,900,0,166666.6608,...,1,3,0,0,0,2,3,0,Normal,0
4,5,1e-05,udp,-,INT,2,0,2126,0,100000.0025,...,1,3,0,0,0,2,3,0,Normal,0


In [82]:
test.head()

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1,0.121478,tcp,-,FIN,6,4,258,172,74.08749,...,1,1,0,0,0,1,1,0,Normal,0
1,2,0.649902,tcp,-,FIN,14,38,734,42014,78.473372,...,1,2,0,0,0,1,6,0,Normal,0
2,3,1.623129,tcp,-,FIN,8,16,364,13186,14.170161,...,1,3,0,0,0,2,6,0,Normal,0
3,4,1.681642,tcp,ftp,FIN,12,12,628,770,13.677108,...,1,3,1,1,0,2,1,0,Normal,0
4,5,0.449454,tcp,-,FIN,10,6,534,268,33.373826,...,1,40,0,0,0,2,39,0,Normal,0


In [83]:
nb15[0].head()

Unnamed: 0,59.166.0.0,1390,149.171.126.6,53,udp,CON,0.001055,132,164,31,...,0.17,3,7,1,3.1,1.1,1.2,1.3,Unnamed: 47,0.18
0,59.166.0.0,33661,149.171.126.9,1024,udp,CON,0.036133,528,304,31,...,0,2,4,2,3,1,1,2,,0
1,59.166.0.6,1464,149.171.126.7,53,udp,CON,0.001119,146,178,31,...,0,12,8,1,2,2,1,1,,0
2,59.166.0.5,3593,149.171.126.5,53,udp,CON,0.001209,132,164,31,...,0,6,9,1,1,1,1,1,,0
3,59.166.0.3,49664,149.171.126.0,53,udp,CON,0.001169,146,178,31,...,0,7,9,1,1,1,1,1,,0
4,59.166.0.0,32119,149.171.126.9,111,udp,CON,0.078339,568,312,31,...,0,2,4,2,3,1,1,2,,0


In [84]:
nb15[1].head()

Unnamed: 0,59.166.0.0,6055,149.171.126.5,54145,tcp,FIN,0.072974,4238,60788,31,...,0.6,13,13.1,6,7.1,1,1.1,2,Unnamed: 47,0.7
0,59.166.0.0,7832,149.171.126.3,5607,tcp,FIN,0.144951,5174,91072,31,...,0,13,13,6,7,1,1,2,,0
1,59.166.0.8,11397,149.171.126.6,21,tcp,FIN,0.116107,2934,3742,31,...,1,1,2,7,5,1,1,4,,0
2,59.166.0.0,3804,149.171.126.3,53,udp,CON,0.000986,146,178,31,...,0,13,13,6,7,1,1,2,,0
3,59.166.0.8,14339,149.171.126.6,14724,tcp,FIN,0.03848,8928,320,31,...,0,8,20,7,5,1,1,4,,0
4,59.166.0.8,39094,149.171.126.3,53,udp,CON,0.001026,130,162,31,...,0,8,13,6,5,1,1,1,,0


In [85]:
nb15[2].head()

Unnamed: 0,59.166.0.1,18247,149.171.126.4,7662,tcp,FIN,0.119596,4550,68342,31,...,Unnamed: 12,6,2,2.1,5,1,1.1,2.2,Unnamed: 47,0.4
0,59.166.0.3,54771,149.171.126.2,27709,tcp,FIN,0.650574,8928,320,31,...,,3,5,2,4,1,1,4,,0
1,59.166.0.8,13289,149.171.126.9,5190,tcp,FIN,0.00798,2158,2464,31,...,,3,5,1,1,1,1,3,,0
2,149.171.126.18,1043,175.45.176.3,53,udp,INT,5e-06,264,0,60,...,,19,19,19,19,19,19,19,,0
3,149.171.126.18,1043,175.45.176.3,53,udp,INT,5e-06,264,0,60,...,,19,19,19,19,19,19,19,,0
4,59.166.0.3,10275,149.171.126.0,25,tcp,FIN,0.486578,37462,3380,31,...,,3,2,3,4,2,1,2,,0


In [86]:
nb15[3].head()

Unnamed: 0,59.166.0.9,7045,149.171.126.7,25,tcp,FIN,0.201886,37552,3380,31,...,Unnamed: 12,2,2.1,7,4,1,1.1,3,Unnamed: 47,0.4
0,59.166.0.9,9685,149.171.126.2,80,tcp,FIN,5.864748,19410,1087890,31,...,,3,1,4,4,1,1,1,,0
1,59.166.0.2,1421,149.171.126.4,53,udp,CON,0.001391,146,178,31,...,,3,5,2,7,1,1,4,,0
2,59.166.0.2,21553,149.171.126.2,25,tcp,FIN,0.053948,37812,3380,31,...,,1,1,4,7,1,1,3,,0
3,59.166.0.8,45212,149.171.126.4,53,udp,CON,0.000953,146,178,31,...,,2,5,2,1,1,1,2,,0
4,59.166.0.0,59922,149.171.126.8,6881,tcp,FIN,8.633186,25056,1094788,31,...,,9,7,2,3,2,1,6,,0


In [87]:
list_events.head()

Unnamed: 0,Attack category,Attack subcategory,Number of events
0,normal,,2218761
1,Fuzzers,FTP,558
2,Fuzzers,HTTP,1497
3,Fuzzers,RIP,3550
4,Fuzzers,SMB,5245


In [88]:
features

Unnamed: 0,No.,Name,Type,Description
0,1,srcip,nominal,source IP address
1,2,sport,integer,source port number
2,3,dstip,nominal,Destination IP address
3,4,dsport,integer,Destination port number
4,5,proto,nominal,Transaction protocol
5,6,state,nominal,Indicates to the state and its dependent proto...
6,7,dur,Float,Record total duration
7,8,sbytes,Integer,source to destination transaction bytes
8,9,dbytes,Integer,Destination to source transaction bytes
9,10,sttl,Integer,source to destination time to live value


In [89]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82332 entries, 0 to 82331
Data columns (total 45 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 82332 non-null  int64  
 1   dur                82332 non-null  float64
 2   proto              82332 non-null  object 
 3   service            82332 non-null  object 
 4   state              82332 non-null  object 
 5   spkts              82332 non-null  int64  
 6   dpkts              82332 non-null  int64  
 7   sbytes             82332 non-null  int64  
 8   dbytes             82332 non-null  int64  
 9   rate               82332 non-null  float64
 10  sttl               82332 non-null  int64  
 11  dttl               82332 non-null  int64  
 12  sload              82332 non-null  float64
 13  dload              82332 non-null  float64
 14  sloss              82332 non-null  int64  
 15  dloss              82332 non-null  int64  
 16  sinpkt             823

In [90]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82332 entries, 0 to 82331
Data columns (total 45 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 82332 non-null  int64  
 1   dur                82332 non-null  float64
 2   proto              82332 non-null  object 
 3   service            82332 non-null  object 
 4   state              82332 non-null  object 
 5   spkts              82332 non-null  int64  
 6   dpkts              82332 non-null  int64  
 7   sbytes             82332 non-null  int64  
 8   dbytes             82332 non-null  int64  
 9   rate               82332 non-null  float64
 10  sttl               82332 non-null  int64  
 11  dttl               82332 non-null  int64  
 12  sload              82332 non-null  float64
 13  dload              82332 non-null  float64
 14  sloss              82332 non-null  int64  
 15  dloss              82332 non-null  int64  
 16  sinpkt             823

In [91]:
train.describe()

Unnamed: 0,id,dur,spkts,dpkts,sbytes,dbytes,rate,sttl,dttl,sload,...,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,label
count,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,...,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0,82332.0
mean,41166.5,1.006756,18.666472,17.545936,7993.908,13233.79,82410.89,180.967667,95.713003,64549020.0,...,4.928898,3.663011,7.45636,0.008284,0.008381,0.129743,6.46836,9.164262,0.011126,0.5506
std,23767.345519,4.710444,133.916353,115.574086,171642.3,151471.5,148620.4,101.513358,116.667722,179861800.0,...,8.389545,5.915386,11.415191,0.091171,0.092485,0.638683,8.543927,11.121413,0.104891,0.497436
min,1.0,0.0,1.0,0.0,24.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
25%,20583.75,8e-06,2.0,0.0,114.0,0.0,28.60611,62.0,0.0,11202.47,...,1.0,1.0,1.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0
50%,41166.5,0.014138,6.0,2.0,534.0,178.0,2650.177,254.0,29.0,577003.2,...,1.0,1.0,3.0,0.0,0.0,0.0,3.0,5.0,0.0,1.0
75%,61749.25,0.71936,12.0,10.0,1280.0,956.0,111111.1,254.0,252.0,65142860.0,...,4.0,3.0,6.0,0.0,0.0,0.0,7.0,11.0,0.0,1.0
max,82332.0,59.999989,10646.0,11018.0,14355770.0,14657530.0,1000000.0,255.0,253.0,5268000000.0,...,59.0,38.0,63.0,2.0,2.0,16.0,60.0,62.0,1.0,1.0


We're going to use attack_cat as our target class

In [92]:
print(train["attack_cat"].value_counts(normalize=True))
categories = list(train["attack_cat"].unique())

attack_cat
Normal            0.449400
Generic           0.229206
Exploits          0.135209
Fuzzers           0.073629
DoS               0.049665
Reconnaissance    0.042462
Analysis          0.008223
Backdoor          0.007081
Shellcode         0.004591
Worms             0.000534
Name: proportion, dtype: float64


9 of these categories are attacks, while one represents normal traffic.
Looks like normal traffic is only 31% of this dataset, followed by generic attacks.

In [93]:
X_train = train.drop(columns=["attack_cat", "label"])
y_train = train["attack_cat"].copy()

X_test = test.drop(columns=["attack_cat", "label"])
y_test = test["attack_cat"].copy()

# Create Pipeline

Get attributes. See `features` for explainations of each attribute.

Note that `features` has some mislabeled and missing fields. I spent quite a lot of time debugging.

In [112]:
feature_names = set(X_train)
for name in feature_names:
    print(name)

spkts
is_ftp_login
dbytes
dur
dttl
sbytes
synack
swin
sjit
ct_src_ltm
stcpb
dwin
service
dpkts
ct_ftp_cmd
djit
response_body_len
ackdat
trans_depth
dinpkt
sload
dmean
dtcpb
rate
sinpkt
dloss
ct_dst_src_ltm
dload
smean
ct_state_ttl
ct_dst_sport_ltm
ct_dst_ltm
is_sm_ips_ports
sttl
ct_src_dport_ltm
ct_flw_http_mthd
sloss
ct_srv_dst
state
proto
tcprtt
id
ct_srv_src


In [113]:
cat_attribs = [
    "is_ftp_login",
    "proto",
    "state",
    "service",
    "swin",
    "dwin",
    "id",
    "proto"
]
print(cat_attribs)

['is_ftp_login', 'proto', 'state', 'service', 'swin', 'dwin', 'id', 'proto']


All other features are numeric

In [114]:
num_attribs = list(feature_names.difference(set(cat_attribs)))

print(num_attribs)

['spkts', 'dbytes', 'dur', 'dttl', 'sbytes', 'synack', 'sjit', 'ct_src_ltm', 'stcpb', 'dpkts', 'ct_ftp_cmd', 'djit', 'response_body_len', 'ackdat', 'trans_depth', 'dinpkt', 'sload', 'dmean', 'dtcpb', 'rate', 'sinpkt', 'dloss', 'ct_dst_src_ltm', 'dload', 'smean', 'ct_state_ttl', 'ct_dst_sport_ltm', 'ct_dst_ltm', 'is_sm_ips_ports', 'sttl', 'ct_src_dport_ltm', 'ct_flw_http_mthd', 'sloss', 'ct_srv_dst', 'tcprtt', 'ct_srv_src']


Final double-check to make sure we got everything

In [115]:
features_gotten = set(num_attribs).union(set(cat_attribs))
features_gotten == feature_names

True

In [116]:

num_pipeline = Pipeline([
    ('scaler', StandardScaler()),
])

full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("obj", OneHotEncoder(), cat_attribs),
])

X_train_prepared = full_pipeline.fit_transform(X_train)

# Create DecisionTree

In [None]:
from sklearn.tree import DecisionTreeClassifier

max_leaf_nodes_args = list(range(50,101,5))
# See if unrestricted leaf nodes works better
max_leaf_nodes_args.append(None)

param_grid = [{
    "criterion": ['gini', 'entropy', 'log_loss'],
    "max_depth": list(range(1,60,10)),
    "min_samples_split": list(range(2,60,10)),
    "min_samples_leaf": list(range(1,60,10)),
    "max_leaf_nodes": max_leaf_nodes_args
}]

classifier = DecisionTreeClassifier(
    random_state=random_state,
)

grid_search_logistic_regression = GridSearchCV(classifier, param_grid, cv=5,
                           scoring='accuracy', n_jobs=-1)

grid_search_logistic_regression.fit(X_train_prepared, y_train)

# Create CNN

In [122]:
model = models.Sequential([
    layers.Input(shape=(43,)), # TODO: Find shape
    layers.Conv1D(32, 6),
    layers.Conv1D(32, 6),
    layers.MaxPooling1D(pool_size=2),
    layers.Dropout(0.2),
    layers.Conv1D(32, 6),
    layers.Conv1D(32, 6),
    layers.MaxPooling1D(pool_size=2),
    layers.Dropout(0.2),
    layers.Conv1D(32, 6),
    layers.Conv1D(32, 6),
    layers.MaxPooling1D(pool_size=2),
    layers.Dropout(0.4),
    layers.Conv1D(32, 6),
    layers.Dense(22),
    layers.MaxPooling1D(pool_size=2),
    layers.Dense(10, activation="softmax"), # output layer
])

I0000 00:00:1733252881.592804 3073973 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1733252883.610602 3073973 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1733252883.611343 3073973 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1733252883.633455 3073973 cuda_executor.cc:1015] successful NUMA node read from SysFS ha

ValueError: Input 0 of layer "conv1d_1" is incompatible with the layer: expected min_ndim=3, found ndim=2. Full shape received: (None, 43)

In [None]:
model.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-3),
              loss=keras.losses.CategoricalCrossentropy(),
              metrics=['accuracy'])