### Cleaning raw UNSW-NB15 PCAP Data

In [29]:
import pandas as pd
from read_unsw import *
pd.set_option('max_colwidth', 200)

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import CategoricalNB
from sklearn.ensemble import RandomForestClassifier

Feature description

In [2]:
# Load the published UNSW-NB15 feature-description table straight from the dataset host.
features_url = "https://www.unsw.adfa.edu.au/unsw-canberra-cyber/cybersecurity/ADFA-NB15-Datasets/NUSW-NB15_features.csv"
feature_desc = pd.read_csv(
    features_url,
    encoding='unicode_escape',  # decoding used for this file by the original notebook
    index_col=False,            # do not promote the first CSV column to the index
)

In [3]:
# Display the feature-description table.
feature_desc

Unnamed: 0,No.,Name,Type,Description
0,1,srcip,nominal,Source IP address
1,2,sport,integer,Source port number
2,3,dstip,nominal,Destination IP address
3,4,dsport,integer,Destination port number
4,5,proto,nominal,Transaction protocol
5,6,state,nominal,"Indicates to the state and its dependent protocol, e.g. ACC, CLO, CON, ECO, ECR, FIN, INT, MAS, PAR, REQ, RST, TST, TXD, URH, URN, and (-) (if not used state)"
6,7,dur,Float,Record total duration
7,8,sbytes,Integer,Source to destination transaction bytes
8,9,dbytes,Integer,Destination to source transaction bytes
9,10,sttl,Integer,Source to destination time to live value


Reading raw data

In [4]:
# Common URL prefix of the raw CSV parts; each part is named UNSW-NB15_<n>.csv.
base_url = 'https://www.unsw.adfa.edu.au/unsw-canberra-cyber/cybersecurity/ADFA-NB15-Datasets/UNSW-NB15_'

# Build the URLs for parts 1 through 4 (range upper bound is exclusive).
file_names = [f"{base_url}{part}.csv" for part in range(1, 5)]

In [5]:
# Run process_pcap on every CSV part and stack the results row-wise
# under a fresh 0..n-1 index.
unsw_data = pd.concat(
    [process_pcap(part_url) for part_url in file_names],
    axis=0,
    ignore_index=True,
)

  op = _Concatenator(
  op = _Concatenator(


In [7]:
# List the column names of the combined frame.
unsw_data.columns

Index(['srcip', 'sport', 'dstip', 'dsport', 'proto', 'state', 'dur', 'sbytes',
       'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'service', 'Sload', 'Dload',
       'Spkts', 'Dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb', 'smeansz',
       'dmeansz', 'trans_depth', 'res_bdy_len', 'Sjit', 'Djit', 'Stime',
       'Ltime', 'Sintpkt', 'Dintpkt', 'tcprtt', 'synack', 'ackdat',
       'is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd', 'is_ftp_login',
       'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'attack_cat',
       'Label'],
      dtype='object')

Based on feedback from the SME about which features will be easily accessible in the new data set, we will use the following features to train a binary classifier (attack/safe):

* srcip: source IP
* sport: source port #
* dstip: destination IP
* dsport: destination port #
* proto: transaction protocol
* state: state and dependent protocol ("-": none used)
* service: service used (http, ftp, etc.)

Can also use:

* sbytes: source to destination transaction bytes
* dbytes: destination to source transaction bytes

### Setup Data

In [46]:
# Keep only the SME-approved features plus the target column.
# Rows with missing values are dropped *before* the cast to str: astype(str)
# turns missing values into literal strings (e.g. "nan"), after which
# dropna() can no longer detect them.
data_1 = (
    unsw_data[['srcip', 'sport', 'dstip', 'dsport', 'proto', 'state', 'service', 'Label']]
    .dropna()
    .astype(str)
)

Drop observations with null values

In [48]:
# Remove every row that contains a missing value; rebinding the name
# avoids in-place mutation of the frame.
data_1 = data_1.dropna()

Encode every column (features and label) as integer category codes

In [49]:
# Single encoder instance; it is re-fitted on each column when applied to the frame.
le = LabelEncoder()

In [50]:
# Replace each column's values with integer codes. The shared encoder is
# re-fitted per column, so afterwards it only retains the classes of the
# last column processed.
data_1 = data_1.apply(lambda column: le.fit_transform(column))

In [51]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    data_1.drop(columns='Label'),  # predictors: everything except the target
    data_1['Label'],               # target
    test_size=0.30,
    random_state=35,
)

### Naive Bayes

In [54]:
# Categorical Naive Bayes classifier with default hyperparameters.
cat_nb = CategoricalNB()

In [55]:
# Fit on the training split, then predict labels for the held-out rows.
cat_nb.fit(x_train, y_train)
class_pred = cat_nb.predict(x_test)

In [57]:
# classification_report expects (y_true, y_pred). Passing the predictions
# first swaps precision with recall and reports support for the predicted
# labels instead of the true ones.
print(classification_report(y_test, class_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4284

    accuracy                           1.00      4284
   macro avg       1.00      1.00      1.00      4284
weighted avg       1.00      1.00      1.00      4284



### Random Forest

In [60]:
# 1000-tree forest considering sqrt(n_features) candidates per split.
# random_state is fixed (same seed as the train/test split) so that the
# bootstrap samples and feature draws are reproducible across runs.
rf = RandomForestClassifier(n_estimators=1000, max_features='sqrt', random_state=35)

In [61]:
# Fit the forest on the training split, then predict labels for the held-out rows.
rf.fit(x_train, y_train)
pred = rf.predict(x_test)

In [62]:
# classification_report expects (y_true, y_pred). Passing the predictions
# first swaps precision with recall and reports support for the predicted
# labels instead of the true ones.
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4284

    accuracy                           1.00      4284
   macro avg       1.00      1.00      1.00      4284
weighted avg       1.00      1.00      1.00      4284

