In [18]:
from pandas import read_parquet, get_dummies, DataFrame, Series
from numpy import int8
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import numpy as np
import os
from os.path import join, exists, isfile
import pickle as pl

In [19]:
# Read both UNSW-NB15 splits from the local ./dataset folder.
train, test = (
    read_parquet(parquet_path)
    for parquet_path in (
        './dataset/UNSW_NB15_training-set.parquet',
        './dataset/UNSW_NB15_testing-set.parquet',
    )
)

# Pickle file in the current working directory; a later cell loads the fitted
# RFECV selector from it or writes one to it.
rfecv_file = join(os.getcwd(), 'rfecv.pickle')
# head=test.head(10)
# head.to_excel('test_head.xlsx')
# rfecv_file

In [20]:
# train.info()

# Check value counts in the Service column especially for '-' #

In [None]:
# train['service'].value_counts()
# train.shape

# List all columns with '-'

In [22]:
# for col in train.columns:
#     for row in train[col]:
#         if(row == '-'):
#             print(col)
#             break

# Feature selection

In [23]:
# One-hot encode `service` for the training frame. `dur` and `label` are dropped
# from the features; `attack_cat` stays in train_dummy because it is the target.
train_dummy = get_dummies(
    train,
    prefix='service',
    prefix_sep='_',
    columns=['service'],
    drop_first=True,
    dtype=int8,
).drop(['dur', 'label'], axis=1)

# The test frame is encoded WITHOUT drop_first and then aligned to the training
# feature columns. Encoding the two frames independently would yield different
# columns whenever a service value is missing from one split (and drop_first
# could then drop a different baseline column). Reindexing keeps exactly the
# training features: columns unknown to train are discarded, columns absent
# from test are added as all zeros.
# NOTE: this overwrites `test`, so the cell is meant to run once per kernel.
test = get_dummies(
    test,
    prefix='service',
    prefix_sep='_',
    columns=['service'],
    dtype=int8,
).drop(['dur', 'label', 'attack_cat'], axis=1)
test = test.reindex(columns=train_dummy.columns.drop('attack_cat'), fill_value=0)
# train_dummy

In [24]:
# Raw (un-encoded) target 'attack_cat'.
# Take an explicit copy: the next cell overwrites train_dummy['attack_cat'] in
# place with integer codes, and the raw labels are needed later to translate
# predictions back. Copying makes that independent of pandas view/copy rules.
unencoded_y = train_dummy['attack_cat'].copy()

Encode category columns

In [25]:
# One LabelEncoder instance is re-fitted on each column in turn, so every column
# of train_dummy (numeric ones included) is replaced by integer codes.
# NOTE(review): because the encoder is re-fitted per column, after the loop
# `label_encoder` only remembers the mapping of the last column it processed.
label_encoder = LabelEncoder()
for column_name in list(train_dummy.columns):
    encoded_column = label_encoder.fit_transform(train_dummy[column_name])
    train_dummy[column_name] = encoded_column

# Integer-coded version of the target column.
encoded_y = train_dummy['attack_cat']
# train_dummy

Use 50% of the data (the classifier seems to be taking more space than expected)

In [26]:
# Draw a reproducible random half of the rows as features, then pick the
# matching target values by index label.
sample_size = int(0.5 * len(train_dummy))
feature_frame = train_dummy.drop('attack_cat', axis=1)
X = feature_frame.sample(n=sample_size, random_state=0)
y = train_dummy['attack_cat'].loc[X.index]

# X_size = int(0.03 * len(X))
# X = X.sample(n=int(0.03 * len(X)), random_state=42)
# y = y[X.index]

In [27]:
# Hold out 20% of the sampled rows; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [28]:
# Baseline random forest fitted on every feature of the training split.
# It is fitted unconditionally: the feature-importance listing further down
# reads rf_fes.feature_importances_ whether or not the RFECV pickle exists, so
# skipping the fit when the cache is present left `rf_fes` undefined on a fresh
# kernel (NameError under Restart & Run All).
rf_fes = RandomForestClassifier(random_state=0).fit(X_train, y_train)

In [29]:
# rf_fes.feature_importances_
# X_train.columns

In [34]:
# Pair every training column with the importance rf_fes assigned to it.
[(feature_name, importance) for feature_name, importance in zip(X_train.columns, rf_fes.feature_importances_)]

[('proto', np.float64(0.03538646568956067)),
 ('state', np.float64(0.009841244819194359)),
 ('spkts', np.float64(0.014335661496643289)),
 ('dpkts', np.float64(0.024797685243152484)),
 ('sbytes', np.float64(0.10630599030986654)),
 ('dbytes', np.float64(0.03815481830352469)),
 ('rate', np.float64(0.05392286653394353)),
 ('sload', np.float64(0.054996046636971406)),
 ('dload', np.float64(0.03754363414485831)),
 ('sloss', np.float64(0.013244220997911357)),
 ('dloss', np.float64(0.012288048777488748)),
 ('sinpkt', np.float64(0.029591223426361658)),
 ('dinpkt', np.float64(0.031038709963212854)),
 ('sjit', np.float64(0.02868280496392378)),
 ('djit', np.float64(0.016414128991125262)),
 ('swin', np.float64(0.002502770111004023)),
 ('stcpb', np.float64(0.010878994086082065)),
 ('dtcpb', np.float64(0.011379004395401213)),
 ('dwin', np.float64(0.0003174331664460961)),
 ('tcprtt', np.float64(0.041652333710589094)),
 ('synack', np.float64(0.03686961785807361)),
 ('ackdat', np.float64(0.03772083447159

In [31]:
# figure(figsize=(6,12))
# barh(X_train.columns,rf_fes.feature_importances_)
# show()
# rfecv_file

In [None]:

# Load the fitted RFECV selector from the pickle cache when a usable one exists;
# otherwise run the cross-validated elimination and cache the result.
rfe = None
if isfile(rfecv_file):  # isfile() is already False for a missing path
    # SECURITY: unpickling executes arbitrary code stored in the file. Only load
    # a cache that this notebook wrote itself; never a file from elsewhere.
    with open(rfecv_file, 'rb') as rfe_cv:
        rfe = pl.load(rfe_cv)
    # A selector fitted on a different number of features would silently
    # mislabel columns when its support mask is paired with X_train.columns,
    # so treat such a cache as stale and refit.
    if len(rfe.get_support()) != X_train.shape[1]:
        rfe = None

if rfe is None:
    # RFECV clones the estimator it is given before fitting, so an unfitted
    # forest configured like rf_fes is sufficient and this cell no longer
    # depends on an earlier cell having created rf_fes.
    rfe = RFECV(RandomForestClassifier(random_state=0), cv=5, scoring='accuracy')
    rfe.fit(X_train, y_train)
    with open(rfecv_file, 'wb') as rfe_cv:
        pl.dump(rfe, rfe_cv)

In [None]:
# Write the current `rfe` object to the pickle file, replacing its contents.
with open(rfecv_file, 'wb') as pickle_out:
    pl.dump(rfe, pickle_out)

In [16]:
# rfe=RFECV(rf_fes,cv=5,scoring='accuracy')
# rfe.fit(X_train, y_train)

In [17]:
# rfe.get_support()

In [18]:
# feature_selected=[]
# for col_name, col_vec in list(zip(X_train.columns, rfe.get_support())):
#     if(col_vec == True):
#         feature_selected.append(col_name)

# Boolean mask over X_train's columns: True where RFECV kept the feature.
support_mask = rfe.get_support()

# Pairing the mask with the columns via zip() would silently truncate or
# misalign if the selector was fitted on a different feature set (e.g. a stale
# pickle), so fail loudly instead.
if len(support_mask) != X_train.shape[1]:
    raise ValueError(
        f"RFECV selector covers {len(support_mask)} features but X_train has "
        f"{X_train.shape[1]}; delete the cached selector and refit."
    )

# Names of the kept features, in X_train column order.
feature_selected = X_train.columns[support_mask].tolist()
new_X = X_train[feature_selected]

# Actual prediction

In [None]:
# Final classifier, trained only on the columns kept in new_X.
# fit() returns the estimator itself, so construction and fitting are chained;
# the bare name on the last line keeps the estimator as the cell's output.
rf_actual = RandomForestClassifier(random_state=0).fit(new_X, y_train)
rf_actual

Prediction using the test dataset

In [20]:
# test.head()
# Encode the test features with the SAME value -> code mapping that the training
# features received. Calling fit_transform on the test columns would derive the
# codes from the test values alone, so an identical raw value would get a
# different code in train and in test and the model would see inconsistent
# inputs.

# `train_dummy` now holds encoded values, but `train` itself was not modified,
# so the raw training features can be rebuilt from it.
train_raw_features = get_dummies(
    train, prefix='service', prefix_sep='_', columns=['service'], drop_first=True, dtype=int8
)

# Keep only the selected features; a dummy column that does not exist in the
# test frame is added as all zeros instead of raising KeyError.
test_selected = test.reindex(columns=feature_selected, fill_value=0)

encoded_test_columns = {}
for test_col in feature_selected:
    # classes_ holds the sorted unique training values; a value's position in
    # that array is exactly the code LabelEncoder assigned to it during training.
    train_classes = LabelEncoder().fit(train_raw_features[test_col]).classes_
    # searchsorted returns that position for values seen in training. A value
    # never seen in training gets the position it would occupy among them,
    # which keeps numeric ordering intact (for an unseen string category this
    # is only a fallback).
    encoded_test_columns[test_col] = np.searchsorted(
        train_classes, test_selected[test_col].to_numpy()
    )

# NOTE: `test` is overwritten with the encoded frame (the prediction cell reads
# it), so this cell is meant to run once per kernel session.
test = DataFrame(encoded_test_columns, index=test_selected.index, columns=feature_selected)

In [None]:
# Predict an encoded attack category for every row of the prepared test frame,
# then display how many predictions were produced.
y_pred = rf_actual.predict(test)
y_pred.shape[0]

# Map each encoded value of the target 'attack_cat' to its category name

In [22]:
# Pair each raw label with its integer code, row by row.
label_pairs = list(zip(unencoded_y, encoded_y))
map_y = Series(label_pairs, name='mapped_y')

# Distinct (category, code) pairs in sorted order; the counts are not needed.
pair_counts = map_y.value_counts().sort_index()
mapped_y = list(pair_counts.index)

# Translate y_pred into attack categories

In [None]:
# Lookup table: encoded label -> attack category name. setdefault keeps the
# first pair seen for a code, matching a first-match scan over mapped_y.
category_by_code = {}
for cat, enc in mapped_y:
    category_by_code.setdefault(enc, cat)

# One dictionary lookup per prediction instead of scanning mapped_y for each
# one. A code with no known category is skipped rather than raising.
y_pred_attack_cat = [category_by_code[code] for code in y_pred if code in category_by_code]

Y_pred_att_cat = Series(y_pred_attack_cat, name='y_pred_attack_category')

# Count the predictions per category once (ascending) and reuse the result for
# the plot inputs and for the displayed list.
predicted_counts = Y_pred_att_cat.value_counts().sort_values()
X_plt = list(predicted_counts.index)  # category names
y_plt = list(predicted_counts)        # number of predictions per category

list(predicted_counts.items())

In [None]:
# Plot how many test rows were predicted for each attack category.
# The explicit fig/ax interface is used, and the figure gets a title and axis
# labels so it can be read without the surrounding cells.
fig, ax = plt.subplots(figsize=(9, 7))

ax.plot(X_plt, y_plt, marker='d', ms=5)
ax.set_xlabel('Predicted attack category')
ax.set_ylabel('Number of test records')
ax.set_title('Predicted attack categories on the test set')
fig.tight_layout()
plt.show()