In [106]:
from pandas import read_parquet, get_dummies, DataFrame, Series
from numpy import int8
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score, confusion_matrix, f1_score, precision_recall_curve, precision_score, recall_score, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
from os.path import join, exists, isfile
import pickle as pl


In [74]:
# Load the UNSW-NB15 training and testing partitions from the local dataset folder.
train = read_parquet('./dataset/UNSW_NB15_training-set.parquet')
test = read_parquet('./dataset/UNSW_NB15_testing-set.parquet')

# Cache files (resolved against the current working directory) for the fitted
# RFECV selector, the trained classifier and the decoded predictions.
working_dir = os.getcwd()
rfecv_file = join(working_dir, 'pickles/rfecv.pickle')
intrusion_model = join(working_dir, 'pickles/intrusion_model.pickle')
attack_map = join(working_dir, 'pickles/attack_map.pickle')

# Scratch code kept for the commented-out heatmap / export cells:
# train_heat=DataFrame({})
# head=test.head(10)
# head.to_excel('test_head.xlsx')

In [75]:
# train.isnull().sum()
# train.info()

In [76]:
# train.info()
# plt.figure(figsize=(10, 6))
# sns.boxplot(data=train, x='attack_cat')
# plt.tight_layout()
# plt.show()

In [77]:
# train.columns
# train.info()
# boxsel=[col for col in train.columns if train[col].dtype=='category']
# train[boxsel]

In [78]:
# label_encoder=LabelEncoder()
# for cat_col in train.columns:
#     # print(cat_col)
#     train_heat[cat_col]=label_encoder.fit_transform(train[cat_col])

In [79]:
# plt.figure(figsize=(20,18))
# plt.rcParams.update({'font.size': 10})
# sns.heatmap(train_heat.corr(), cmap = 'coolwarm', vmin = -1, vmax = 1, center = 0, annot=True, fmt=".2f", square=True, linewidths=.5)
# plt.title('Network Intrusion Detection Heatmap')
# plt.grid(True)
# plt.tight_layout()
# plt.savefig('heatmap.png')
# plt.show()
# Check value counts in the Service column especially for '-' #

In [80]:
# train['service'].value_counts()
# train.shape

# List all columns with '-'

In [81]:
# for col in train.columns:
#     for row in train[col]:
#         if(row == '-'):
#             print(col)
#             break

# Feature selection

In [82]:
# One-hot encode `service` with identical options for both frames, then drop the
# columns that are not model inputs. train_dummy keeps `attack_cat` (the target);
# the test frame drops it.
# NOTE: `test` is rebound here, so re-running this cell without reloading the
# data fails because the `service` column is already gone.
dummy_options = dict(prefix='service', prefix_sep='_', columns=['service'], drop_first=True, dtype=int8)

train_dummy = get_dummies(train, **dummy_options).drop(columns=['dur', 'label'])
test = get_dummies(test, **dummy_options).drop(columns=['dur', 'label', 'attack_cat'])

In [83]:
# Original (un-encoded) 'attack_cat' target, kept so that integer predictions can
# be mapped back to category names later.
# It is read from the untouched `train` frame and copied: train_dummy['attack_cat']
# is overwritten with integer codes by the label-encoding cell, so reading it from
# there would capture encoded values whenever this cell is re-run afterwards.
unencoded_y = train['attack_cat'].copy()

Label-encode all columns (categorical and numeric)

In [84]:
# Replace every column of train_dummy (features and target alike) with integer codes.
# One encoder instance is re-fitted per column, so once the loop ends it only
# holds the classes of the last column processed.
label_encoder = LabelEncoder()
for column_name in train_dummy.columns:
    encoded_column = label_encoder.fit_transform(train_dummy[column_name])
    train_dummy[column_name] = encoded_column

# Integer-coded target column
encoded_y = train_dummy['attack_cat']

Use a 50% sample of the data (the classifier seems to take more space than expected)

In [85]:
# Draw a reproducible 50% sample of the rows, then pick the targets for those rows.
sample_size = int(0.5 * len(train_dummy))
features = train_dummy.drop(columns='attack_cat')
X = features.sample(n=sample_size, random_state=0)
y = train_dummy.loc[X.index, 'attack_cat']

# Smaller 3% alternative, kept for reference:
# X = X.sample(n=int(0.03 * len(X)), random_state=42)
# y = y[X.index]

In [86]:
# Hold out 20% of the sampled rows for evaluation; the seed is fixed so the split
# is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [87]:
# Fit the base random forest only when no cached RFECV selector can be loaded.
# The guard uses isfile() so it agrees with the cell that loads the cache: that
# cell fits RFECV (and therefore needs rf_fes) whenever the path is not a regular
# file. With a plain exists() test, a directory at that path would skip this cell
# and leave rf_fes undefined.
if not isfile(rfecv_file):
    rf_fes = RandomForestClassifier(random_state=0)
    rf_fes.fit(X_train, y_train)

In [88]:
# rf_fes=RandomForestClassifier(random_state=0)
# rf_fes.fit(X_train, y_train)

In [89]:
# rf_fes.feature_importances_
# X_train.columns

In [90]:
# list(zip(X_train.columns,rf_fes.feature_importances_))

In [91]:
# plt.figure(figsize=(6,12))
# plt.barh(X_train.columns,rf_fes.feature_importances_)
# plt.ylabel('Features')
# plt.xlabel('Feature Importance')
# plt.title('Feature importance for Network Intrusion Detection')
# plt.show()
# rfecv_file

In [92]:

# Reuse the cached RFECV selector when one exists; otherwise run the 5-fold
# cross-validated recursive feature elimination and cache the fitted selector.
# NOTE(security): pickle.load can execute arbitrary code from the file, so only
# load cache files that this notebook wrote itself.
if isfile(rfecv_file):
    with open(rfecv_file, 'rb') as rfe_cv:
        rfe = pl.load(rfe_cv)
else:
    # Create the cache folder before the expensive fit, so a missing 'pickles/'
    # directory cannot make the dump fail with FileNotFoundError afterwards.
    os.makedirs(os.path.dirname(rfecv_file), exist_ok=True)
    rfe = RFECV(rf_fes, cv=5, scoring='accuracy')
    rfe.fit(X_train, y_train)
    with open(rfecv_file, 'wb') as rfe_cv:
        pl.dump(rfe, rfe_cv)

In [93]:
# with open(rfecv_file, 'wb') as rfe_cv:
#         pl.dump(rfe, rfe_cv)

In [94]:
# rfe=RFECV(rf_fes,cv=5,scoring='accuracy')
# rfe.fit(X_train, y_train)

In [95]:
# rfe.get_support()

In [96]:
# Boolean mask with one entry per feature the selector was fitted on.
support_mask = rfe.get_support()

# A selector loaded from a stale cache may have been fitted on a different number
# of columns; zip() would then silently truncate and pick the wrong features.
if len(support_mask) != len(X_train.columns):
    raise ValueError(
        f'RFECV selector covers {len(support_mask)} features but X_train has '
        f'{len(X_train.columns)} columns; delete {rfecv_file} and re-run.'
    )

# Names of the columns RFECV decided to keep, and the training frame reduced to them.
feature_selected = [col_name for col_name, keep in zip(X_train.columns, support_mask) if keep]
new_X = X_train[feature_selected]

# Actual prediction

In [97]:
# Final classifier, trained on the RFECV-selected features only. A cached model
# is reused when present; otherwise a new one is trained and cached.
# NOTE(security): pickle.load can execute arbitrary code from the file, so only
# load cache files that this notebook wrote itself.
if isfile(intrusion_model):
    with open(intrusion_model, 'rb') as i_m:
        rf_actual = pl.load(i_m)
else:
    # Create the cache folder before training, so a missing 'pickles/' directory
    # cannot make the dump fail with FileNotFoundError afterwards.
    os.makedirs(os.path.dirname(intrusion_model), exist_ok=True)
    rf_actual = RandomForestClassifier(random_state=0)
    rf_actual.fit(new_X, y_train)
    with open(intrusion_model, 'wb') as i_m:
        pl.dump(rf_actual, i_m)

Prediction using the test dataset

In [98]:
# Integer-encode every test column, then keep only the selected features.
# NOTE(review): fit_transform re-fits the encoder on each test column, so the
# codes are derived from the test values alone and need not match the codes the
# same values received in the training frame — confirm before predicting on `test`.
for column_name in test.columns:
    test[column_name] = label_encoder.fit_transform(test[column_name])
test = test.loc[:, feature_selected]

In [None]:
# Evaluate on the held-out split, restricted to the selected features.
X_eval = X_test[feature_selected]
y_pred = rf_actual.predict(X_eval)
# Predicted probability for the class in column 1 of predict_proba's output.
y_p_p = rf_actual.predict_proba(X_eval)[:, 1]

print('Accuracy Score: ', accuracy_score(y_test, y_pred))
# The remaining metrics are all support-weighted averages over the classes.
for metric_label, metric_fn in (
    ('F1 Score: ', f1_score),
    ('Recall Score: ', recall_score),
    ('Precision Score: ', precision_score),
):
    print(metric_label, metric_fn(y_test, y_pred, average='weighted'))
# print(roc_auc_score(y_test, y_pred, average='weighted', multi_class='ovo'))


In [None]:
# precision, recall, thresholds = roc_curve(y_test, y_p_p, pos_label=True)
# plt.fill_between(recall, precision)
# plt.ylabel("True Positive Rate")
# plt.xlabel("False Positive Rate")
# plt.grid(True)
# plt.title("Network Intrusion Detection - ROC curve")
# plt.savefig('roc.jpg')

In [None]:

# ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y_test, y_pred, labels=rf_actual.classes_), display_labels=rf_actual.classes_).plot()
# plt.savefig('conf_mat.jpg')
# plt.show()

# Mapped y 'attack_cat' which is the target

In [102]:
# Pair each row's original category with its integer code.
map_y = Series(list(zip(unencoded_y, encoded_y)), name='mapped_y')

# Distinct (category, code) pairs in sorted order; the counts themselves are not needed.
pair_counts = map_y.value_counts().sort_index()
mapped_y = list(pair_counts.index)

# Interpret y_pred to attack category

In [103]:
# Decode the integer predictions in y_pred back to attack-category names.
# NOTE(review): a cached list is used as-is even if y_pred has changed since it
# was written — delete the cache file after retraining or resampling.
# NOTE(security): pickle.load can execute arbitrary code from the file, so only
# load cache files that this notebook wrote itself.
if isfile(attack_map):
    with open(attack_map, 'rb') as att_map:
        y_pred_attack_cat = pl.load(att_map)
else:
    # Build the code -> category lookup once instead of scanning mapped_y for
    # every prediction. setdefault keeps the first pair seen for a code.
    code_to_category = {}
    for cat, enc in mapped_y:
        code_to_category.setdefault(enc, cat)

    # Predictions whose code has no known category are skipped.
    y_pred_attack_cat = [
        code_to_category[code] for code in y_pred if code in code_to_category
    ]

    # Create the cache folder first so the dump cannot fail on a missing 'pickles/'.
    os.makedirs(os.path.dirname(attack_map), exist_ok=True)
    with open(attack_map, 'wb') as att_map:
        pl.dump(y_pred_attack_cat, att_map)


In [104]:
# Count the predictions per attack category, ordered from rarest to most frequent.
Y_pred_att_cat = Series(y_pred_attack_cat, name='y_pred_attack_category')
category_counts = list(Y_pred_att_cat.value_counts().sort_values().items())

# Split the (category, count) pairs into the two plot axes.
X_plt = [category for category, _ in category_counts]
y_plt = [count for _, count in category_counts]

# list(Y_pred_att_cat.value_counts().sort_values().items())

In [None]:
# Plot how many predictions fell into each attack category.
fig, ax = plt.subplots(figsize=(9, 7))
ax.plot(X_plt, y_plt, marker='d', ms=5)

# Label the figure so it can be read without the surrounding cells.
ax.set_xlabel('Predicted attack category')
ax.set_ylabel('Number of predictions')
ax.set_title('Predicted attack categories')

fig.tight_layout()
plt.show()