In [65]:
from pandas import read_parquet, get_dummies, DataFrame, Series
from numpy import int8
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import LabelEncoder
from matplotlib.pyplot import barh, show, figure
from sklearn.discriminant_analysis import StandardScaler
import numpy as np
from os import getcwd
from os.path import exists, join, isfile
import pickle as pl

In [66]:
# Location of the pickled RFECV selector, in the current working directory.
rfecv_file = join(getcwd(), 'rfecv.pickle')

# UNSW-NB15 training and testing partitions, read from parquet.
train = read_parquet('./dataset/UNSW_NB15_training-set.parquet')
test = read_parquet('./dataset/UNSW_NB15_testing-set.parquet')
# train.head()

In [67]:
# train.info()

# Check value counts in the `service` column, especially for '-'

In [68]:
# train['service'].value_counts()

# List all columns with '-'

In [69]:
# for col in train.columns:
#     for row in train[col]:
#         if(row == '-'):
#             print(col)
#             break

# Feature selection

In [70]:
# One-hot encode `service` with identical settings for both splits.
service_dummies = dict(prefix='service', prefix_sep='_', columns=['service'], drop_first=True, dtype=int8)

# `dur` and `label` are dropped from both frames; `attack_cat` stays in the
# training frame because it is the target, and is removed from the test frame.
# NOTE: `test` is overwritten here, so this cell cannot be re-run without
# reloading the data first (the `service` column no longer exists afterwards).
train_dummy = get_dummies(train, **service_dummies).drop(['dur', 'label'], axis=1)
test = get_dummies(test, **service_dummies).drop(['dur', 'label', 'attack_cat'], axis=1)

# get_dummies derives the service_* columns from each frame separately, so the
# two splits are not guaranteed to end up with the same columns. Align the test
# frame to the training feature layout: a dummy that is absent from the test
# set means "this service never occurs there", i.e. all zeros.
feature_columns = train_dummy.columns.drop('attack_cat')
missing_columns = feature_columns.difference(test.columns)
not_dummies = [col for col in missing_columns if not col.startswith('service_')]
if not_dummies:
    # Zero-filling a real feature would silently corrupt the predictions.
    raise KeyError(f'test set is missing non-dummy feature columns: {not_dummies}')
test = test.reindex(columns=feature_columns, fill_value=0)
# train_dummy

In [71]:
# Unencoded target 'attack_cat'.
# Take an explicit copy: the 'attack_cat' column of train_dummy is overwritten
# with integer codes in the encoding cell, and the saved labels must not be
# able to alias that column.
unencoded_y = train_dummy['attack_cat'].copy()

Label-encode every column of the training frame (including the target `attack_cat`)

In [72]:
# Still bound because a later cell calls `label_encoder.fit_transform` on the test set.
label_encoder = LabelEncoder()

# Keep one fitted encoder per column. Re-fitting a single shared encoder on
# every column throws away each column's class mapping, so codes (the target's
# in particular) could not be decoded with `inverse_transform` afterwards.
# NOTE: train_dummy is encoded in place; re-running this cell would fit the
# encoders on the already-encoded integer codes.
column_encoders = {}
for cat_col in train_dummy.columns:
    column_encoders[cat_col] = LabelEncoder()
    train_dummy[cat_col] = column_encoders[cat_col].fit_transform(train_dummy[cat_col])

# Encoded target
encoded_y = train_dummy['attack_cat']
# train_dummy

Use a 50% sample of the data (the classifier seems to be taking more space than expected)

In [73]:
# Randomly keep half of the encoded training rows, with a fixed seed.
sample_size = int(0.5 * len(train_dummy))
features = train_dummy.drop('attack_cat', axis=1)
X = features.sample(n=sample_size, random_state=0)

# Target values for exactly the rows that were sampled into X.
y = train_dummy.loc[X.index, 'attack_cat']

# X_size = int(0.03 * len(X))
# X = X.sample(n=int(0.03 * len(X)), random_state=42)
# y = y[X.index]

In [74]:
# Hold out 20% of the sampled rows; the seed keeps the split reproducible.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Random forest fitted on all features; it is also the estimator handed to RFECV below.
rf_fes = RandomForestClassifier(random_state=0)
rf_fes.fit(X=X_train, y=y_train)

In [76]:
# list(zip(X_train.columns,rf_fes.feature_importances_))

In [77]:
# figure(figsize=(6,12))
# barh(X_train.columns,rf_fes.feature_importances_)
# show()
# dt=None

In [78]:

# The fitted RFECV selector is cached on disk so the fit is not repeated on
# every run.
# SECURITY: pickle.load can execute arbitrary code. Only load a cache file that
# this notebook wrote itself.
rfe = None
if isfile(rfecv_file):
    with open(rfecv_file, 'rb') as rfe_cv:
        rfe = pl.load(rfe_cv)

    # A selector fitted on a different feature layout would produce a support
    # mask that no longer lines up with X_train.columns, so discard it.
    cached_names = getattr(rfe, 'feature_names_in_', None)
    same_width = len(rfe.get_support()) == X_train.shape[1]
    same_names = cached_names is None or list(cached_names) == list(X_train.columns)
    if not (same_width and same_names):
        rfe = None

if rfe is None:
    # No usable cache: run recursive feature elimination with 5-fold CV and store the result.
    rfe = RFECV(rf_fes, cv=5, scoring='accuracy')
    rfe.fit(X_train, y_train)
    with open(rfecv_file, 'wb') as rfe_cv:
        pl.dump(rfe, rfe_cv)

In [79]:
# with open(rfecv_file, 'wb') as rfe_cv:
#         pl.dump(rfe, rfe_cv)

In [80]:
# rfe=RFECV(rf_fes,cv=5,scoring='accuracy')
# rfe.fit(X_train, y_train)

In [81]:
# rfe.get_support()

In [82]:
# Boolean mask over the training columns: True where RFECV kept the feature.
support_mask = rfe.get_support()
feature_selected = [col_name for col_name, keep in zip(X_train.columns, support_mask) if keep]

# Training features restricted to the selected columns.
new_X = X_train.loc[:, feature_selected]

# Actual prediction

In [None]:
# Final model: a random forest trained only on the RFECV-selected features.
rf_actual = RandomForestClassifier(random_state=0)
rf_actual.fit(X=new_X, y=y_train)

Prediction using the test dataset

In [84]:
# test.head()
# The training features were label-encoded with vocabularies learned from the
# training data. Re-fitting an encoder on the test set would assign codes from
# the test set's own sorted unique values, so the same raw value would get a
# different code from the one the model was trained on. Rebuild the
# pre-encoding training frame and reuse its vocabulary instead.
# NOTE: run once after `test` is built; `test` is overwritten below, and a
# second run would look up already-encoded codes in the raw vocabulary.
train_unencoded = get_dummies(train, prefix='service', prefix_sep='_', columns=['service'], drop_first=True, dtype=int8)

test_encoded = {}
for test_col in feature_selected:
    # A LabelEncoder code is the position of the value among the sorted unique
    # training values, which is what searchsorted returns for values seen in training.
    train_classes = np.unique(train_unencoded[test_col].to_numpy())
    codes = np.searchsorted(train_classes, test[test_col].to_numpy())
    # Unseen values get their insertion position; clip those beyond the
    # training maximum so every code stays inside the trained range.
    test_encoded[test_col] = np.minimum(codes, len(train_classes) - 1)

# Same rows as before, restricted to the selected features in training order.
test = DataFrame(test_encoded, index=test.index)

In [None]:
# Predicted class codes for every row of the prepared test frame.
y_pred = rf_actual.predict(X=test)
# Number of predictions produced.
y_pred.shape[0]

# Map each encoded `attack_cat` code back to its category name (the target)

In [86]:
# Pair each row's original category with the integer code it was given.
category_code_pairs = list(zip(unencoded_y, encoded_y))
map_y = Series(category_code_pairs, name='mapped_y')

# Distinct (category, code) pairs, taken from the sorted index of the pair counts.
pair_counts = map_y.value_counts().sort_index()
mapped_y = [cat_list for cat_list in pair_counts.index]

# Translate y_pred into attack categories

In [None]:
# Translate each predicted integer code back to its attack-category name.
# A dictionary lookup replaces scanning every (category, code) pair for every
# single prediction.
code_to_category = {}
for cat, enc in mapped_y:
    # Keep the first category listed for a code, matching a first-match scan.
    code_to_category.setdefault(enc, cat)

# A code without a known category becomes None instead of being skipped, so
# the result always has one entry per prediction and stays aligned with y_pred.
y_pred_attack_cat = [code_to_category.get(y_pred_i) for y_pred_i in y_pred]

# Show only a short preview rather than dumping one entry per test row.
y_pred_attack_cat[:10]

In [97]:
# y_test.info()
# Peek at the first prediction; this is the integer class code, not the category name.
y_pred[0]
# NOTE(review): y_pred holds the predictions for the separate testing-set frame
# (`test`), not for the X_test hold-out split, so it cannot be scored against
# y_test as written below. Scoring the hold-out would need predictions made on
# X_test[feature_selected].
# accuracy_score(y_test, Series(y_pred, name='y_pred'))

np.int64(6)