### Author:  Sydney M. Kasongo

#### The model presented in this notebook uses GRUs to classify intrusion detection data (NSL-KDD Dataset)
#### The ML model was built using the Keras framework https://keras.io/ (Using TensorFlow as backend)

### Import all the necessary libraries and functions

In [None]:
from __future__ import print_function

import math
import time

from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
import numpy as np
from numpy import array
import pandas as pd
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from mlxtend.evaluate import confusion_matrix
from mlxtend.plotting import plot_confusion_matrix
#IMPORT KERAS LIBRARIES
from keras.models import Sequential
from keras.layers import Dense, LSTM, GRU
from keras.utils import np_utils
import keras.layers
from keras import metrics
import keras_metrics as km
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
# Notebook display settings: show at most 10 rows of a DataFrame and render
# floats with a single decimal place.
pd.set_option('display.max_rows', 10)
pd.set_option('display.float_format', '{:.1f}'.format)

### Import the NSL-KDD Dataset and conduct all the necessary mappings

In [None]:
# Load the raw NSL-KDD train and test files (comma separated, no header row).
nslkdd_train = pd.read_csv('C:/Users/sydne/Documents/Sydney/UJ/datasets/NSLKDD/KDDTrain+.txt', sep=",", header=None)
nslkdd_test = pd.read_csv('C:/Users/sydne/Documents/Sydney/UJ/datasets/NSLKDD/KDDTest+.txt', sep=",", header=None)

# Names for the 43 columns of each file, in file order.
columns = [
    "duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes",
    "land", "wrong_fragment", "urgent", "hot", "num_failed_logins", "logged_in",
    "num_compromised", "root_shell", "su_attempted", "num_root", "num_file_creations", "num_shells",
    "num_access_files", "num_outbound_cmds", "is_host_login", "is_guest_login", "count", "srv_count",
    "serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate", "same_srv_rate", "diff_srv_rate",
    "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count", "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate", "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate", "class", "unknown",
]

nslkdd_train.columns = columns
nslkdd_test.columns = columns

# Keep the first 42 columns (everything up to and including "class"), which
# drops the trailing "unknown" column. The explicit .copy() makes each frame
# an independent object, so the column assignments done in later cells do not
# raise pandas' SettingWithCopyWarning.
nslkdd_train = nslkdd_train[columns[0:42]].copy()
nslkdd_test = nslkdd_test[columns[0:42]].copy()


# Maps every raw NSL-KDD label to one of the five traffic categories
# ('dos', 'probe', 'r2l', 'u2r', 'normal').
#
# The first group holds the labels that were already listed here. The second
# group holds the attack names that occur only in KDDTest+: without them
# Series.map() yields NaN for those rows, and the fillna(0) applied to y_test
# in the clean-up cell would then relabel those attacks as class 0 (normal).
# NOTE(review): the category chosen for the test-only attacks follows the
# commonly used NSL-KDD taxonomy - confirm it matches the one intended here.
attack_types_dict = {
    # Labels present in KDDTrain+
    'back': 'dos',
    'buffer_overflow': 'u2r',
    'ftp_write': 'r2l',
    'guess_passwd': 'r2l',
    'imap': 'r2l',
    'ipsweep': 'probe',
    'land': 'dos',
    'loadmodule': 'u2r',
    'multihop': 'r2l',
    'neptune': 'dos',
    'nmap': 'probe',
    'perl': 'u2r',
    'phf': 'r2l',
    'pod': 'dos',
    'portsweep': 'probe',
    'rootkit': 'u2r',
    'satan': 'probe',
    'smurf': 'dos',
    'spy': 'r2l',
    'teardrop': 'dos',
    'warezclient': 'r2l',
    'warezmaster': 'r2l',
    'normal': 'normal',
    # Labels that appear only in KDDTest+
    'apache2': 'dos',
    'mailbomb': 'dos',
    'processtable': 'dos',
    'udpstorm': 'dos',
    'worm': 'dos',
    'mscan': 'probe',
    'saint': 'probe',
    'httptunnel': 'r2l',
    'named': 'r2l',
    'sendmail': 'r2l',
    'snmpgetattack': 'r2l',
    'snmpguess': 'r2l',
    'xlock': 'r2l',
    'xsnoop': 'r2l',
    'ps': 'u2r',
    'sqlattack': 'u2r',
    'xterm': 'u2r',
}

# Multiclass classification: integer id assigned to each traffic category
# (0 for normal traffic, 1-4 for the four attack categories).
class_mapping = dict(r2l=1, u2r=2, probe=3, dos=4, normal=0)

#Binary Classification
# class_mapping = {
#     "r2l" : 1,
#     "u2r" : 1,
#     "probe" : 1,
#     "dos" : 1,
#     "normal" : 0
# }

# Convert the "class" column of both frames in two chained lookups:
#   step 1: raw label     -> traffic category (attack_types_dict)
#   step 2: category name -> integer class id (class_mapping)
# A value missing from either dictionary becomes NaN.
nslkdd_train["class"] = nslkdd_train["class"].map(attack_types_dict).map(class_mapping)
nslkdd_test["class"] = nslkdd_test["class"].map(attack_types_dict).map(class_mapping)


### Instantiate a Label Encoder and encode categorical variables

In [None]:
# protocol_type, flag and service are categorical variables - use label encoding.
label_encoder = LabelEncoder()

# For each categorical column, fit the encoder ONCE on the values of both
# frames and then transform each frame with that single fit. Re-fitting
# separately on train and test (as was done before) assigns integer codes from
# each frame's own sorted set of values, so the same category can receive
# different codes in the two frames whenever their value sets differ.
# Fitting on the combined values also guarantees transform() never meets an
# unseen category.
for categorical_column in ('protocol_type', 'flag', 'service'):
    label_encoder.fit(
        pd.concat([nslkdd_train[categorical_column], nslkdd_test[categorical_column]])
    )
    # Store the numerical values in a new "<column>_encoded" column.
    encoded_column = categorical_column + '_encoded'
    nslkdd_train[encoded_column] = label_encoder.transform(nslkdd_train[categorical_column])
    nslkdd_test[encoded_column] = label_encoder.transform(nslkdd_test[categorical_column])

### Clean up the dataset

In [None]:
# Columns removed from the feature matrices: the three raw categorical columns
# (their "_encoded" counterparts are kept) and the target column.
non_feature_columns = ['flag', 'service', 'protocol_type', 'class']

X_nslkdd_train = nslkdd_train.drop(columns=non_feature_columns)
y_nslkdd_train = nslkdd_train['class']

X_test = nslkdd_test.drop(columns=non_feature_columns)
# Infinite test labels are turned into NaN, then every NaN label is set to 0.
# NOTE(review): 0 is the id of "normal" in class_mapping, so any test row whose
# label could not be mapped is counted as normal traffic - confirm intended.
y_test = nslkdd_test['class'].replace([np.inf, -np.inf], np.nan).fillna(0)

##### The reduced feature vector reduced_feature_vector was  generated using the XGBoost classifier in another notebook.

#### In order to scale the data, there are two options:####
 1. use the built-in  MinMaxScaler()
 2. use custom built function scale_series() [Author: Dr Sydney Kasongo]


In [None]:

# Names of the 22 features kept as model inputs (order defines the column
# order of the arrays fed to the network).
reduced_feature_vector = [
    'src_bytes',
    'same_srv_rate',
    'dst_host_serror_rate',
    'dst_host_diff_srv_rate',
    'dst_host_srv_diff_host_rate',
    'count',
    'protocol_type_encoded',
    'diff_srv_rate',
    'dst_bytes',
    'dst_host_srv_count',
    'serror_rate',
    'wrong_fragment',
    'num_compromised',
    'service_encoded',
    'dst_host_same_src_port_rate',
    'hot',
    'num_file_creations',
    'dst_host_rerror_rate',
    'dst_host_count',
    'logged_in',
    'dst_host_same_srv_rate',
    'flag_encoded',
]

# Restrict both feature matrices to the selected features, in list order.
X_nslkdd_train = X_nslkdd_train.loc[:, reduced_feature_vector]
X_test = X_test.loc[:, reduced_feature_vector]

# Generate the train and validation datasets; the fixed random_state makes the
# split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X_nslkdd_train,
    y_nslkdd_train,
    random_state=10,
)

# Scaling: one min-max scaler per data split. They are only instantiated here;
# none of the cells shown in this notebook fits or applies them.
scaler_train, scaler_val, scaler_test = MinMaxScaler(), MinMaxScaler(), MinMaxScaler()

#Log Scaling
def log_normalize(series):
    """Return ``log(x + 1)`` of every element of *series*.

    Args:
        series: pandas Series of numbers; each value must be greater than -1,
            otherwise ``math.log`` raises ``ValueError``.

    Returns:
        A new Series with the same index holding the transformed values.
    """
    return series.apply(lambda x: math.log(x + 1.0))


def scale_series(df, col_names):
    """Log-scale the selected columns of *df*.

    Args:
        df: DataFrame holding the columns to scale. It is not modified.
        col_names: iterable of column names to transform with
            ``log_normalize``.

    Returns:
        A new DataFrame with the same index as *df* that contains exactly the
        columns in *col_names* (in that order), each log-normalized.
    """
    # Start from df's index so the result keeps the row labels (and the row
    # count even when col_names is empty).
    processed_inputs = pd.DataFrame(index=df.index)
    for col in col_names:
        processed_inputs[col] = log_normalize(df[col])
    # Return the scaled frame; returning `df` here silently discarded the
    # scaling and handed the raw values back to the caller.
    return processed_inputs

# Scale each split and reshape it to (samples, 1, features): one time step per
# record, matching the input_shape=(1, 22) expected by the first GRU layer.
# The sample dimension is inferred (-1) and the feature dimension is taken from
# the feature list instead of hard-coding the row counts of one particular
# dataset/split, so the cell keeps working if the number of rows changes.
n_features = len(reduced_feature_vector)
X_train = array(scale_series(X_train, reduced_feature_vector)).reshape(-1, 1, n_features)
X_val = array(scale_series(X_val, reduced_feature_vector)).reshape(-1, 1, n_features)
X_test = array(scale_series(X_test, reduced_feature_vector)).reshape(-1, 1, n_features)

# One-hot encode the integer class ids for the softmax output layer.
y_train = np_utils.to_categorical(y_train)
y_val = np_utils.to_categorical(y_val)
y_test = np_utils.to_categorical(y_test)

#### Build the model and setup the hyperparameters

In [None]:
# Creating the model: three stacked GRU layers of 50 units each followed by a
# 5-way softmax classifier. The first two GRU layers return the full sequence
# so that the next recurrent layer receives a 3-D input.
dlstm_model = Sequential([
    GRU(50, return_sequences=True, input_shape=(1, 22)),
    GRU(50, return_sequences=True),
    GRU(50),
    Dense(5, activation='softmax'),
])

#### Compile the model

In [None]:
# Compiling model.
# Author's note on alternatives: categorical_crossentropy / mean_squared_error
# for the loss, optimizer='adam'.
# The order of the metrics matters: the evaluation cell reads precision and
# recall from positions 2 and 3 of the list returned by evaluate().
dlstm_model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy', km.binary_precision(), km.binary_recall()],
)

#### Train the model

In [None]:
# Wall-clock timestamp taken right before training starts.
start = time.time()

# Fit the model on the training split, evaluating on the validation split
# after every epoch; verbose=0 suppresses the per-epoch log output.
dlstm_neural_net_model = dlstm_model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=500,
    verbose=0,
)

# Wall-clock timestamp taken right after training ends, and the elapsed
# training time in seconds.
end = time.time()
delay = end - start

#### Evaluate the model

In [None]:
# Validation and test scores. evaluate() returns the loss followed by the
# metrics in the order passed to compile(): [loss, accuracy, precision, recall].
dlstm_validation_scores = dlstm_model.evaluate(X_val, y_val)
dlstm_test_scores = dlstm_model.evaluate(X_test, y_test)

# Compute f1_score - F Measure (in percent, from the validation scores).
precision = dlstm_validation_scores[2] * 100
recall = dlstm_validation_scores[3] * 100

# Guard against ZeroDivisionError when both precision and recall are 0.
if precision + recall:
    f1score = (2 * precision * recall) / (precision + recall)
else:
    f1score = 0.0

print("LTSM Binary")
print("\n DLSTM Test Accuracy: %.2f%%" % (dlstm_test_scores[1]*100))
print("Training Time "+ str(delay))


# The import cell binds the name plot_confusion_matrix twice (mlxtend first,
# scikit-learn last), so the scikit-learn function wins. This cell uses the
# mlxtend API - a `conf_mat` keyword and a `(fig, ax)` return value - so bind
# the mlxtend implementation explicitly before plotting.
from mlxtend.plotting import plot_confusion_matrix

# Class-probability predictions for every split.
y_train_pred = dlstm_model.predict(X_train)
y_test_pred = dlstm_model.predict(X_test)
y_val_pred = dlstm_model.predict(X_val)

# argmax turns the one-hot targets and the predicted probabilities back into
# integer class ids before building each confusion matrix.
cm_test = confusion_matrix(y_test.argmax(axis=1), y_test_pred.argmax(axis=1))
cm_val = confusion_matrix(y_val.argmax(axis=1), y_val_pred.argmax(axis=1))
cm_train = confusion_matrix(y_train.argmax(axis=1), y_train_pred.argmax(axis=1))

# Plot the test, validation and training confusion matrices, in that order.
fig, ax = plot_confusion_matrix(conf_mat=cm_test)
plt.show()

fig_1, ax_1 = plot_confusion_matrix(conf_mat=cm_val)
plt.show()

fig_2, ax_2 = plot_confusion_matrix(conf_mat=cm_train)
plt.show()