# Training a CNN for DDoS attack detection
In this laboratory, you will train a DL model (LUCID) on a dataset of benign and DDoS network traffic stored in [*HDF5* format](https://www.hdfgroup.org/). After 100 epochs, the model that achieved the best accuracy on the validation set is saved on the hard disk in the native Keras format (*.keras*). The accuracy of the resulting model, as well as the duration of the training process, depends on a range of variables, such as the model's hyper-parameters, the model's layers (fully connected, convolutional), the optimizer (e.g., SGD or Adam) and the number of training epochs.

Although all these parameters can be tuned with automated procedures, in this laboratory you will modify them manually to understand the impact of your changes on the training process and output.

In [None]:
# Author: Roberto Doriguzzi-Corin
# Project: Course on Network Intrusion and Anomaly Detection with Machine Learning
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
import time
import glob
import argparse
import tensorflow as tf
import numpy as np
import random as rn
import os
import csv
import h5py
import logging

# Seed Random Numbers
# A single fixed seed is shared by every random number source configured below
# so that repeated runs start from the same state.
SEED = 1
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts;
# setting it here may only affect child processes -- confirm.
os.environ['PYTHONHASHSEED']=str(SEED)
np.random.seed(SEED)  # NumPy global generator
rn.seed(SEED)  # Python's built-in `random` module (imported as rn)
# NOTE(review): `config` is built here and extended further down, but no
# visible line passes it to a TensorFlow session -- confirm it is applied.
config = tf.compat.v1.ConfigProto(inter_op_parallelism_threads=1)
# Reduce the verbosity of TensorFlow's native and Python loggers.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
tf.get_logger().setLevel(logging.ERROR)

# Folder where the trained model is written at the end of training.
OUTPUT_FOLDER = "./output/"

from itertools import cycle
from tensorflow.keras.optimizers import Adam,SGD
from tensorflow.keras.layers import Dense, Activation,  Flatten, Conv2D, Input
from tensorflow.keras.layers import  GlobalMaxPooling2D
from tensorflow.keras.models import Model, Sequential, save_model, load_model, clone_model
from sklearn.metrics import f1_score, precision_score, accuracy_score, log_loss, confusion_matrix
from sklearn.utils import shuffle

import tensorflow.keras.backend as K
# Seed TensorFlow with the same SEED used for NumPy and `random`.
tf.random.set_seed(SEED)
# Channel axis last, matching the trailing axis of size 1 added by load_dataset.
K.set_image_data_format('channels_last')
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
config.gpu_options.allow_growth = True  # dynamically grow the memory used on the GPU
#config.log_device_placement = True  # to log device placement (on which device the operation ran)

In [None]:
def load_dataset(path):
    """Load the features and labels stored in an HDF5 dataset file.

    Args:
        path: Glob pattern identifying the dataset file. When several files
            match, the first one in lexicographic order is used.

    Returns:
        A tuple ``(X, Y)``. ``X`` is the content of the ``set_x`` dataset with
        a trailing axis of size 1 appended (the single channel fed to the
        convolutional layer); ``Y`` is the content of the ``set_y`` dataset.

    Raises:
        FileNotFoundError: If no file matches ``path``.
    """
    # Sorted so that the chosen file does not depend on filesystem order.
    matches = sorted(glob.glob(path))
    if not matches:
        raise FileNotFoundError("No dataset file matches the pattern: " + repr(path))

    # The context manager closes the HDF5 file once the arrays are in memory.
    with h5py.File(matches[0], "r") as dataset:
        set_x_orig = np.array(dataset["set_x"][:])  # features
        set_y_orig = np.array(dataset["set_y"][:])  # labels

    X = np.reshape(set_x_orig, (set_x_orig.shape[0], set_x_orig.shape[1], set_x_orig.shape[2], 1))
    Y = set_y_orig

    return X, Y

## Model design
The architecture of the model consists of one [convolutional layer](https://keras.io/api/layers/convolution_layers/), a global [max pooling](https://keras.io/api/layers/pooling_layers/global_max_pooling2d/) layer and a final classification [dense layer](https://keras.io/api/layers/core_layers/dense/), whose output is the probability that the input sample is a DDoS flow. Please note that the output layer of the neural network consists of only one neuron, whose value is the output of the sigmoid activation function (a real number between 0 and 1).

<img src="../../Content/artworks/ml-workflow.png" width="90%">

In [None]:
# hyperparameters
# Convolutional layer geometry.
KERNEL_HEIGHT = 3  # the "h" parameter in the figure
KERNELS = 64  # the "k" parameter in the figure

# Optimisation settings.
BATCH_SIZE = 2048  # number of samples per gradient update
LR = 0.01  # learning rate handed to the optimizer
MAX_EPOCHS = 100  # number of rounds of training (start with 100 and then try with e.g., 500)

In [None]:
# CNN model
def Conv2DModel(model_name, input_shape,kernels,kernel_rows,kernel_col):
    """Build the uncompiled CNN: Conv2D -> ReLU -> global max pooling -> Dense(1).

    The Keras backend session is cleared first, so calling this function
    discards any previously built model state held by the backend.

    Args:
        model_name: Name given to the Sequential model.
        input_shape: Shape of a single input sample (without the batch axis).
        kernels: Number of convolutional filters.
        kernel_rows: Height of each convolutional kernel.
        kernel_col: Width of each convolutional kernel.

    Returns:
        The Sequential model, ending in a single sigmoid unit. The model
        summary is printed as a side effect.
    """
    K.clear_session()

    model = Sequential(name=model_name)
    model.add(Input(shape=input_shape))
    model.add(Conv2D(kernels, (kernel_rows,kernel_col), strides=(1, 1), name='conv0'))
    model.add(Activation('relu'))
    model.add(GlobalMaxPooling2D())
    # NOTE(review): the pooled output is presumably already 2-D, which would make
    # Flatten a no-op; it is kept so that the layer list stays unchanged.
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid', name='fc1'))

    # summary() prints the table itself and returns None; wrapping it in
    # print() would add a spurious "None" line to the output.
    model.summary()
    return model

In [None]:
def compileModel(model,lr):
    """Compile `model` in place for binary classification.

    The model is configured with the Adam optimizer (learning rate `lr`),
    the binary cross-entropy loss and accuracy as the reported metric.
    Nothing is returned.
    """
    # Alternative optimizer to experiment with:
    #   SGD(learning_rate=lr, momentum=0.9)
    adam = Adam(beta_2=0.999, beta_1=0.9, learning_rate=lr)
    model.compile(
        optimizer=adam,
        loss='binary_crossentropy',  # the loss function minimised during training
        metrics=['accuracy'],
    )

In [None]:
def trainingEpoch(model, batch_size, X_train,Y_train,X_val,Y_val):
    """Train `model` for a single epoch and report its duration and metrics.

    Args:
        model: Compiled model exposing a Keras-style ``fit`` method.
        batch_size: Number of samples per gradient update.
        X_train, Y_train: Training samples and labels.
        X_val, Y_val: Validation samples and labels, evaluated after the epoch.

    Returns:
        A tuple ``(seconds, accuracy_train, loss_train, accuracy_val,
        loss_val)``, where ``seconds`` is the wall-clock duration of the
        ``fit`` call.
    """
    started = time.time()
    fit_result = model.fit(
        x=X_train,
        y=Y_train,
        validation_data=(X_val, Y_val),
        epochs=1,
        batch_size=batch_size,
        verbose=2,
        callbacks=[],
    )
    elapsed = time.time() - started

    # With epochs=1, the first entry of each metric list is this epoch's value.
    metrics = fit_result.history
    return (
        elapsed,
        metrics['accuracy'][0],
        metrics['loss'][0],
        metrics['val_accuracy'][0],
        metrics['val_loss'][0],
    )

In [None]:
# Load the training and validation sets. Each glob pattern selects a file in
# ./sample-dataset/ by its "-train.hdf5" / "-val.hdf5" suffix.
X_train, Y_train = load_dataset("./sample-dataset//*" + '-train.hdf5')
X_val, Y_val = load_dataset("./sample-dataset//*" + '-val.hdf5')

# Shuffle samples and labels together; the fixed random_state keeps the
# resulting order the same from run to run.
X_train, Y_train = shuffle(X_train, Y_train, random_state=SEED)
X_val, Y_val = shuffle(X_val, Y_val, random_state=SEED)

In [None]:
#CNN Model
# The input shape is taken from the training samples (every axis after the
# sample axis). The kernel width equals X_train.shape[2], so each kernel spans
# the full width of a sample and is KERNEL_HEIGHT rows tall.
model = Conv2DModel("cnn", X_train.shape[1:4], KERNELS,KERNEL_HEIGHT, X_train.shape[2])
compileModel(model,LR)

In [None]:
# Per-epoch training log: one CSV row per epoch with the columns listed below.
training_fieldnames = ['Samples', 'Time', 'Accuracy(train)', 'Loss(train)', 'Accuracy(val)', 'Loss(val)']
# 'w' mode discards any log left by a previous run. The file is deliberately
# kept open for the training loop and is closed once training is over.
training_file = open('./training-binary.csv', 'w', newline='')
training_writer = csv.DictWriter(training_file, fieldnames=training_fieldnames)
training_writer.writeheader()
training_file.flush()  # make the header visible on disk before training starts

In [None]:
# Training variables
best_model = None  # copy of the model with the highest validation accuracy so far
max_acc_val = 0  # highest validation accuracy seen so far
min_loss = float('inf')  # NOTE(review): not read anywhere in the visible code

In [None]:
# Train for MAX_EPOCHS epochs, logging the metrics of each epoch to the CSV
# file and keeping a copy of the model with the best validation accuracy.
for epoch in range(MAX_EPOCHS):
    print("Epoch: %d/%s" % (epoch + 1, MAX_EPOCHS))
    ttime, acc_train,loss_train, acc_val, loss_val= trainingEpoch(model, BATCH_SIZE, X_train, Y_train, X_val, Y_val)
    row = {'Samples': Y_train.shape[0], 'Time': '{:10.3f}'.format(ttime), 'Accuracy(train)': acc_train,
           'Loss(train)': loss_train, 'Accuracy(val)': acc_val, 'Loss(val)': loss_val}
    training_writer.writerow(row)
    # Flush after every epoch so the log survives an interrupted training run.
    training_file.flush()

    # Strict comparison: on equal validation accuracy the earlier epoch is kept.
    if acc_val > max_acc_val:
        max_acc_val = acc_val
        best_model_loss_val = loss_val
        # clone_model copies the architecture only, so the weights are copied
        # explicitly; later epochs then cannot alter the stored snapshot.
        best_model = clone_model(model)
        best_model.set_weights(model.get_weights())

In [None]:
# Score the best model on the validation set, then save it in OUTPUT_FOLDER.
if best_model is not None:
    # NOTE(review): tp0/tp1 time the prediction but are not reported anywhere
    # in the visible code.
    tp0 = time.time()
    Y_pred_val = (best_model.predict(X_val) > 0.5)  # threshold the sigmoid output at 0.5
    tp1 = time.time()
    Y_true_val = Y_val.reshape((Y_val.shape[0], 1))  # column vector, like the predictions
    f1_score_val = f1_score(Y_true_val, Y_pred_val)

    try:
        # exist_ok avoids the separate existence check before creating the folder.
        os.makedirs(OUTPUT_FOLDER, exist_ok=True)
        best_model.save(OUTPUT_FOLDER+'10t-10n-DOS2019-LUCID.keras')
        print("F1 Score of the saved model on the validation set: ", f1_score_val)
    except Exception as exc:
        # Best-effort save: report the cause instead of hiding it. Unlike a bare
        # `except:`, KeyboardInterrupt and SystemExit are allowed to propagate.
        print("An exception occurred when saving the model!", exc)

In [None]:
# Close the training log so that any buffered rows are written to disk.
training_file.close()
print("Training log saved in file: ", training_file.name)