# Training a CNN for DDoS attack detection
In questo laboratorio, addestreremo e testeremo una rete neurale per l'individuazione di attacchi di rete di tipo Distributed Denial of Service (DDoS).

<img src="ml-workflow.png" width="90%">

In [1]:
# Author: Roberto Doriguzzi-Corin
# Project: Course on Network Intrusion and Anomaly Detection with Deep Learning
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
import time
import glob
import argparse
import tensorflow as tf
import numpy as np
import random as rn
import os
import csv
import h5py
import logging
import pprint
import pyshark

# Seed every random number generator used by the notebook so that runs are repeatable.
SEED = 1
# NOTE(review): PYTHONHASHSEED is set after the interpreter has started; presumably it only
# affects child processes -- confirm whether this is the intended effect.
os.environ['PYTHONHASHSEED']=str(SEED)
np.random.seed(SEED)  # NumPy global generator
rn.seed(SEED)         # Python standard-library `random` module
# TF1-style session configuration limited to a single inter-op thread.
# NOTE(review): `config` is not passed to any session in the code visible here.
config = tf.compat.v1.ConfigProto(inter_op_parallelism_threads=1)
# Reduce the verbosity of TensorFlow's native (C++) and Python loggers.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
tf.get_logger().setLevel(logging.ERROR)

# Folder where the trained model is saved (see the cell that writes 'nids_model.h5').
OUTPUT_FOLDER = "./output/"

from itertools import cycle
from tensorflow.keras.optimizers import Adam,SGD
from tensorflow.keras.layers import Dense, Activation,  Flatten, Conv2D
from tensorflow.keras.layers import  GlobalMaxPooling2D
from tensorflow.keras.models import Model, Sequential, save_model, load_model, clone_model
from sklearn.metrics import f1_score, precision_score, accuracy_score, log_loss, confusion_matrix
from sklearn.utils import shuffle

import tensorflow.keras.backend as K
# Seed TensorFlow's global generator with the same SEED used for NumPy and `random`.
tf.random.set_seed(SEED)
# Tensors are laid out with the channel axis last; load_dataset appends that axis to the samples.
K.set_image_data_format('channels_last')
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
config.gpu_options.allow_growth = True  # dynamically grow the memory used on the GPU
#config.log_device_placement = True  # to log device placement (on which device the operation ran)

from lucid_dataset_parser import *
from util_functions import *

In [2]:
def load_dataset(path):
    """Load features and labels from the first HDF5 file matching ``path``.

    Args:
        path: Glob pattern (or plain path) of the dataset file. Only the first
            match returned by ``glob.glob`` is read.

    Returns:
        A tuple ``(X, Y)``. ``X`` is the content of the ``"set_x"`` dataset with
        a trailing axis of size 1 appended, i.e. shape
        ``(samples, dim1, dim2, 1)``; ``Y`` is the content of ``"set_y"``.

    Raises:
        FileNotFoundError: If no file matches ``path``.
    """
    matches = glob.glob(path)
    if not matches:
        # Fail with an explicit message instead of an opaque IndexError.
        raise FileNotFoundError("No dataset file matches: " + str(path))

    # The context manager closes the HDF5 handle as soon as the arrays have
    # been copied into memory (the original left the file open).
    with h5py.File(matches[0], "r") as dataset:
        set_x_orig = np.array(dataset["set_x"][:])  # features
        set_y_orig = np.array(dataset["set_y"][:])  # labels

    # Append the channel axis expected by the Conv2D input layer.
    X = np.reshape(set_x_orig, (set_x_orig.shape[0], set_x_orig.shape[1], set_x_orig.shape[2], 1))
    Y = set_y_orig

    return X, Y

## Definizione dei parametri della rete neurale

In [3]:
# Hyperparameters used by the training cells below.
MAX_EPOCHS=100 # number of rounds of training (start with 100 and then try with e.g., 500)
LR = 0.01  # learning rate handed to the optimizer through compileModel
BATCH_SIZE = 2048  # samples per gradient update, passed to model.fit by trainingEpoch
KERNELS = 64 # the "k" parameter in the figure (number of convolutional filters)
KERNEL_HEIGHT = 3 # the "h" parameter in the figure (rows spanned by each filter)

In [4]:
# CNN model
def Conv2DModel(model_name, input_shape,kernels,kernel_rows,kernel_col):
    """Build the (uncompiled) single-convolution binary classifier.

    The network is: Conv2D -> ReLU -> GlobalMaxPooling2D -> Flatten ->
    Dense(1, sigmoid).

    Args:
        model_name: Name given to the Keras ``Sequential`` model.
        input_shape: Shape of one input sample, without the batch axis.
        kernels: Number of convolutional filters.
        kernel_rows: Height of each filter.
        kernel_col: Width of each filter.

    Returns:
        The ``Sequential`` model; the caller is responsible for compiling it.
    """
    # Drop any previously built graph/state so repeated cell runs start clean.
    K.clear_session()

    model = Sequential(name=model_name)
    model.add(Conv2D(kernels, (kernel_rows,kernel_col), strides=(1, 1), input_shape=input_shape, name='conv0'))
    model.add(Activation('relu'))
    model.add(GlobalMaxPooling2D())
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid', name='fc1'))

    # summary() prints the table itself and returns None, so wrapping it in
    # print() only appended a stray "None" line to the output.
    model.summary()
    return model

# Addestramento della rete neurale

In [5]:
def compileModel(model,lr):
    """Compile ``model`` in place for binary classification.

    Args:
        model: Keras model to compile.
        lr: Learning rate given to the Adam optimizer.

    The loss is binary cross-entropy and accuracy is tracked as a metric.
    Nothing is returned.
    """
    # Alternative optimizer kept for reference: SGD(learning_rate=lr, momentum=0.9)
    adam = Adam(learning_rate=lr, beta_1=0.9, beta_2=0.999)
    model.compile(
        loss='binary_crossentropy',  # here we specify the loss function
        optimizer=adam,
        metrics=['accuracy'],
    )

In [6]:
def trainingEpoch(model, batch_size, X_train,Y_train,X_val,Y_val):
    """Train ``model`` for exactly one epoch and report timing and metrics.

    Args:
        model: Compiled model exposing a Keras-style ``fit`` method.
        batch_size: Number of samples per gradient update.
        X_train, Y_train: Training samples and labels.
        X_val, Y_val: Validation samples and labels, evaluated after the epoch.

    Returns:
        Tuple ``(elapsed_seconds, train_accuracy, train_loss, val_accuracy,
        val_loss)``.
    """
    started = time.time()
    fit_result = model.fit(
        x=X_train,
        y=Y_train,
        validation_data=(X_val, Y_val),
        epochs=1,
        batch_size=batch_size,
        verbose=2,
        callbacks=[],
    )
    elapsed = time.time() - started

    # With epochs=1 every metric list holds a single value.
    metrics = fit_result.history
    train_acc, train_loss, val_acc, val_loss = (
        metrics[key][0] for key in ('accuracy', 'loss', 'val_accuracy', 'val_loss')
    )
    return elapsed, train_acc, train_loss, val_acc, val_loss

In [7]:
# Load the training and validation splits: the first file in ./sample-dataset/ whose
# name ends with '-train.hdf5' / '-val.hdf5' respectively.
X_train, Y_train = load_dataset("./sample-dataset//*" + '-train.hdf5')
X_val, Y_val = load_dataset("./sample-dataset//*" + '-val.hdf5')

# Shuffle samples and labels together; the fixed random_state keeps the order repeatable.
X_train, Y_train = shuffle(X_train, Y_train, random_state=SEED)
X_val, Y_val = shuffle(X_val, Y_val, random_state=SEED)

In [8]:
# CNN model: the input shape is taken from one training sample (batch axis dropped) and
# the filter width is X_train.shape[2], so each filter spans the full third axis of a sample.
model = Conv2DModel("cnn", X_train.shape[1:4], KERNELS,KERNEL_HEIGHT, X_train.shape[2])
compileModel(model,LR)

Model: "cnn"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv0 (Conv2D)              (None, 8, 1, 64)          2176      
                                                                 
 activation (Activation)     (None, 8, 1, 64)          0         
                                                                 
 global_max_pooling2d (Globa  (None, 64)               0         
 lMaxPooling2D)                                                  
                                                                 
 flatten (Flatten)           (None, 64)                0         
                                                                 
 fc1 (Dense)                 (None, 1)                 65        
                                                                 
Total params: 2,241
Trainable params: 2,241
Non-trainable params: 0
_____________________________________________________________

In [9]:
# Columns of the per-epoch training log.
training_fieldnames = ['Samples', 'Time', 'Accuracy(train)', 'Loss(train)', 'Accuracy(val)', 'Loss(val)']
# Write mode empties an existing file on open, which replaces the previous
# "open in append mode, then truncate(0)" sequence with a single step.
training_file = open('./training-binary.csv', 'w', newline='')
training_writer = csv.DictWriter(training_file, fieldnames=training_fieldnames)
training_writer.writeheader()
training_file.flush()  # make the header visible on disk before training starts

In [10]:
# Training variables
min_loss = float('inf')
max_acc_val = 0
best_model = None

In [11]:
# Train one epoch at a time so that metrics can be logged and the best model
# (highest validation accuracy) can be snapshotted after every epoch.
for epoch in range(MAX_EPOCHS):
    print("Epoch: %d/%s" % (epoch + 1, MAX_EPOCHS))
    ttime, acc_train,loss_train, acc_val, loss_val= trainingEpoch(model, BATCH_SIZE, X_train, Y_train, X_val, Y_val)
    row = {'Samples': Y_train.shape[0], 'Time': '{:10.3f}'.format(ttime), 'Accuracy(train)': acc_train,
           'Loss(train)': loss_train, 'Accuracy(val)': acc_val, 'Loss(val)': loss_val}
    training_writer.writerow(row)
    # Flush after every epoch so the log is complete on disk even if the
    # training is interrupted before the file is closed.
    training_file.flush()

    if acc_val > max_acc_val:
        max_acc_val = acc_val
        best_model_loss_val = loss_val
        # clone_model copies only the architecture, so the trained weights
        # have to be copied over explicitly.
        best_model = clone_model(model)
        best_model.set_weights(model.get_weights())

Epoch: 1/100
2/2 - 0s - loss: 0.6880 - accuracy: 0.4614 - val_loss: 0.6716 - val_accuracy: 0.7157 - 284ms/epoch - 142ms/step
Epoch: 2/100
2/2 - 0s - loss: 0.6643 - accuracy: 0.8103 - val_loss: 0.6500 - val_accuracy: 0.9240 - 32ms/epoch - 16ms/step
Epoch: 3/100
2/2 - 0s - loss: 0.6418 - accuracy: 0.9272 - val_loss: 0.6261 - val_accuracy: 0.9167 - 34ms/epoch - 17ms/step
Epoch: 4/100
2/2 - 0s - loss: 0.6169 - accuracy: 0.8624 - val_loss: 0.6011 - val_accuracy: 0.7525 - 33ms/epoch - 17ms/step
Epoch: 5/100
2/2 - 0s - loss: 0.5908 - accuracy: 0.7833 - val_loss: 0.5761 - val_accuracy: 0.7794 - 38ms/epoch - 19ms/step
Epoch: 6/100
2/2 - 0s - loss: 0.5640 - accuracy: 0.8613 - val_loss: 0.5503 - val_accuracy: 0.9142 - 27ms/epoch - 13ms/step
Epoch: 7/100
2/2 - 0s - loss: 0.5361 - accuracy: 0.9229 - val_loss: 0.5250 - val_accuracy: 0.9167 - 28ms/epoch - 14ms/step
Epoch: 8/100
2/2 - 0s - loss: 0.5092 - accuracy: 0.9229 - val_loss: 0.5009 - val_accuracy: 0.9167 - 25ms/epoch - 13ms/step
Epoch: 9/100
2

In [12]:
# Score the best model on the validation set and save it to OUTPUT_FOLDER.
if best_model is not None:
    tp0 = time.time()
    Y_pred_val = (best_model.predict(X_val) > 0.5)  # sigmoid output thresholded at 0.5
    tp1 = time.time()
    Y_true_val = Y_val.reshape((Y_val.shape[0], 1))
    f1_score_val = f1_score(Y_true_val, Y_pred_val)

    try:
        # Creates the folder if needed; no error when it already exists.
        os.makedirs(OUTPUT_FOLDER, exist_ok=True)
        best_model.save(OUTPUT_FOLDER+'nids_model.h5')
        print("F1 Score of the saved on the validation set: ", f1_score_val)
    except Exception as save_error:
        # Report the cause instead of hiding it; unlike a bare `except:`,
        # this no longer swallows KeyboardInterrupt/SystemExit.
        print("An exception occurred when saving the model!", save_error)

F1 Score of the saved on the validation set:  0.9828009828009828


In [13]:
# Close the CSV training log (this also writes out any buffered rows) and report its path.
training_file.close()
print("Training log saved in file: ", training_file.name)

Training log saved in file:  ./training-binary.csv


# Test della rete neurale

In [2]:
def report_results(Y_true, Y_pred, model_name, data_source, prediction_time):
    """Pretty-print a one-row summary of a prediction run.

    Args:
        Y_true: Ground-truth labels as an array, or ``None`` / a 0-d array
            when no labels are available.
        Y_pred: Array of binary predictions (one entry per sample).
        model_name: Name reported in the ``Model`` column.
        data_source: Value reported in the ``Source`` column.
        prediction_time: Prediction time in seconds.

    When labels are available the row contains accuracy, F1 score and the
    TPR/FPR/TNR/FNR rates; otherwise those fields are reported as "N/A".
    The original code left ``row`` undefined in the unlabeled case and crashed
    in ``pprint``. Nothing is returned.
    """
    n_samples = Y_pred.shape[0]
    # np.sum handles both flat and column-shaped predictions; the guard avoids
    # a division by zero on an empty prediction array.
    ddos_fraction = float(np.sum(Y_pred)) / n_samples if n_samples else 0.0
    ddos_rate = '{:04.3f}'.format(ddos_fraction)

    if Y_true is not None and len(Y_true.shape) > 0:  # if we have the labels, we can compute the classification accuracy
        Y_true = Y_true.reshape((Y_true.shape[0], 1))
        accuracy = accuracy_score(Y_true, Y_pred)

        f1 = f1_score(Y_true, Y_pred)
        tn, fp, fn, tp = confusion_matrix(Y_true, Y_pred, labels=[0, 1]).ravel()
        tnr = tn / (tn + fp)
        fpr = fp / (fp + tn)
        fnr = fn / (fn + tp)
        tpr = tp / (tp + fn)

        accuracy_txt = '{:05.4f}'.format(accuracy)
        f1_txt = '{:05.4f}'.format(f1)
        tpr_txt = '{:05.4f}'.format(tpr)
        fpr_txt = '{:05.4f}'.format(fpr)
        tnr_txt = '{:05.4f}'.format(tnr)
        fnr_txt = '{:05.4f}'.format(fnr)
    else:
        # No labels: classification metrics cannot be computed.
        accuracy_txt = f1_txt = tpr_txt = fpr_txt = tnr_txt = fnr_txt = "N/A"

    row = {'Model': model_name, 'Time': '{:04.3f}'.format(prediction_time),
           'Samples': n_samples, 'DDOS%': ddos_rate, 'Accuracy': accuracy_txt, 'F1Score': f1_txt,
           'TPR': tpr_txt, 'FPR': fpr_txt, 'TNR': tnr_txt, 'FNR': fnr_txt, 'Source': data_source}

    pprint.pprint(row, sort_dicts=False)

In [3]:
def predict(dataset_path, model_path):
    """Evaluate a saved model on the test split found under ``dataset_path``.

    Args:
        dataset_path: Folder containing a file whose name ends with
            ``-test.hdf5``. If ``None`` the function does nothing.
        model_path: Path to a saved Keras model; must end with ``.h5``.

    The predictions (sigmoid output thresholded at 0.5) are summarised with
    ``report_results``. Nothing is returned.
    """
    if dataset_path is None:
        return

    # Validate the model path before any I/O. Returning (instead of calling
    # exit(-1)) keeps the notebook kernel alive; the former "Invalid model
    # path" branch could never run after this check and has been removed.
    if model_path is None or not model_path.endswith('.h5'):
        print ("No valid model specified!")
        return

    X_test, y_test = load_dataset(dataset_path + "/*" + '-test.hdf5')
    model = load_model(model_path)

    pt0 = time.time()
    Y_pred = np.squeeze(model.predict(X_test, batch_size=16) > 0.5,axis=1)
    pt1 = time.time()
    prediction_time = pt1 - pt0

    report_results(np.squeeze(y_test), Y_pred,  model.name, '', prediction_time)

In [4]:
def predict_live(source,model_path):
    """Classify network traffic read from a pcap file or a live interface.

    Args:
        source: Path of a ``.pcap`` file, or the name of a network interface
            to capture from. If ``None`` the function does nothing.
        model_path: Path of the saved Keras model to load.

    Traffic is processed in a loop: each batch of samples returned by
    ``process_live_traffic`` is normalised, classified (sigmoid output
    thresholded at 0.5) and summarised with ``report_results``. For a pcap
    file the loop stops when no more samples are produced; for a live
    interface it runs until interrupted.
    """
    if source is None:
        return

    if source.endswith('.pcap'):
        pcap_file = source
        cap = pyshark.FileCapture(pcap_file)
        data_source = pcap_file.split('/')[-1].strip()
    else:
        cap =  pyshark.LiveCapture(interface=source)
        # The original read `args.predict_live`, but no `args` object exists
        # in this notebook; the interface name is the `source` argument.
        data_source = source

    print ("Prediction on network traffic from: ", source)

    if model_path is not None:
        model = load_model(model_path)
    else:
        print ("Invalid model path: ", model_path)
        return

    # load the labels, if available
    labels = parse_labels('DOS2019')

    mins, maxs = static_min_max(flatten=False,time_window=10,max_flow_len=10)

    while (True):
        samples = process_live_traffic(cap, 'DOS2019', labels, max_flow_len=10, traffic_type="all")
        if len(samples) > 0:
            X,Y_true,flow_ids = dataset_to_list_of_fragments(samples)
            X_flatten = flatten_samples(X)
            X = np.array(normalize(X_flatten, mins, maxs))
            if labels is not None:
                Y_true = np.array(Y_true)
            else:
                Y_true = None

            pt0 = time.time()
            Y_pred = np.squeeze(model.predict(X, batch_size=2048) > 0.5,axis=1)
            pt1 = time.time()
            prediction_time = pt1 - pt0

            # Report where the traffic came from (previously computed but
            # replaced by an empty string in the call).
            report_results(np.squeeze(Y_true), Y_pred,  model.name, data_source, prediction_time)
        elif isinstance(cap, pyshark.FileCapture):
            # Without this exit the loop spins forever once a pcap file is used up.
            # NOTE(review): assumes process_live_traffic returns an empty result
            # only when the capture file is exhausted -- confirm against its implementation.
            print("\nNo more packets in file ", data_source)
            break

In [5]:
# Evaluate the saved model on the '-test.hdf5' split found in ./sample-dataset/.
predict('./sample-dataset/','./output/nids_model.h5')

{'Model': 'cnn',
 'Time': '0.114',
 'Samples': 453,
 'DDOS%': '0.561',
 'Accuracy': '0.9912',
 'F1Score': '0.9922',
 'TPR': '0.9845',
 'FPR': '0.0000',
 'TNR': '1.0000',
 'FNR': '0.0155',
 'Source': ''}


In [6]:
# Run the saved model on traffic read from the sample pcap file (a '.pcap' source is opened as a file capture).
predict_live('./sample-dataset/CIC-DDoS-2019-UDPLag.pcap','./output/nids_model.h5')

Prediction on network traffic from:  ./sample-dataset/CIC-DDoS-2019-UDPLag.pcap


  break


KeyboardInterrupt: 