In [None]:
!pip3 install pandas
!pip3 install --upgrade tensorflow-gpu

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

import pandas as pd
import numpy as np
import glob
import time
import sys

import tensorflow as tf
tf.compat.v1.disable_eager_execution()
from tensorflow.keras import models, layers
from tensorflow.keras.layers import Lambda, Input, Dense, Dropout, Conv1D, MaxPooling1D, Flatten, BatchNormalization

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

In [None]:
def get_files(path):
    """Read all CSV files in ``path`` and return the encoded attack rows.

    Every ``*.csv`` file directly inside ``path`` is loaded and the frames are
    stacked with a fresh integer index. Rows where ``attack`` is not 1 are
    discarded, the ``attack`` column and the identifier/address columns are
    removed, and ``proto``, ``state`` and ``category`` are replaced by integer
    codes. A value that is absent from its code table becomes NaN, which is
    how ``Series.map`` treats unmapped keys.

    Args:
        path: Directory holding the CSV files.

    Returns:
        pandas.DataFrame with the remaining columns and the integer-coded
        ``proto``, ``state`` and ``category`` columns.
    """
    frames = [
        pd.read_csv(csv_path, index_col = None, encoding = 'utf-8', sep = ',', low_memory = False)
        for csv_path in glob.glob(os.path.join(path, '*.csv'))
    ]
    merged = pd.concat(frames, axis = 0, ignore_index = True)

    # Keep attack traffic only, then discard the flag itself together with
    # the columns that carry no usable information for the models.
    attacks = merged.loc[merged['attack'] == 1]
    attacks = attacks.drop(columns = ['attack'])
    attacks = attacks.drop(columns = [
        'pkSeqID', 'stime', 'flgs', 'flgs_number',
        'saddr', 'sport', 'daddr', 'dport', 'subcategory',
    ])

    # Categorical to numerical: one code table per column.
    encodings = {
        'proto': {'tcp': 1, 'arp': 2, 'udp': 3, 'icmp': 4, 'ipv6-icmp': 5},
        'state': {'REQ': 1, 'RST': 2, 'ACC': 3, 'CON': 4, 'INT': 5, 'URP': 6,
                  'FIN': 7, 'NRS': 8, 'ECO': 9, 'TST': 10, 'MAS': 11},
        'category': {'DDoS': 0, 'DoS': 1, 'Reconnaissance': 2, 'Theft': 3},
    }
    for column, codes in encodings.items():
        attacks[column] = attacks[column].map(codes)

    return attacks

In [None]:
def get_files_nbaiot_mirai(path):
    """Load the CSV files in ``path`` and label each one by its file name.

    The ``category`` column is set from the first keyword found in the file's
    base name: ``ack`` -> 0, ``scan`` -> 1, ``syn`` -> 2, ``udp`` -> 3,
    ``udpplain`` -> 4.

    Only the base name is inspected, so keywords occurring in parent
    directories (e.g. "ack" inside ``mirai_attacks``) no longer mislabel
    every file. Files whose name matches no keyword are skipped instead of
    re-appending the previously loaded frame. Files are processed in sorted
    order so the row order of the result is reproducible.

    Args:
        path: Directory holding the CSV files.

    Returns:
        pandas.DataFrame with all recognised files stacked under a fresh
        integer index, plus the integer ``category`` column.

    Raises:
        ValueError: If no CSV file in ``path`` matches a known keyword.
    """
    # Order matters: 'udpplain' has to be tested before its prefix 'udp'.
    label_rules = (('ack', 0), ('scan', 1), ('syn', 2), ('udpplain', 4), ('udp', 3))

    files_list = []
    for file in sorted(glob.glob(os.path.join(path, '*.csv'))):
        file_name = os.path.basename(file)
        category = next((label for keyword, label in label_rules if keyword in file_name), None)
        if category is None:
            # Unrecognised file: it has no label, so leave it out.
            continue

        df = pd.read_csv(file, index_col = None, encoding = 'utf-8', sep = ',', low_memory = False)
        df['category'] = category
        files_list.append(df)

    if not files_list:
        raise ValueError('No recognised CSV files found in ' + repr(path))

    return pd.concat(files_list, axis = 0, ignore_index = True)

In [None]:
def get_files_nbaiot_gafgyt(path):
    """Load the CSV files in ``path`` and label each one by its file name.

    The ``category`` column is set from the first keyword found in the file's
    base name: ``combo`` -> 5, ``junk`` -> 6, ``scan`` -> 7, ``tcp`` -> 8,
    ``udp`` -> 9.

    Only the base name is inspected, so keywords occurring in parent
    directories cannot mislabel files. Files whose name matches no keyword
    are skipped instead of re-appending the previously loaded frame. Files
    are processed in sorted order so the row order of the result is
    reproducible.

    Args:
        path: Directory holding the CSV files.

    Returns:
        pandas.DataFrame with all recognised files stacked under a fresh
        integer index, plus the integer ``category`` column.

    Raises:
        ValueError: If no CSV file in ``path`` matches a known keyword.
    """
    # Checked in this order; the first keyword contained in the name wins.
    label_rules = (('combo', 5), ('junk', 6), ('scan', 7), ('tcp', 8), ('udp', 9))

    files_list = []
    for file in sorted(glob.glob(os.path.join(path, '*.csv'))):
        file_name = os.path.basename(file)
        category = next((label for keyword, label in label_rules if keyword in file_name), None)
        if category is None:
            # Unrecognised file: it has no label, so leave it out.
            continue

        df = pd.read_csv(file, index_col = None, encoding = 'utf-8', sep = ',', low_memory = False)
        df['category'] = category
        files_list.append(df)

    if not files_list:
        raise ValueError('No recognised CSV files found in ' + repr(path))

    return pd.concat(files_list, axis = 0, ignore_index = True)

In [None]:
def get_files_nbaiot_gafgyt_edsw(path):
    """Load the CSV files in ``path`` and label each one by its file name.

    Same keyword scheme as ``get_files_nbaiot_gafgyt`` but with labels that
    start at zero: ``combo`` -> 0, ``junk`` -> 1, ``scan`` -> 2, ``tcp`` -> 3,
    ``udp`` -> 4.

    Only the base name is inspected, so keywords occurring in parent
    directories cannot mislabel files. Files whose name matches no keyword
    are skipped instead of re-appending the previously loaded frame. Files
    are processed in sorted order so the row order of the result is
    reproducible.

    Args:
        path: Directory holding the CSV files.

    Returns:
        pandas.DataFrame with all recognised files stacked under a fresh
        integer index, plus the integer ``category`` column.

    Raises:
        ValueError: If no CSV file in ``path`` matches a known keyword.
    """
    # Checked in this order; the first keyword contained in the name wins.
    label_rules = (('combo', 0), ('junk', 1), ('scan', 2), ('tcp', 3), ('udp', 4))

    files_list = []
    for file in sorted(glob.glob(os.path.join(path, '*.csv'))):
        file_name = os.path.basename(file)
        category = next((label for keyword, label in label_rules if keyword in file_name), None)
        if category is None:
            # Unrecognised file: it has no label, so leave it out.
            continue

        df = pd.read_csv(file, index_col = None, encoding = 'utf-8', sep = ',', low_memory = False)
        df['category'] = category
        files_list.append(df)

    if not files_list:
        raise ValueError('No recognised CSV files found in ' + repr(path))

    return pd.concat(files_list, axis = 0, ignore_index = True)

In [None]:
def get_attack(attack_type, df, random_state=None):
    """Select the rows of one category, shuffle them and split off the labels.

    The labels are separated after shuffling, so the returned features and
    labels share the same row order and index. The input ``df`` is not
    modified.

    Args:
        attack_type: Value of the ``category`` column to select. It is
            inserted into a ``DataFrame.query`` expression via ``str()``.
        df: DataFrame containing a ``category`` column.
        random_state: Optional seed forwarded to ``DataFrame.sample`` to make
            the shuffle reproducible. ``None`` (the default) keeps the shuffle
            unseeded.

    Returns:
        Tuple ``(features, label)``: the shuffled rows without the
        ``category`` column, and the matching ``category`` Series.
    """
    selected = df.query('category==' + str(attack_type))
    shuffled = selected.sample(frac=1, random_state=random_state)

    # Pop only after shuffling, otherwise features and labels would be in
    # different row orders.
    label = shuffled.pop('category')

    return shuffled, label

In [None]:
# 1D-convolutional CNN classifier for the nbaiot dataset

def CNN(feature, depth, number_classes):
    """Assemble the (uncompiled) sequential 1D CNN classifier.

    Layer order: Conv1D(32) -> MaxPooling1D -> Conv1D(64) -> MaxPooling1D ->
    Conv1D(64) -> Flatten -> Dense(64, relu) -> Dense(number_classes, softmax).
    All convolutions use kernel size 3, ReLU, 'same' padding and the
    'he_uniform' initializer; both pooling layers use size 2 and stride 2.

    Args:
        feature: First entry of the ``input_shape`` given to the first layer.
        depth: Second entry of the ``input_shape`` given to the first layer.
        number_classes: Number of units in the final softmax layer.

    Returns:
        The ``Sequential`` model; compiling it is left to the caller.
    """
    # Settings shared by every convolutional layer.
    conv_options = {
        'activation': 'relu',
        'padding': 'same',
        'kernel_initializer': 'he_uniform',
    }

    stack = [
        layers.Conv1D(32, 3, input_shape = (feature, depth), **conv_options),
        layers.MaxPooling1D(pool_size = 2, strides = 2),
        layers.Conv1D(64, 3, **conv_options),
        layers.MaxPooling1D(pool_size = 2, strides = 2),
        layers.Conv1D(64, 3, **conv_options),
        layers.Flatten(),
        layers.Dense(64, activation = 'relu'),
        layers.Dense(number_classes, activation = 'softmax'),
    ]

    return models.Sequential(stack)

In [None]:
def train(X_train, Y_train, X_val, Y_val, epochs, batch_size, reduce_lr, model):
    """Fit ``model`` and report how long the fit took.

    Args:
        X_train: Training inputs passed to ``model.fit``.
        Y_train: Training targets passed to ``model.fit``.
        X_val: Validation inputs (first element of ``validation_data``).
        Y_val: Validation targets (second element of ``validation_data``).
        epochs: Forwarded to ``model.fit``.
        batch_size: Forwarded to ``model.fit``.
        reduce_lr: Callback forwarded to ``model.fit``. ``None`` means no
            callback is used.
        model: Object exposing a Keras-style ``fit`` method.

    Returns:
        Tuple ``(model, train_time)`` where ``train_time`` is the elapsed
        wall-clock time of the ``fit`` call in seconds.
    """
    # A literal [None] is not a usable callback list, so map None to "no callbacks".
    callbacks = [] if reduce_lr is None else [reduce_lr]

    train_start = time.time()

    model.fit(X_train, Y_train,
              epochs = epochs,
              batch_size = batch_size,
              validation_data = (X_val, Y_val),
              callbacks = callbacks)

    train_time = time.time() - train_start
    print("Training time:", train_time)

    return model, train_time

In [None]:
def test_openset(X_test, model):
    
    test_start = time.time()

    Y_pred = model.predict(X_test)

    test_end = time.time()
    test_time = test_end - test_start
    print("Testing time:", test_time)
    
    pred = []
    for idx, x in enumerate(Y_pred[:100]):
        print(x)
        aux = []
        for i in x:
            if i < 0.1:
                aux.append(False)
            else:
                aux.append(True)
        
        print(aux)
        if True in aux:
            pred.append(np.argmax(np.asarray(x)))
        else:
            pred.append(2)
        
    pred = np.asarray(pred)
    
    return pred, test_time

In [None]:
def test(X_test, model):
    
    test_start = time.time()

    Y_pred = model.predict(X_test)

    test_end = time.time()
    test_time = test_end - test_start
    print("Testing time:", test_time)
    
    pred = np.argmax(Y_pred, axis = 1)
    
    return pred, test_time

In [None]:
def get_scores(Y_test, Y_pred, labels):
    """Compute accuracy plus weighted F1, precision and recall.

    Args:
        Y_test: Ground-truth labels.
        Y_pred: Predicted labels.
        labels: Not used by this function; the scikit-learn scorers are
            called with ``labels = None``.

    Returns:
        Tuple ``(acc, f1, pre, rec)``.
    """
    # All three averaged metrics use support-weighted averaging.
    weighted = {'average': 'weighted'}

    return (
        accuracy_score(Y_test, Y_pred),
        f1_score(Y_test, Y_pred, **weighted),
        precision_score(Y_test, Y_pred, labels = None, pos_label = 1, **weighted),
        recall_score(Y_test, Y_pred, labels = None, pos_label = 1, sample_weight = None, **weighted),
    )

In [None]:
def print_results(learning_rate,
                  epochs,
                  batch_size,
                  X_train,
                  X_val,
                  X_test,
                  opt_time,
                  train_time,
                  test_time,
                  acc,
                  f1,
                  pre,
                  rec,
                  Y_test,
                  Y_pred,
                  model_type,
                  path):
    """Append a formatted experiment summary to the text file at ``path``.

    The summary contains the hyper-parameters, the training/testing set
    sizes, the timings, the four scores and a ``classification_report`` of
    ``Y_test`` against ``Y_pred``.

    Args:
        learning_rate, epochs, batch_size: Hyper-parameters to record.
        X_train, X_test: Only their lengths are written.
        X_val: Accepted for interface compatibility; not used.
        opt_time, train_time, test_time: Timings to record.
        acc, f1, pre, rec: Scores to record.
        Y_test, Y_pred: Passed to ``classification_report``.
        model_type: Name shown in the experiment header.
        path: File to append to (created if it does not exist).
    """
    # Write straight to the file handle instead of swapping sys.stdout: the
    # handle is always closed, and the process-wide stdout is never left
    # redirected if building the report raises.
    with open(path, "a") as report_file:
        print("==== Experiment " + model_type + " ====", file = report_file)
        print("Learning rate:" + str(learning_rate) + " - Epochs:" + str(epochs) + " - Batch size:" + str(batch_size), file = report_file)
        print("Training size:" + str(len(X_train)) + " - Testing size:" + str(len(X_test)), file = report_file)
        print("Optimization time:" + str(opt_time) + " - Training time:" + str(train_time) + " - Testing time:" + str(test_time), file = report_file)
        print("Accuracy:" + str(acc), file = report_file)
        print("F1-score:" + str(f1), file = report_file)
        print("Precision:" + str(pre), file = report_file)
        print("Recall:" + str(rec), file = report_file)
        print(classification_report(Y_test, Y_pred, digits = 5), file = report_file)
        print("=================================================================", file = report_file)

In [None]:
def train_nb_knn(X_train, Y_train, nb_knn_model):
    """Fit ``nb_knn_model`` on the training data and time the fit.

    Args:
        X_train: Training inputs passed to ``fit``.
        Y_train: Training targets passed to ``fit``.
        nb_knn_model: Estimator exposing ``fit(X, y)``.

    Returns:
        Tuple ``(nb_knn_model, train_time)`` where ``train_time`` is the
        elapsed wall-clock time of the ``fit`` call in seconds.
    """
    started = time.time()
    nb_knn_model.fit(X_train, Y_train)
    finished = time.time()

    return nb_knn_model, finished - started

In [None]:
def test_nb_knn(X_test, nb_knn_model):
    
    test_start = time.time()

    Y_pred = nb_knn_model.predict(X_test)

    test_end = time.time()
    test_time = test_end - test_start
    print("Testing time:", test_time)
        
    return Y_pred, test_time