In [1]:
!pip3 install pandas
!pip3 install --upgrade tensorflow-gpu

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import numpy as np
import pandas as pd
import sys
import glob
import sys
import time

import tensorflow as tf
from tensorflow.keras.losses import mse
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Lambda, Input, Dense
from tensorflow.keras import backend as K
from tensorflow.keras import optimizers

from sklearn.metrics import classification_report, accuracy_score, f1_score, recall_score, precision_score

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m[33m
[0mDefaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m[33m
[0m

In [2]:
def get_number_features(df, number_features):
    """Keep only the columns associated with a preset feature count.

    Columns are chosen by substring match on their names:

    * 92 -> names containing 'L5', 'L3', 'L1' or 'L=0.1'
    * 69 -> names containing 'L5', 'L3' or 'L1'
    * 46 -> names containing 'L5' or 'L3'
    * 23 -> names containing 'L5'

    The number of columns actually selected is not checked against
    ``number_features``; the value only picks which substrings are used.

    Args:
        df: DataFrame whose columns are filtered.
        number_features: One of the presets above. Any other value
            disables filtering.

    Returns:
        A new DataFrame restricted to the matching columns (original
        column order kept), or ``df`` itself for an unknown preset.
    """
    column_names = list(df.columns)

    # Pick the substrings that identify the wanted columns.
    if number_features == 92:
        markers = ('L5', 'L3', 'L1', 'L=0.1')
    elif number_features == 69:
        markers = ('L5', 'L3', 'L1')
    elif number_features == 46:
        markers = ('L5', 'L3')
    elif number_features == 23:
        markers = ('L5',)
    else:
        return df

    selected = [
        name for name in column_names
        if any(name.find(marker) != -1 for marker in markers)
    ]
    return pd.DataFrame(df, columns = selected)

In [3]:
def get_files(path, number_features):
    """Load every CSV file in ``path`` into one DataFrame and filter its columns.

    Files are read in sorted path order so that the row order of the
    result is reproducible across runs and machines (``glob.glob`` alone
    returns paths in arbitrary order).

    Args:
        path: Directory that contains the ``*.csv`` files.
        number_features: Feature-count preset forwarded to
            ``get_number_features``.

    Returns:
        The row-wise concatenation of all files (index reset), reduced
        to the columns selected by ``get_number_features``.

    Raises:
        ValueError: If ``path`` contains no ``*.csv`` file.
    """
    csv_paths = sorted(glob.glob(os.path.join(path, '*.csv')))
    if not csv_paths:
        # pd.concat would also raise ValueError here, but with a message
        # that does not say which directory was empty.
        raise ValueError("No CSV files found in '" + str(path) + "'")

    frames = [
        pd.read_csv(csv_path, index_col = None, encoding = 'utf-8', sep = ',', low_memory = False)
        for csv_path in csv_paths
    ]

    df = pd.concat(frames, axis = 0, ignore_index = True)
    return get_number_features(df, number_features)

In [4]:
def get_files_botiot(path):
    """Load the CSV files in ``path`` and split them into benign and attack rows.

    Files are read in sorted path order so the row order is reproducible
    (``glob.glob`` alone returns paths in arbitrary order). Identifier
    and label-detail columns are dropped, and the textual ``proto`` and
    ``state`` columns are replaced by integer codes.

    Args:
        path: Directory that contains the ``*.csv`` files.

    Returns:
        Tuple ``(df_benign, df_attack)``: rows with ``attack == 0`` and
        rows with ``attack == 1`` respectively, both without the
        ``attack`` column.

    Raises:
        ValueError: If ``path`` contains no ``*.csv`` file.
    """
    csv_paths = sorted(glob.glob(os.path.join(path, '*.csv')))
    if not csv_paths:
        # pd.concat would also raise ValueError here, but with a message
        # that does not say which directory was empty.
        raise ValueError("No CSV files found in '" + str(path) + "'")

    frames = [
        pd.read_csv(csv_path, index_col = None, encoding = 'utf-8', sep = ',', low_memory = False)
        for csv_path in csv_paths
    ]
    df = pd.concat(frames, axis = 0, ignore_index = True)

    # Drop irrelevant information
    df = df.drop(columns=['pkSeqID', 'stime', 'flgs', 'flgs_number', 'saddr', 'sport', 'daddr', 'dport', 'subcategory', 'category'])

    # Categorical to numerical. Labels that are not listed in a mapping
    # are turned into NaN by Series.map.
    df['proto'] = df['proto'].map({'tcp': 1, 'arp': 2, 'udp': 3, 'icmp': 4, 'ipv6-icmp': 5})
    df['state'] = df['state'].map({'REQ': 1, 'RST': 2, 'ACC': 3, 'CON': 4, 'INT': 5, 'URP': 6, 'FIN': 7, 'NRS': 8, 'ECO': 9, 'TST': 10, 'MAS': 11})

    # Get only benign data
    df_benign = df.query('attack == 0').drop(columns=['attack'])

    # Get only the attacks
    df_attack = df.query('attack == 1').drop(columns=['attack'])

    return df_benign, df_attack

In [5]:
# Reparameterization trick
# Sample the normally distributed z = mean + sigma * epsilon. The epsilon ensures the continuity of the latent space and
# lets the network keep correcting its parameters through backpropagation

def reparametrization(args):
    """Draw a latent sample as ``mean + sigma * epsilon``.

    Args:
        args: Pair ``(z_mean, z_var)`` of tensors. ``z_var`` is used as
            a log-variance: the noise is scaled by ``exp(0.5 * z_var)``.

    Returns:
        Tensor ``z_mean + exp(0.5 * z_var) * epsilon`` where ``epsilon``
        comes from ``K.random_normal`` with default parameters.
    """
    mean, log_var = args

    # Dynamic batch size, static latent dimension.
    noise_shape = (K.shape(mean)[0], K.int_shape(mean)[1])
    noise = K.random_normal(shape = noise_shape)

    sigma = K.exp(0.5 * log_var)
    return mean + sigma * noise

In [6]:
# Get error term
# Calculates the error between the original vector and the predicted one

def get_error_term(v1, v2, _rmse = True):
    """Per-row reconstruction error between two equally shaped 2-D inputs.

    Args:
        v1: First set of vectors, one per row.
        v2: Second set of vectors, same shape as ``v1``.
        _rmse: When true (default) return the root mean squared error of
            each row; otherwise return the mean absolute error.

    Returns:
        One error value per row (reduction over ``axis = 1``).
    """
    residual = v1 - v2

    if not _rmse:
        return np.mean(np.abs(residual), axis = 1)

    return np.sqrt(np.mean(np.square(residual), axis = 1))

In [7]:
# Encoder Model
# The encoder learns a function that takes an input array of size n and generates two vectors that represent the
# parameters (mean and log-variance) of the distribution from which the latent vector is sampled.

# encoder(input_vector[]) => latent_v_mu[], latent_v_lvar[]
# So that - latent_v[0] ~ N(latent_v_mu[0], exp(latent_v_lvar[0]))
# and latent_v[1] ~ N(latent_v_mu[1], exp(latent_v_lvar[1]))

def vae_encoder(input_shape, intermediate_dim, latent_dim, reparametrization):
    """Build the encoder half of the VAE.

    The network is: input -> Dense(intermediate_dim, relu) -> two
    parallel Dense(latent_dim) heads named 'z_mean' and 'z_var' ->
    Lambda layer 'z' that applies ``reparametrization`` to both heads.

    Args:
        input_shape: Shape passed to the Keras ``Input`` layer.
        intermediate_dim: Number of units in the hidden layer.
        latent_dim: Size of the latent vector.
        reparametrization: Callable taking ``[z_mean, z_var]`` and
            returning the sampled latent tensor.

    Returns:
        Tuple ``(inputs, encoder, z_var, z_mean)``: the input tensor,
        the Keras model mapping the input to the sampled latent vector,
        and the two head tensors (note the order: z_var before z_mean).
    """
    encoder_input = Input(shape = input_shape, name = 'encoder_input')
    hidden = Dense(intermediate_dim, activation = 'relu')(encoder_input)

    # Two heads on top of the same hidden representation.
    mean_head = Dense(latent_dim, name = 'z_mean')(hidden)
    var_head = Dense(latent_dim, name = 'z_var')(hidden)

    sampled = Lambda(reparametrization, output_shape = (latent_dim,), name = 'z')([mean_head, var_head])

    encoder_model = Model(encoder_input, sampled, name = 'encoder')
    return encoder_input, encoder_model, var_head, mean_head

In [8]:
# Decoder model
# Transforms the latent feature space composed by distributions of mean and variance back to the original input vector

def vae_decoder(intermediate_dim, latent_dim, original_dim):
    """Build the decoder half of the VAE.

    The network is: latent input 'z_sampling' ->
    Dense(intermediate_dim, relu) -> Dense(original_dim, sigmoid).

    Args:
        intermediate_dim: Number of units in the hidden layer.
        latent_dim: Size of the latent vector fed to the decoder.
        original_dim: Size of the reconstructed output vector.

    Returns:
        Keras model named 'decoder' mapping a latent vector to its
        reconstruction.
    """
    z_input = Input(shape = (latent_dim,), name = 'z_sampling')
    hidden = Dense(intermediate_dim, activation = 'relu')(z_input)
    reconstruction = Dense(original_dim, activation = 'sigmoid')(hidden)

    return Model(z_input, reconstruction, name = 'decoder')

In [9]:
def fit_model(X_train, inputs, outputs, vae_loss, learning_rate, epochs, batch_size):
    """Assemble, compile and train the full autoencoder.

    Args:
        X_train: Training data; used both as input and as target.
        inputs: Input tensor of the model.
        outputs: Output tensor of the model.
        vae_loss: Loss passed to ``Model.compile``.
        learning_rate: Learning rate of the Adam optimizer.
        epochs: Number of training epochs.
        batch_size: Mini-batch size.

    Returns:
        The trained Keras model (named 'vae_mlp').
    """
    vae = Model(inputs, outputs, name = 'vae_mlp')
    vae.compile(
        optimizer = optimizers.Adam(learning_rate = learning_rate, clipvalue = 0.5),
        loss = vae_loss,
    )

    # The autoencoder reconstructs its own input, hence X_train twice.
    vae.fit(
        X_train,
        X_train,
        shuffle = True,
        epochs = epochs,
        batch_size = batch_size,
        verbose = 1,
    )

    return vae

In [10]:
def train(X_train, inputs, outputs, vae_loss, learning_rate, epochs, batch_size):
    """Train the model through ``fit_model`` and measure how long it takes.

    The duration is measured with ``time.perf_counter()``, which, unlike
    ``time.time()``, is not affected by system clock adjustments.

    Args:
        X_train: Training data.
        inputs: Input tensor of the model.
        outputs: Output tensor of the model.
        vae_loss: Loss used to compile the model.
        learning_rate: Optimizer learning rate.
        epochs: Number of training epochs.
        batch_size: Mini-batch size.

    Returns:
        Tuple ``(model, train_time)`` with the trained model and the
        elapsed training time in seconds.
    """
    train_start = time.perf_counter()

    model = fit_model(X_train, inputs, outputs, vae_loss, learning_rate, epochs, batch_size)

    train_time = time.perf_counter() - train_start
    print("Training time:", train_time)

    return model, train_time

In [11]:
def test(X_test, model):
    
    test_start = time.time()

    X_pred = model.predict(X_test)

    test_end = time.time()
    test_time = test_end - test_start
    print("Testing time:", test_time)
    
    return X_pred, test_time

In [12]:
# Get the anomaly threshold based on the error term between the predicted train set and the real one

def get_anomaly_threshold(X_train, model, quantile = 0.99):
    """Derive the anomaly threshold from the training reconstruction error.

    The training set is reconstructed with ``model.predict``, the mean
    absolute error of every row is computed with ``get_error_term`` and
    the requested quantile of those errors is returned.

    Args:
        X_train: Training data the model was fitted on.
        model: Object exposing a ``predict`` method.
        quantile: Quantile of the per-row error used as threshold, as
            accepted by ``np.quantile``. Defaults to 0.99.

    Returns:
        The threshold as returned by ``np.quantile``.
    """
    X_pred = model.predict(X_train)
    error_vector = get_error_term(X_pred, X_train, _rmse = False)

    return np.quantile(error_vector, quantile)

In [13]:
# If the error of the vector is higher than the defined threshold it detects an attack, generating the prediction vector

def get_prediction(Y_test, X_pred, X_test, anomaly_threshold, model):
    """Turn reconstruction errors into binary attack predictions.

    A row is predicted as an attack (1) when its mean absolute
    reconstruction error is strictly greater than ``anomaly_threshold``,
    otherwise as benign (0).

    Args:
        Y_test: True labels; returned cast to ``int``.
        X_pred: Reconstruction of ``X_test``.
        X_test: Original test data.
        anomaly_threshold: Error value above which a row is flagged.
        model: Not used by this function.

    Returns:
        Tuple ``(Y_test, Y_pred)``, both cast to ``int``.
    """
    row_errors = get_error_term(X_pred, X_test, _rmse = False)
    flagged = (row_errors > anomaly_threshold).astype(int)

    return Y_test.astype(int), flagged

In [14]:
def get_scores(Y_test, Y_pred):
    """Compute the classification metrics of a prediction.

    Args:
        Y_test: True labels.
        Y_pred: Predicted labels.

    Returns:
        Tuple ``(accuracy, f1, precision, recall)``.
    """
    metrics = (accuracy_score, f1_score, precision_score, recall_score)

    return tuple(metric(Y_test, Y_pred) for metric in metrics)

In [15]:
def print_results(number_features,
                  learning_rate,
                  epochs,
                  batch_size,
                  anomaly_threshold,
                  X_train,
                  X_test,
                  opt_time,
                  train_time,
                  test_time,
                  acc,
                  f1,
                  pre,
                  rec,
                  Y_test,
                  Y_pred,
                  path):
    """Append a plain-text summary of one experiment to the file at ``path``.

    The report is written through a file handle managed by a ``with``
    block instead of temporarily replacing ``sys.stdout``. The handle is
    therefore always closed, and an error while formatting the report
    can no longer leave ``sys.stdout`` pointing at the report file.
    All lines are built before the file is opened, so a formatting
    error does not leave a partial report behind.

    Args:
        number_features: Number of features used in the experiment.
        learning_rate: Optimizer learning rate.
        epochs: Number of training epochs.
        batch_size: Mini-batch size.
        anomaly_threshold: Threshold used to flag anomalies.
        X_train: Training data; only its length is reported.
        X_test: Testing data; only its length is reported.
        opt_time: Optimization time.
        train_time: Training time.
        test_time: Testing time.
        acc: Accuracy.
        f1: F1-score.
        pre: Precision.
        rec: Recall.
        Y_test: True labels, passed to ``classification_report``.
        Y_pred: Predicted labels, passed to ``classification_report``.
        path: File the report is appended to.
    """
    report_lines = [
        "==== Experiment with " + str(number_features) + " features ====",
        "Learning rate:" + str(learning_rate) + " - Epochs:" + str(epochs) + " - Batch size:" + str(batch_size) + " - Anomaly threshold:" + str(anomaly_threshold),
        "Training size:" + str(len(X_train)) + " - Testing size:" + str(len(X_test)),
        "Optimization time:" + str(opt_time) + " - Training time:" + str(train_time) + " - Testing time:" + str(test_time),
        "Accuracy:" + str(acc),
        "F1-score:" + str(f1),
        "Precision:" + str(pre),
        "Recall:" + str(rec),
        classification_report(Y_test, Y_pred, digits = 5),
        "=================================================================",
    ]

    with open(path, "a") as report_file:
        for line in report_lines:
            print(line, file = report_file)