## Se instalan las dependencias usando pip
-----

In [None]:
!pip install keras-tuner -q
!pip install pandas numpy seaborn scikit-learn tensorflow

## Se importan los elementos a usar
-----

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import keras_tuner
from tensorflow import keras
from matplotlib import pyplot as plt
from os import path, mkdir
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from keras import Sequential
from keras.layers import Dense, Dropout, Input, Normalization, BatchNormalization
from keras.optimizers import Adam

## Se definen constantes
-----

In [None]:
# parameters used to load the dataset
DATASET_COLUMNS=['sbytes', 'dbytes', 'smean', 'sload', 'ct_state_ttl', 'sttl', 'dttl', 'rate', 'dur', 'dmean']
DATASET_COLUMN_LABEL_NAME='label'
BASE_PATH = path.join('.', 'datasets', 'unsw-nb15')

# parameters used to split the testing dataset into validation and test subsets
VALIDATION_SIZE=0.2
RANDOM_STATE=20

# parameters used to train the model
NUM_EPOCHS=45

# parameters used for hyperparameter tuning
# number of models (tuner trials) to try
NUM_MODELS_TO_TEST=5
# number of times each model is going to be tried
NUM_TEST_PER_MODEL=2
# objective to maximize / minimize
OBJECTIVE='val_loss'
# path of the root folder that contains the folder where the models are saved
BASE_MODEL_DIRECTORY=path.join('.')
# folder where the model found by keras_tuner is saved
MODEL_DIRECTORY='models'

## Se verifica que exista la carpeta models
-----

In [None]:
from os import makedirs

# Folder that will hold the models: BASE_MODEL_DIRECTORY joined with MODEL_DIRECTORY.
MODEL_DIRECTORY_PATH = path.join(BASE_MODEL_DIRECTORY, MODEL_DIRECTORY)

# Create the folder together with any missing parent folders. exist_ok=True
# makes the call a no-op when the folder is already there, which removes the
# check-then-create race of testing path.exists() before calling mkdir().
# A FileExistsError is still raised if the path exists but is not a directory.
makedirs(MODEL_DIRECTORY_PATH, exist_ok=True)

## Se definen funciones
-----

In [None]:
# based on the feature selection of the paper
# https://www.researchgate.net/publication/320944473_Towards_Developing_Network_forensic_mechanism_for_Botnet_Activities_in_the_IoT_based_on_Machine_Learning_Techniques
def load_dataset(path, columns, label_column_name):
    """Read a CSV file and split it into a feature matrix and a label vector.

    Args:
        path: Location of the CSV file; handed straight to ``pd.read_csv``.
        columns: Names of the columns to keep as features, in output order.
        label_column_name: Name of the column that holds the labels.

    Returns:
        A ``(features, labels)`` tuple of float32 NumPy arrays. ``features``
        has one column per entry of ``columns``.
    """
    frame = pd.read_csv(path)
    # The label column is removed from the frame before the features are
    # selected, then converted to a float32 array.
    label_values = frame.pop(label_column_name).to_numpy(dtype=np.float32)
    # Keep only the requested feature columns, also as float32.
    feature_values = frame[columns].to_numpy(dtype=np.float32)
    return feature_values, label_values

## Se cargan los vectores de características y las correspondientes etiquetas usados para entrenar el modelo
-----

In [None]:
# Load the training split: features and labels read from training.csv under BASE_PATH.
train_features, train_labels = load_dataset(path.join(BASE_PATH, 'training.csv'), DATASET_COLUMNS, DATASET_COLUMN_LABEL_NAME)

In [None]:
# Quick visual check of the loaded training arrays.
print(train_features)
print(train_labels)

## Se cargan los vectores de características y las correspondientes etiquetas usados para probar el modelo
-----

In [None]:
# Load the testing split: features and labels read from testing.csv under BASE_PATH.
testing_features, testing_labels = load_dataset(path.join(BASE_PATH, 'testing.csv'), DATASET_COLUMNS, DATASET_COLUMN_LABEL_NAME)

In [None]:
# Quick visual check of the loaded testing arrays.
print(testing_features)
print(testing_labels)

## Se divide el conjunto de prueba en validación y prueba
-----

In [None]:
# train_test_split returns (first part, second part) for each input array and
# test_size sets the share of the SECOND part, so the validation_* arrays get
# VALIDATION_SIZE of the testing data and the test_* arrays keep the rest.
# RANDOM_STATE fixes the shuffle so the split is reproducible.
test_features, validation_features, test_labels, validation_labels = train_test_split(testing_features, testing_labels, test_size=VALIDATION_SIZE, random_state=RANDOM_STATE)

In [None]:
# Number of rows in the test subset and in the validation subset.
print(len(test_labels))
print(len(validation_labels))

In [None]:
# Quick visual check of the validation arrays.
print(validation_features)
print(validation_labels)

In [None]:
# Quick visual check of the test arrays.
print(test_features)
print(test_labels)

## Ajuste de hiperparámetros para encontrar el modelo con menor val loss
-----

### Se definen funciones para definir los modelos y crear los modelos mediante ajuste de hiperparámetros
-----

In [None]:
# defines the structure of the model
def define_model(input_shape, train_features, num_layers, units_selected_per_layer, activation_per_dense_layer,
                 learning_rate, loss, metrics, dropout_before_first_hidden_layer=False, dropout=False,
                 dropout_rate=None, batch_normalization=False):
    """Build and compile a ``Sequential`` network with a single sigmoid output.

    Layer order: input, a ``Normalization`` layer adapted to
    ``train_features``, an optional dropout on the normalized input,
    ``num_layers`` hidden ``Dense`` layers (each optionally followed by
    ``BatchNormalization``) and a one-unit sigmoid ``Dense`` output. When
    ``dropout`` is enabled, a ``Dropout`` layer is inserted before every hidden
    layer except the first one and before the output layer.

    Args:
        input_shape: Shape passed to the ``Input`` layer.
        train_features: Data the ``Normalization`` layer is adapted to.
        num_layers: Number of hidden ``Dense`` layers.
        units_selected_per_layer: Units of each hidden layer, indexed from 0;
            must hold at least ``num_layers`` entries.
        activation_per_dense_layer: Activation used by every hidden layer.
        learning_rate: Learning rate given to the ``Adam`` optimizer.
        loss: Loss passed to ``model.compile``.
        metrics: Metrics passed to ``model.compile``.
        dropout_before_first_hidden_layer: Add dropout right after the
            normalization layer.
        dropout: Add dropout between hidden layers and before the output.
        dropout_rate: Rate shared by every ``Dropout`` layer; required when
            either dropout flag is enabled.
        batch_normalization: Add ``BatchNormalization`` after each hidden layer.

    Returns:
        The compiled model.

    Raises:
        ValueError: If a dropout flag is enabled but ``dropout_rate`` is None.
    """
    # Fail fast with a clear message instead of letting Dropout(rate=None)
    # blow up in the middle of model construction.
    if (dropout or dropout_before_first_hidden_layer) and dropout_rate is None:
        raise ValueError(
            "dropout_rate must be provided when dropout or "
            "dropout_before_first_hidden_layer is enabled"
        )

    # Preprocessing: the normalization layer is adapted to the training features
    normalization = Normalization()
    normalization.adapt(train_features)

    # Model definition
    model = keras.Sequential()
    model.add(Input(shape=input_shape))
    model.add(normalization)
    if dropout_before_first_hidden_layer:
        model.add(Dropout(rate=dropout_rate))
    for layer in range(num_layers):
        # the first hidden layer gets no dropout from this flag; the input
        # dropout above is controlled separately
        if dropout and layer != 0:
            model.add(Dropout(rate=dropout_rate))
        model.add(Dense(units=units_selected_per_layer[layer], activation=activation_per_dense_layer))
        if batch_normalization:
            model.add(BatchNormalization())
    if dropout:
        model.add(Dropout(rate=dropout_rate))
    model.add(Dense(units=1, activation='sigmoid'))

    # Compilation settings
    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss=loss,
        metrics=metrics
    )

    return model

# creates the model
def create_build_model(input_shape, train_features):
    """Return the ``build_model(hp)`` function used as the tuner hypermodel.

    Args:
        input_shape: Shape forwarded to ``define_model`` for the input layer.
        train_features: Training features forwarded to ``define_model``.

    Returns:
        A function that takes a hyperparameter container ``hp``, samples the
        architecture and compilation settings from it and returns the model
        built by ``define_model``.
    """
    def build_model(hp):
        # bounds for the number of hidden layers
        NUM_MIN_LAYERS=1
        NUM_MAX_LAYERS=3

        units_selected_per_hidden_layer = []
        activation_per_dense_layer = hp.Choice("activation", ['relu', 'elu', 'leaky_relu'])
        num_hidden_layers = hp.Choice('num_layers', list(range(NUM_MIN_LAYERS, NUM_MAX_LAYERS + 1)))
        dropout_before_first_hidden_layer = hp.Boolean("dropout_before_first_layer")
        dropout = hp.Boolean("dropout")
        # The rate is used by BOTH dropout flags, so it is sampled
        # unconditionally. Sampling it only under
        # conditional_scope("dropout", [True]) left it as None whenever
        # "dropout" was False, and a trial with input dropout enabled then
        # tried to build Dropout(rate=None).
        dropout_rate = hp.Choice("dropout_rate", [0.25, 0.3, 0.35, 0.4, 0.45, 0.5])
        batch_normalization = hp.Boolean("batch_normalization")
        # if neither dropout nor batch normalization was selected, at least
        # the batch normalization layers are used
        batch_normalization = batch_normalization or not dropout
        # candidate unit counts for each hidden layer, keyed by layer position
        units_per_layers = {
            1: [1024, 512, 256, 128, 64, 32],
            2: [512, 256, 128, 64, 32, 16],
            3: [256, 128, 64, 16, 8]
        }

        # layers are numbered from 1 so they match the keys of units_per_layers
        for layer in range(1, num_hidden_layers + 1):
            # the units of layer k only matter when the model has at least k layers
            with hp.conditional_scope("num_layers", list(range(layer, NUM_MAX_LAYERS + 1))):
                units_in_this_layer = units_per_layers[layer]
                units_selected_per_hidden_layer.append(hp.Choice(f"units_hidden_layer_{layer}", units_in_this_layer))

        # compilation settings
        learning_rate = hp.Float("lr", min_value=0.00001, max_value=0.1, sampling="log")
        loss=hp.Choice("loss", ['binary_crossentropy', 'binary_focal_crossentropy'])
        metrics=['accuracy']

        model = define_model(input_shape, train_features, num_hidden_layers, units_selected_per_hidden_layer,
                             activation_per_dense_layer, learning_rate, loss, metrics,
                             dropout_before_first_hidden_layer=dropout_before_first_hidden_layer,dropout=dropout,
                             dropout_rate=dropout_rate, batch_normalization=batch_normalization)
        model.summary()

        return model
    return build_model

### Se configura la estrategia de búsqueda de hiperparámetros
-----

In [None]:
# The model input is a flat vector with one entry per feature column.
num_features = train_features.shape[1]
input_shape = (num_features,)

# Bayesian-optimization tuner configured from the notebook constants.
tuner = keras_tuner.BayesianOptimization(
    hypermodel=create_build_model(input_shape, train_features),
    objective=OBJECTIVE,
    max_trials=NUM_MODELS_TO_TEST,
    executions_per_trial=NUM_TEST_PER_MODEL,
    # NOTE(review): overwrite=True presumably discards results of a previous
    # run stored under directory/project_name — confirm before relying on them
    overwrite=True,
    directory=BASE_MODEL_DIRECTORY,
    project_name=MODEL_DIRECTORY
)

In [None]:
# Print the tuner's summary of the hyperparameter search space.
tuner.search_space_summary()

### Se realiza la búsqueda de los hiperparámetros
-----

In [None]:
# Run the search: candidates are fitted on the training split for NUM_EPOCHS
# epochs, with the validation split passed as validation_data.
tuner.search(train_features, train_labels, epochs=NUM_EPOCHS, validation_data=(validation_features, validation_labels), shuffle=True)

## Resumen del ajuste de hiperparámetros
-----

In [None]:
# Print the tuner's summary of the search results.
tuner.results_summary()

## Obtener el mejor modelo
-----

In [None]:
# Ask for a single model; get_best_models returns a list, so keep its first element.
models = tuner.get_best_models(num_models=1)
best_model = models[0]

## Arquitectura del modelo
-----

In [None]:
# Print the layer-by-layer summary of the selected model.
best_model.summary()

## Se obtiene nuevamente el conjunto de pruebas completo a partir del conjunto de datos de validación y prueba
-----

In [None]:
# Rebuild a single test feature array by appending the validation rows after the test rows.
test_features = np.concatenate((test_features, validation_features))

In [None]:
# Append the validation labels after the test labels (test first, then validation).
test_labels = np.concatenate((test_labels, validation_labels))

## Se realiza la predicción
-----

In [None]:
# Model outputs for every row of the recombined test set.
predict = best_model.predict(test_features)

In [None]:
# Show the raw model outputs before thresholding.
print(predict)

## Se etiquetan las probabilidades con las etiquetas de la clase a la que pertenecen, utilizando un umbral
-----

In [None]:
# Turn probabilities into class labels: 1.0 when the probability reaches the
# 0.5 threshold, 0.0 otherwise. The comparison runs on the whole array at once
# instead of looping over the values in Python.
predict_labels = (predict.flatten() >= 0.5).astype(np.float32)

In [None]:
# Number of thresholded predictions.
print(len(predict_labels))

## Se obtienen las métricas de clasificación
-----

In [None]:
# Print scikit-learn's classification report: true labels first, predicted labels second.
print(classification_report(test_labels, predict_labels))

## Se obtienen las matrices de confusión
-----

In [None]:
# Confusion matrix of the true labels (first argument) against the predicted labels (second argument).
result = confusion_matrix(test_labels, predict_labels)
print(result)

### Matriz de confusión que muestra la cantidad de datos clasificados
-----

In [None]:
# Tick labels for the confusion-matrix plots, listed in matrix index order.
# NOTE(review): assumes label 0 is normal traffic and label 1 is attack traffic — confirm against the dataset.
confusion_matrix_labels = ['Tráfico de red normal', 'Tráfico de red bajo ataque']

In [None]:
# confusion_matrix() puts the true classes on the rows and the predicted
# classes on the columns, and heatmap() draws rows along the y axis, so the
# y axis is the real value and the x axis is the predicted value.
# fmt='d' prints the counts as plain integers instead of scientific notation.
matrix = sns.heatmap(result, annot=True, fmt='d', xticklabels=confusion_matrix_labels, yticklabels=confusion_matrix_labels)
_ = matrix.set(xlabel="Valor predicho", ylabel="Valor real")

### Matriz de confusión que muestra el porcentaje de datos clasificados por filas
-----

In [None]:
# Normalize each row (true class) by its total. keepdims=True keeps the row
# sums as a column vector, so the division broadcasts without hard-coding the
# number of classes.
porcentaje = result / np.sum(result, axis=1, keepdims=True)
# Rows are the real classes (y axis) and columns the predicted ones (x axis).
# fmt='.2%' prints the fractions as percentages.
matrix = sns.heatmap(porcentaje, annot=True, fmt='.2%', xticklabels=confusion_matrix_labels, yticklabels=confusion_matrix_labels)
_ = matrix.set(xlabel="Valor predicho %", ylabel="Valor real %")

## Comparación equiparable a la realizada en el paper https://www.researchgate.net/publication/320944473_Towards_Developing_Network_forensic_mechanism_for_Botnet_Activities_in_the_IoT_based_on_Machine_Learning_Techniques
-----

### Se obtiene el conjunto de datos total al combinar los datasets de entrenamiento, validación y prueba
-----

In [None]:
# Stack the training rows and the test rows into a single feature array (training first).
paper_comparison_features = np.concatenate((train_features, test_features))

In [None]:
# Stack the training labels and the test labels into a single array (training first).
paper_comparison_labels = np.concatenate((train_labels, test_labels))

### Se realiza la predicción
-----

In [None]:
# Model outputs for every row of the combined dataset.
# NOTE(review): this dataset includes the rows the model was trained on.
paper_comparison_predict = best_model.predict(paper_comparison_features)

In [None]:
# Show the raw model outputs before thresholding.
print(paper_comparison_predict)

### Se etiquetan las probabilidades con las etiquetas de la clase a la que pertenecen, utilizando un umbral
-----

In [None]:
# Turn probabilities into class labels: 1.0 when the probability reaches the
# 0.5 threshold, 0.0 otherwise. The comparison runs on the whole array at once
# instead of looping over the values in Python.
paper_comparison_predict_labels = (paper_comparison_predict.flatten() >= 0.5).astype(np.float32)

In [None]:
# Number of thresholded predictions.
print(len(paper_comparison_predict_labels))

### Se obtienen las métricas de clasificación
-----

In [None]:
# Print scikit-learn's classification report: true labels first, predicted labels second.
print(classification_report(paper_comparison_labels, paper_comparison_predict_labels))

### Se obtienen las matrices de confusión
-----

In [None]:
# Confusion matrix of the true labels (first argument) against the predicted labels (second argument).
paper_comparison_result = confusion_matrix(paper_comparison_labels, paper_comparison_predict_labels)
print(paper_comparison_result)

#### Matriz de confusión que muestra la cantidad de datos clasificados
-----

In [None]:
# Tick labels for the confusion-matrix plots, listed in matrix index order.
# NOTE(review): assumes label 0 is normal traffic and label 1 is attack traffic — confirm against the dataset.
confusion_matrix_labels = ['Tráfico de red normal', 'Tráfico de red bajo ataque']

In [None]:
# confusion_matrix() puts the true classes on the rows and the predicted
# classes on the columns, and heatmap() draws rows along the y axis, so the
# y axis is the real value and the x axis is the predicted value.
# fmt='d' prints the counts as plain integers instead of scientific notation.
paper_comparison_matrix = sns.heatmap(paper_comparison_result, annot=True, fmt='d', xticklabels=confusion_matrix_labels, yticklabels=confusion_matrix_labels)
_ = paper_comparison_matrix.set(xlabel="Valor predicho", ylabel="Valor real")

#### Matriz de confusión que muestra el porcentaje de datos clasificados por filas
-----

In [None]:
# Normalize each row (true class) by its total. keepdims=True keeps the row
# sums as a column vector, so the division broadcasts without hard-coding the
# number of classes.
paper_comparison_porcentaje = paper_comparison_result / np.sum(paper_comparison_result, axis=1, keepdims=True)
# Rows are the real classes (y axis) and columns the predicted ones (x axis).
# fmt='.2%' prints the fractions as percentages.
paper_comparison_matrix = sns.heatmap(paper_comparison_porcentaje, annot=True, fmt='.2%', xticklabels=confusion_matrix_labels, yticklabels=confusion_matrix_labels)
_ = paper_comparison_matrix.set(xlabel="Valor predicho %", ylabel="Valor real %")