In [None]:
!pip install tensorflow
!pip install numpy
!pip install pandas

In [1]:
#!/usr/bin/env python

__author__ = ["Jason Gardner"]
__credits__ = ["Jason Gardner"]
__license__ = "MIT"
__version__ = "0.0.1"
__maintainer__ = ["Jason Gardner"]
__email__ = ["n01480000@unf.edu"]
__status__ = "Development"

import numpy as np
import pandas as pd
import tensorflow as tf
import argparse
import logging
import os
import sys
import json
import uuid
import pickle
from tensorflow.keras import callbacks
from tensorflow.keras import layers
from tensorflow.keras.metrics import Precision, Recall
import gc


SEED = 42
EPOCHS = 10
METRICS = ["accuracy"]
BATCH_SIZE = 32
SEQUENCE_SIZE = 100
TEST_RATIO= 0.2
LOG_FORMAT_STRING = logging.Formatter("%(asctime)s — %(name)s — %(funcName)s:%(lineno)d — %(message)s")
RMSPROP_CLIP = 10.0
FILENAME = "/data/NF-UQ-NIDS-v2.csv"
TEST_FILENAME = "/data/NF-UQ-NIDS-v2_truncated.csv"
INPUT_SHAPE = (SEQUENCE_SIZE, 43)

np.random.seed(SEED)

class Data:
    def __init__(self, test: bool = False) -> None:
        self.test = test
        self.data = self.load_data()
        self.train_x, self.test_x, self.train_y, self.test_y = self._process_data()
        
    def load_data(self) -> pd.DataFrame:
        if self.test:
            logger.info(f"Loading data from {TEST_FILENAME}")
            data = pd.read_csv(TEST_FILENAME)
        else:
            logger.info(f"Loading data from {FILENAME}")
            data = pd.read_csv(FILENAME)
        
        return data

    def _process_data(self) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        logger.info("Processing data")
        logger.info("Converting IP addresses to integers")
        self.data["IPV4_SRC_ADDR"] = self.data["IPV4_SRC_ADDR"].apply(lambda x: int(x.replace(".", "")))
        self.data["IPV4_DST_ADDR"] = self.data["IPV4_DST_ADDR"].apply(lambda x: int(x.replace(".", "")))
        logger.info("Dropping Attack and Dataset columns")
        self.data.drop("Attack", axis = 1, inplace = True)
        self.data.drop("Dataset", axis = 1, inplace = True)
        train_x, test_x, train_y, test_y = self._split_data(self.data)
        logger.info(f"Data dimensions: {self.data.shape}")
        logger.info(f"Training data dimensions: {train_x.shape}")
        logger.info(f"Testing data dimensions: {test_x.shape}")
        
        return train_x, test_x, train_y, test_y
    
    def _split_data(self, data) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        train_size = int(len(data) * (1 - TEST_RATIO))
        training_data, testing_data = data[:train_size], data[train_size:]
 
        train_x = training_data.drop("Label", axis=1).copy()
        train_y = training_data["Label"].copy()

        test_x = testing_data.drop("Label", axis=1).copy()
        test_y = testing_data["Label"].copy()
        
        self._normalize_data(train_x)
        self._normalize_data(test_x)
        
        train_x = self._pad_data(np.array(train_x), 43)
        train_x = np.array(train_x).reshape(-1, SEQUENCE_SIZE, 43)
        train_y = self._pad_data(np.array(train_y))
        train_y = np.array(train_y).reshape(-1, SEQUENCE_SIZE, 1)
        test_x = self._pad_data(np.array(train_x), 43)
        test_x = np.array(train_x).reshape(-1, SEQUENCE_SIZE, 43)
        test_y = self._pad_data(np.array(train_y))
        test_y = np.array(train_y).reshape(-1, SEQUENCE_SIZE, 1)
        
        logger.info(f"Reshaped train_x shape: {train_x.shape}")
        logger.info(f"Reshaped train_y shape: {train_y.shape}")

        return train_x, test_x, train_y, test_y
    
    def _pad_data(self, data, num_features = None):
        if data.ndim == 1:
            total_rows = data.shape[0]
            rows_needed = SEQUENCE_SIZE - (total_rows % SEQUENCE_SIZE) if total_rows % SEQUENCE_SIZE != 0 else 0
            
            if rows_needed > 0:
                padding = np.zeros(rows_needed, dtype=data.dtype)
                data = np.concatenate([data, padding], axis=0)
        elif data.ndim == 2:
            total_rows = data.shape[0]
            rows_needed = SEQUENCE_SIZE - (total_rows % SEQUENCE_SIZE) if total_rows % SEQUENCE_SIZE != 0 else 0
            
            if rows_needed > 0:
                padding = np.zeros((rows_needed, num_features), dtype=data.dtype)
                data = np.concatenate([data, padding], axis=0)
        
        return data
    
    def _normalize_data(self, data) -> pd.DataFrame:
        logger.info("Normalizing data")
        for column in data.columns:
            mean = data[column].mean()
            std = data[column].std()
            if column != "Label":
                data[column] = (data[column] - mean) / std
        
        return data

    def get_data(self) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        
        return self.train_x, self.test_x, self.train_y, self.test_y
    
###############
###  MODEL  ###
###############

class Model:
    def __init__(self, state : dict = None, size: int = None) -> None:
        self.state = state
        self.size = size
        self.uuid = self.state['uuid']

        if self.state['uuid'] is not None:
            print(f"Loading model from {self.state['uuid']}")
            self.load_model()
        else:
            logger.info("No model UUID provided. Generating new UUID")
            self.state['uuid'] = str(uuid.uuid4())
            self.uuid = self.state['uuid']
            logger.info(f"New UUID: {self.uuid}")
            logger.info(f"Initializing new model")
            self.init_model()

    def init_model(self) -> tf.keras.Model:
        logger.info("Initializing model")
        self.model = tf.keras.models.Sequential([
                        layers.Input(shape=INPUT_SHAPE),
                        layers.Conv1D(filters=32, kernel_size=3, strides=1, activation="relu", padding="same"),
                        layers.MaxPooling1D(pool_size=2, padding="same"),
                        layers.Conv1D(filters=64, kernel_size=3, strides=1, activation="relu", padding="same"),
                        layers.MaxPooling1D(pool_size=2, padding="same"),
                        layers.Conv1D(filters=128, kernel_size=3, strides=1, activation="relu", padding="same"),
                        layers.MaxPooling1D(pool_size=2, padding="same"),
                        layers.LSTM(128, activation="relu", return_sequences=True),
                        layers.LSTM(64, activation="relu"),
                        layers.Dense(1, activation="sigmoid")
                    ])
      
        self.optimizer = tf.keras.optimizers.RMSprop(clipvalue = RMSPROP_CLIP)
        # self.optimizer = tf.keras.optimizers.Adam(lr=1e-3)
        self.loss_function = tf.keras.losses.Huber()
        # self.loss_function = tf.keras.losses.mean_squared_error
        self.model.compile(optimizer = self.optimizer, loss = self.loss_function, metrics = METRICS)
        logger.info("Model initialized")
           
    def save_model(self) -> None:
        logger.info(f"Saving model to './models/{self.uuid}/model.h5'")
        logger.info(f"Creating directory './models/{self.state['uuid']}'")
        results_dir_path = f"./models/{self.state['uuid']}"
        if not os.path.exists(results_dir_path):
            if not os.path.exists('./models'):
                try:
                    os.mkdir('./models')
                except OSError:
                    logger.warning(f"Creation of the directory {'./models'} failed")
                    exit(1)
            try:
                os.mkdir(results_dir_path)
            except OSError:
                logger.warning(f"Creation of the directory {results_dir_path} failed")
                exit(1)
            else:
                logger.info(f"Successfully created the directory {results_dir_path}")

        self.model.save_weights(f'./models/{self.state["uuid"]}/model.weights.h5')
        logger.info(f"Saved model weights to ./models/{self.state['uuid']}/model.weights.h5")

        logger.info(f"Saving Numpy random state to ./models/{self.state['uuid']}/numpy_random_state.pkl")
        with open(f"./models/{self.state['uuid']}/numpy_random_state.pkl", 'wb') as f:
            pickle.dump(np.random.get_state(), f)
        logger.info(f"Saved Numpy random state to ./models/{self.state['uuid']}/random_state.npy")

        with open(f'./models/{self.state["uuid"]}/state.json', 'w') as f:
            logger.info(f"Saving Network with UUID {self.state['uuid']}")
            print(self.state)
            json.dump(self.state, f)
            logger.info(f"Saved state to './models/{self.uuid}/state.json'")

    def load_model(self) -> tf.keras.Model:
        self.model = tf.keras.Model()
        logger.info(f"Loading model from './models/{self.uuid}/model.weights.h5'")
        self.model.load_weights(f'./models/{self.uuid}/model.weights.h5')
        logger.info(f"Loaded model weights from './models/{self.uuid}/model.weights.h5'")
        
        with open(f"./models/{self.state['uuid']}/numpy_random_state.pkl", 'rb') as f:
            random_state = pickle.load(f)
            np.random.set_state(random_state)
        logger.info(f"Loaded Numpy random state from './models/{self.uuid}/numpy_random_state.pkl'")

        if os.path.exists(f'./models/{self.uuid}/state.json'):
            with open(f'./models/{self.uuid}/state.json', 'r') as f:
                logger.info(f"Loading state from './models/{self.uuid}/state.json'")
                self.state = json.load(f)
                logger.info(f"Loading {self.state['network']} ({self.state['uuid']}) network environment.\n Game: {self.state['environment']}")
                logger.info(f"Loaded state from './models/{self.uuid}/state.json'")     

        return self.model

###############
### LOGGING ###
###############
         
class Logging:
    def __init__(self, logger_name: str = '__main__') -> None:
        self.logger = logging.getLogger(logger_name)
        self.logger.setLevel(logging.DEBUG)
        self.logger.addHandler(self.get_console_handler())
        self.logger.propagate = False
        self.logger.info(f"Logging initialized -- {logger_name}")

    def get_console_handler(self) -> logging.StreamHandler:
        console_handler = logging.StreamHandler(sys.stdout)
        console_handler.setFormatter(LOG_FORMAT_STRING)

        return console_handler

    def get_logger(self) -> logging.Logger:

        return self.logger
      
###############
###  AGENT  ###
###############
      
class Agent:
    def __init__(self, state: dict, data : tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]) -> None:
        self.uuid = state['uuid']
        self.optimizer = tf.keras.optimizers.RMSprop(clipvalue = RMSPROP_CLIP)
        self.loss_function = tf.keras.losses.Huber()
        self.train_x, self.test_x, self.train_y, self.test_y = data
        self.M = Model(state = parser.data, size = len(self.train_y))
        self.model = self.M.model
        
    def run(self) -> None:
        print(f"Data: {self.train_x.shape}")
        earlystopping = callbacks.EarlyStopping(monitor = "val_loss",
                                        mode = "min",
                                        patience = 5,
                                        restore_best_weights = True)
        history = self.model.fit(self.train_x, 
                       self.train_y, 
                       epochs = EPOCHS, 
                       batch_size = BATCH_SIZE, 
                       verbose = 1, 
                       validation_data = (self.test_x, self.test_y)
                       , callbacks = [earlystopping])
        self.M.save_model()
        self.model.evaluate(self.test_x, self.test_y)
        self.model.predict(self.test_x)
        hist_df = pd.DataFrame(history.history) 

        hist_csv_file = f"./models/{parser.data['uuid']}/history.csv"
        with open(hist_csv_file, mode='w') as f:
            hist_df.to_csv(f)

##########################
###  ARGUMENT PARSING  ###
##########################

class Parsing:
    def __init__(self):
        self.data = dict()
        self.data['seed'] = 42
        self.data['uuid'] = None

###############
###  MAIN   ###
###############

if __name__ == '__main__':
    gc.collect()
    logger = Logging().get_logger()
    parser = Parsing()
    data = Data().get_data()
    agent = Agent(parser.data, data)
    agent.run()
    logger.info("Training complete")
    exit(0)

2024-04-20 21:09:03.943794: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


2024-04-20 21:09:05,301 — __main__ — __init__:237 — Logging initialized -- __main__
2024-04-20 21:09:05,302 — __main__ — load_data:52 — Loading data from /data/NF-UQ-NIDS-v2.csv
2024-04-20 21:11:48,870 — __main__ — _process_data:58 — Processing data
2024-04-20 21:11:48,870 — __main__ — _process_data:59 — Converting IP addresses to integers
2024-04-20 21:13:15,546 — __main__ — _process_data:62 — Dropping Attack and Dataset columns
2024-04-20 21:13:35,380 — __main__ — _normalize_data:118 — Normalizing data


  sqr = _ensure_numeric((avg - values) ** 2)


2024-04-20 21:13:54,082 — __main__ — _normalize_data:118 — Normalizing data
2024-04-20 21:14:17,254 — __main__ — _split_data:94 — Reshaped train_x shape: (607904, 100, 43)
2024-04-20 21:14:17,255 — __main__ — _split_data:95 — Reshaped train_y shape: (607904, 100, 1)
2024-04-20 21:14:17,255 — __main__ — _process_data:66 — Data dimensions: (75987976, 44)
2024-04-20 21:14:17,256 — __main__ — _process_data:67 — Training data dimensions: (607904, 100, 43)
2024-04-20 21:14:17,256 — __main__ — _process_data:68 — Testing data dimensions: (607904, 100, 43)
2024-04-20 21:14:17,435 — __main__ — __init__:145 — No model UUID provided. Generating new UUID
2024-04-20 21:14:17,436 — __main__ — __init__:148 — New UUID: 2f50e09c-e5fa-4c13-8592-74a2c816937e
2024-04-20 21:14:17,436 — __main__ — __init__:149 — Initializing new model
2024-04-20 21:14:17,437 — __main__ — init_model:153 — Initializing model
2024-04-20 21:14:17,498 — __main__ — init_model:172 — Model initialized


2024-04-20 21:14:17.381460: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-04-20 21:14:17.381799: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-04-20 21:14:17.382076: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

Data: (607904, 100, 43)
Epoch 1/10


2024-04-20 21:14:20.615201: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 10455948800 exceeds 10% of free system memory.


[1m18995/18997[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 16ms/step - accuracy: 0.6689 - loss: 0.1102

2024-04-20 21:19:28.761036: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 10455948800 exceeds 10% of free system memory.


[1m18997/18997[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m426s[0m 22ms/step - accuracy: 0.6689 - loss: 0.1102 - val_accuracy: 0.6688 - val_loss: 0.1100
Epoch 2/10
[1m18997/18997[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m415s[0m 22ms/step - accuracy: 0.6688 - loss: 0.1100 - val_accuracy: 0.6688 - val_loss: 0.1100
Epoch 3/10
[1m18997/18997[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m415s[0m 22ms/step - accuracy: 0.6689 - loss: 0.1099 - val_accuracy: 0.6688 - val_loss: 0.1099
Epoch 4/10
[1m18997/18997[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m415s[0m 22ms/step - accuracy: 0.6689 - loss: 0.1099 - val_accuracy: 0.6688 - val_loss: 0.1099
Epoch 5/10
[1m18997/18997[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m414s[0m 22ms/step - accuracy: 0.6688 - loss: 0.1099 - val_accuracy: 0.6688 - val_loss: 0.1099
Epoch 6/10
[1m18997/18997[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m414s[0m 22ms/step - accuracy: 0.6689 - loss: 0.1099 - val_accuracy: 0.6688 - val_loss: 0.10

2024-04-20 22:23:41.081280: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 10455948800 exceeds 10% of free system memory.


[1m18997/18997[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m121s[0m 6ms/step - accuracy: 0.6688 - loss: 0.1098


2024-04-20 22:25:49.990259: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 10455948800 exceeds 10% of free system memory.


[1m18997/18997[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 6ms/step
2024-04-20 22:28:00,752 — __main__ — <module>:305 — Training complete
