In [1]:
import os

import pandas as pd

from framework.dataset_specification import NamedDatasetSpecifications
from framework.enumerations import EvaluationDatasetSampling
from framework.flow_transformer_binary_classification import FlowTransformer
from framework.flow_transformer_parameters import FlowTransformerParameters
from implementations.classification_heads import *
from implementations.input_encodings import * 
from implementations.pre_processings import StandardPreProcessing
from implementations.transformers.basic_transformers import BasicTransformer
from implementations.transformers.named_transformers import *


2024-10-13 13:31:53.571491: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-13 13:31:53.580494: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-13 13:31:53.589943: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-13 13:31:53.592750: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-13 13:31:53.600490: I tensorflow/core/platform/cpu_feature_guar

In [2]:
# Candidate components for the FlowTransformer pipeline.
# NOTE: later cells select entries from these lists by numeric index, so the
# element order below is significant and must not be changed.

# Input encodings (indices 0-5).
encodings = [
    NoInputEncoder(),
    RecordLevelEmbed(64),
    # One categorical-feature embedding per embed layer type, each built with 16.
    *(
        CategoricalFeatureEmbed(layer_type, 16)
        for layer_type in (
            EmbedLayerType.Dense,
            EmbedLayerType.Lookup,
            EmbedLayerType.Projection,
        )
    ),
    RecordLevelEmbed(64, project=True),
]

# Classification heads (indices 0-5).
classification_heads = [
    LastTokenClassificationHead(),
    FlattenClassificationHead(),
    GlobalAveragePoolingClassificationHead(),
    CLSTokenClassificationHead(),
    # Featurewise embedding head, first without and then with projection.
    *(FeaturewiseEmbedding(project=use_projection) for use_projection in (False, True)),
]

# Sequential (transformer) models (indices 0-3).
transformers = [
    BasicTransformer(2, 128, n_heads=2),
    BasicTransformer(2, 128, n_heads=2, is_decoder=True),
    GPTSmallTransformer(),
    BERTSmallTransformer(),
]

In [3]:
# Directory that holds the flow CSV files.
flow_file_path = "./dataset"

# Each entry is a 5-tuple:
#   (dataset name, CSV path, dataset specification, evaluation fraction, evaluation sampling)
# Every dataset here uses the unified flow format specification and LastRows sampling;
# only the name, the CSV file, and the evaluation fraction vary.
datasets = [
    (
        name,
        os.path.join(flow_file_path, csv_file),
        NamedDatasetSpecifications.unified_flow_format,
        eval_fraction,
        EvaluationDatasetSampling.LastRows,
    )
    for name, csv_file, eval_fraction in (
        ("CSE_CIC_IDS", "NF-CSE-CIC-IDS2018-v2.csv", 0.01),
        ("UNSW_NB15", "NF-UNSW-NB15-v2.csv", 0.025),
        ("UNSW_NB15-2", "NF-UNSW-NB15-v2.csv", 0.1),
    )
]


In [4]:
pre_processing = StandardPreProcessing(n_categorical_levels=32)

# Define the transformer.
# The numeric indices pick components from the lists defined earlier in this notebook:
#   encodings[5]            -> RecordLevelEmbed(64, project=True)
#   transformers[1]         -> BasicTransformer(2, 128, n_heads=2, is_decoder=True)
#   classification_heads[5] -> FeaturewiseEmbedding(project=True)
ft = FlowTransformer(pre_processing=pre_processing,
                     input_encoding=encodings[5],
                     sequential_model=transformers[1],
                     classification_head=classification_heads[5],
                     params=FlowTransformerParameters(window_size=10, mlp_layer_sizes=[128], mlp_dropout=0.1))

# Load the specific dataset.
# datasets[1] is the "UNSW_NB15" entry (evaluation fraction 0.025, LastRows sampling).
cache_folder = './content/cache_folder'
dataset_name, dataset_path, dataset_specification, eval_percent, eval_method = datasets[1]

# Forward the sampling strategy declared in the `datasets` table. Previously `eval_method`
# was unpacked but never passed on, so whatever default the loader uses was silently applied
# regardless of what the table said.
# NOTE(review): `evaluation_dataset_sampling` is the keyword used by the upstream
# FlowTransformer `load_dataset`; confirm this fork keeps the same parameter name.
# The call is deliberately left as the cell's last expression so the returned frame is displayed.
ft.load_dataset(dataset_name, dataset_path, dataset_specification, cache_folder,
                evaluation_dataset_sampling=eval_method,
                evaluation_percent=eval_percent)


Using cache file path: ./content/cache_folder/UNSW_NB15_0_QdLmZHuh8yOmlGcKBEkf7hepImY0_VzQ981ONg0PHPxtLtik6rZN0dGw0.feather
Reading directly from cache ./content/cache_folder/UNSW_NB15_0_QdLmZHuh8yOmlGcKBEkf7hepImY0_VzQ981ONg0PHPxtLtik6rZN0dGw0.feather...


Unnamed: 0,DURATION_OUT,OUT_BYTES,NUM_PKTS_512_TO_1024_BYTES,MIN_IP_PKT_LEN,NUM_PKTS_1024_TO_1514_BYTES,MAX_IP_PKT_LEN,RETRANSMITTED_IN_PKTS,TCP_WIN_MAX_IN,NUM_PKTS_UP_TO_128_BYTES,RETRANSMITTED_IN_BYTES,...,L7_PROTO_23,L7_PROTO_24,L7_PROTO_25,L7_PROTO_26,L7_PROTO_27,L7_PROTO_28,L7_PROTO_29,L7_PROTO_30,L7_PROTO_31,L7_PROTO_32
0,0.0,0.319254,0.000000,0.608608,0.000000,0.565324,0.000000,0.000000,0.326896,0.000000,...,False,False,False,False,False,False,False,False,False,False
1,0.0,0.372881,0.000000,0.608608,0.000000,0.565324,0.074172,0.817813,0.374701,0.261166,...,False,False,False,False,False,False,False,False,False,False
2,0.0,0.401284,0.000000,0.608608,0.000000,0.575948,0.117560,0.831711,0.408620,0.295819,...,False,False,False,False,False,False,False,False,False,False
3,0.0,0.421883,0.000000,0.608608,0.000000,0.598516,0.148344,0.843750,0.434929,0.317712,...,False,False,False,False,False,False,False,False,False,False
4,0.0,0.442193,0.000000,0.608608,0.000000,0.598516,0.172222,0.863869,0.465863,0.334477,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2390270,0.0,0.470805,0.164319,0.608608,0.000000,0.915812,0.148344,0.831711,0.369683,0.358198,...,False,False,False,False,False,False,False,False,False,False
2390271,0.0,0.314377,0.000000,0.659772,0.000000,0.565324,0.000000,0.000000,0.189757,0.000000,...,False,False,False,False,False,False,False,False,False,False
2390272,0.0,0.551102,0.000000,0.608608,0.196811,0.999258,0.074172,0.831711,0.302415,0.342133,...,False,False,False,False,False,False,False,False,False,False
2390273,0.0,0.667044,0.000000,0.608608,0.385009,0.984754,0.208227,0.965882,0.536915,0.381469,...,False,False,False,False,False,False,False,False,False,False


In [5]:
# Build the model from the FlowTransformer pipeline configured above and print its
# layer summary. `m` is kept as a notebook-level name because the training/evaluation
# cell below compiles it and passes it to `ft.evaluate`.
m = ft.build_model()
m.summary()


I0000 00:00:1728797549.316170 1396941 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1728797549.341581 1396941 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1728797549.341697 1396941 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1728797549.343262 1396941 cuda_executor.cc:1015] successful NUMA node read from SysFS ha

In [6]:
# Configure training: Adam optimizer, binary cross-entropy loss, binary accuracy as the
# tracked metric, with JIT compilation requested.
m.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["binary_accuracy"],
    jit_compile=True,
)

# Train and evaluate through the FlowTransformer wrapper; it hands back a 3-tuple that is
# unpacked into the training results, the evaluation results, and the final epoch.
eval_results: pd.DataFrame
train_results, eval_results, final_epoch = ft.evaluate(
    m,
    batch_size=128,
    epochs=5,
    steps_per_epoch=64,
    early_stopping_patience=5,
)

# Show the evaluation results.
print(eval_results)

Attack classes in eval set:
	Exploits
	Generic
	Fuzzers
	Backdoor
	DoS
	Reconnaissance
	Shellcode
	Worms
	Analysis
Building eval dataset...
Splitting dataset to featurewise...
Evaluation dataset is built!

Positive samples in eval set: 2816
Negative samples in eval set: 56931

Computing and print the number of each attack class in the evaluation set...
Attack class Exploits samples in eval set: 1285
Attack class Generic samples in eval set: 143
Attack class Fuzzers samples in eval set: 639
Attack class Backdoor samples in eval set: 19
Attack class DoS samples in eval set: 182
Attack class Reconnaissance samples in eval set: 481
Attack class Shellcode samples in eval set: 57
Attack class Worms samples in eval set: 7
Attack class Analysis samples in eval set: 3


I0000 00:00:1728797630.017233 1397055 service.cc:146] XLA service 0x7d7440003080 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1728797630.017257 1397055 service.cc:154]   StreamExecutor device (0): NVIDIA GeForce RTX 4080, Compute Capability 8.9
2024-10-13 13:33:50.090783: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-10-13 13:33:50.708808: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 8907




I0000 00:00:1728797638.839866 1397055 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch = 0 / 5 (early stop in 5), step = 0, loss = 0.69497, results = [array(0.694968, dtype=float32), array(0.515625, dtype=float32)] -- elapsed (train): 0.00s
Epoch = 0 / 5 (early stop in 5), step = 5, loss = 0.67744, results = [array(0.67744476, dtype=float32), array(0.57161456, dtype=float32)] -- elapsed (train): 2.11s
Epoch = 0 / 5 (early stop in 5), step = 10, loss = 0.63927, results = [array(0.63926977, dtype=float32), array(0.6328125, dtype=float32)] -- elapsed (train): 3.87s
Epoch = 0 / 5 (early stop in 5), step = 15, loss = 0.58121, results = [array(0.581212, dtype=float32), array(0.69189453, dtype=float32)] -- elapsed (train): 5.63s
Epoch = 0 / 5 (early stop in 5), step = 20, loss = 0.50781, results = [array(0.50780743, dtype=float32), array(0.74404764, dtype=float32)] -- elapsed (train): 7.39s
Epoch = 0 / 5 (early stop in 5), step = 25, loss = 0.44611, results = [array(0.44611388, dtype=float32), array(0.77944714, dtype=float32)] -- elapsed (train): 9.16s
Epoch = 0 / 5 (earl