**DSCI 565: Semester Project**

Reference Paper:

FlowTransformer: A Transformer Framework for Flow-based Network Intrusion Detection Systems

Dataset:

Towards a Standard Feature Set for Network Intrusion Detection System Datasets (NetFlow-v2)

https://staff.itee.uq.edu.au/marius/NIDS_datasets/

NF-UNSW-NB15-v2, NF-CSE-CIC-IDS2018-v2

Keyword: REQ

In [None]:
# Mount Google Drive so files under /content/drive are reachable from this notebook.
from google.colab import drive
drive.mount('/content/drive')

import os, sys
# framework library
# https://github.com/liamdm/FlowTransformer
# Root of the FlowTransformer folder on Drive. The trailing slash matters:
# later cells build paths by plain string concatenation (path + "data/").
path = "/content/drive/MyDrive/FlowTransformer/" # /FlowTransformer/ folder

# Add the folder to the import search path once, so re-running the cell
# does not append duplicates; the `framework.*` imports below rely on it.
if path not in sys.path:
    sys.path.append(path)

import warnings
warnings.filterwarnings("ignore")
from typing import Tuple, List, Dict, Any, Optional
from enum import Enum

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder

try:
    from tensorflow._api.v2.v2 import keras
except ImportError:
    from tensorflow import keras

import tensorflow as tf
import keras.layers as layers
from keras.layers import Embedding, Dense, Layer, MultiHeadAttention, Dropout, LayerNormalization, Conv1D, Concatenate, Reshape, Flatten, Lambda, GlobalAveragePooling1D

# FlowTransformer framework
from framework.base_preprocessing import BasePreProcessing
from framework.enumerations import CategoricalFormat
from framework.base_input_encoding import BaseInputEncoding
from framework.base_classification_head import BaseClassificationHead
from framework.base_sequential import BaseSequential

Mounted at /content/drive


In [None]:
# Directory holding the datasets, built from the Drive root defined in the first cell.
data_path = path + "data/"
# File name of the NetFlow v2 feature list stored under data_path.
feature = "NetFlow_v2_Features.csv"
# Dataset CSVs relative to data_path: index 0 is NF-CSE-CIC-IDS2018-v2, index 1 is NF-UNSW-NB15-v2.
datasets = ["NF-CSE-CIC-IDS2018-v2/NF-CSE-CIC-IDS2018-v2.csv", "NF-UNSW-NB15-v2/NF-UNSW-NB15-v2.csv"]

!wc -l drive/MyDrive/FlowTransformer/data/NetFlow_v2_Features.csv  # /FlowTransformer/ folder
!wc -l drive/MyDrive/FlowTransformer/data/NF-CSE-CIC-IDS2018-v2/NF-CSE-CIC-IDS2018-v2.csv  # /FlowTransformer/ folder
!wc -l drive/MyDrive/FlowTransformer/data/NF-UNSW-NB15-v2/NF-UNSW-NB15-v2.csv  # /FlowTransformer/ folder

44 drive/MyDrive/FlowTransformer/data/NetFlow_v2_Features.csv
18893709 drive/MyDrive/FlowTransformer/data/NF-CSE-CIC-IDS2018-v2/NF-CSE-CIC-IDS2018-v2.csv
2390276 drive/MyDrive/FlowTransformer/data/NF-UNSW-NB15-v2/NF-UNSW-NB15-v2.csv


**Implementation**

NetFlow Collector & Pre-processing

In [None]:
# NetFlow Collector
# REQ

# Pre-processing
# https://github.com/liamdm/FlowTransformer/blob/master/implementations/pre_processings.py
class StandardPreProcessing(BasePreProcessing):
    """Log-scaled min/max normalisation for numerical columns and
    most-frequent-level encoding for categorical columns.

    Call the ``fit_*`` method for a column before the matching ``transform_*``
    method; the fitted statistics are stored per column name.
    """

    def __init__(self, n_categorical_levels: int, clip_numerical_values: bool = False):
        """
        :param n_categorical_levels: maximum number of distinct levels retained
            per categorical column (the most frequent ones are kept).
        :param clip_numerical_values: when True, transformed numerical values
            are clipped to the range [0, 1].
        """
        super().__init__()
        self.n_categorical_levels: int = n_categorical_levels
        self.clip_numerical_values: bool = clip_numerical_values
        # column name -> (minimum, range) recorded by fit_numerical
        self.min_range = {}
        # column name -> retained levels, most frequent first, recorded by fit_categorical
        self.encoded_levels = {}

    @property
    def name(self) -> str:
        return "Standard Preprocessing"

    @property
    def parameters(self) -> dict:
        return {
            "n_categorical_levels": self.n_categorical_levels,
            "clip_numerical_values": self.clip_numerical_values
        }

    def fit_numerical(self, column_name: str, values: np.ndarray):
        """Record the minimum and the (max - min) range of ``values`` for ``column_name``."""
        v0 = np.min(values)
        v1 = np.max(values)

        self.min_range[column_name] = (v0, v1 - v0)

    def transform_numerical(self, column_name: str, values: np.ndarray):
        """Scale ``values`` with ``log(v - min + 1) / log(range + 1)``.

        :param column_name: column previously passed to ``fit_numerical``
            (a ``KeyError`` is raised otherwise).
        :param values: raw numerical values; this array is NOT modified.
        :return: a new array of scaled values. A column whose fitted range is
            zero yields an all-zero float32 array.
        """
        col_min, col_range = self.min_range[column_name]

        if col_range == 0:
            return np.zeros_like(values, dtype="float32")

        # Shift the minimum to zero in a fresh array. The previous in-place
        # `values -= col_min` overwrote the caller's buffer and raised for
        # integer arrays combined with a floating-point minimum.
        shifted = values - col_min

        # apply a logarithm (the +1 keeps a shifted value of 0 at log(1) == 0)
        col_values = np.log(shifted + 1)

        # scale so that the fitted maximum maps to 1
        col_values *= 1. / np.log(col_range + 1)

        if self.clip_numerical_values:
            col_values = np.clip(col_values, 0., 1.)

        return col_values

    def fit_categorical(self, column_name: str, values: np.ndarray):
        """Record the ``n_categorical_levels`` most frequent levels of ``values``."""
        levels, level_counts = np.unique(values, return_counts=True)
        # Stable sort by descending count: ties keep np.unique's ascending level order.
        by_frequency = sorted(zip(levels, level_counts), key=lambda pair: pair[1], reverse=True)
        self.encoded_levels[column_name] = [level for level, _ in by_frequency[:self.n_categorical_levels]]

    def transform_categorical(self, column_name:str, values: np.ndarray, expected_categorical_format: CategoricalFormat):
        """Encode ``values`` using the levels recorded by ``fit_categorical``.

        :param column_name: column previously passed to ``fit_categorical``.
        :param values: raw categorical values.
        :param expected_categorical_format: ``CategoricalFormat.Integers``
            returns the integer codes; any other format returns one-hot
            columns built with ``pd.get_dummies``.
        :return: a uint32 array of codes, or a DataFrame of dummy columns
            prefixed with ``column_name``.
        """
        encoded_levels = self.encoded_levels[column_name]
        print(f"Encoding the {len(encoded_levels)} levels for {column_name}")

        # Every value starts with code 1; retained level i is then written as i + 1.
        # NOTE(review): a value matching no retained level therefore keeps code 1,
        # the same code as the most frequent level, and code 0 is never produced.
        # This is left unchanged because downstream shapes may depend on it — TODO confirm.
        result_values = np.ones(len(values), dtype="uint32")
        for level_i, level in enumerate(encoded_levels):
            level_mask = values == level
            result_values[level_mask] = level_i + 1

        if expected_categorical_format == CategoricalFormat.Integers:
            return result_values

        # get_dummies only emits a column for codes that actually occur in result_values.
        v = pd.get_dummies(result_values, prefix=column_name)
        return v

FlowTransformer Framework

In [None]:
# Input Encoder
# ref: https://github.com/liamdm/FlowTransformer/blob/master/implementations/input_encodings.py
class NoInputEncoder(BaseInputEncoding):
    """Input encoding that performs no embedding: every feature tensor is
    concatenated as-is into a single tensor."""

    @property
    def name(self):
        return "No Input Encoding"

    @property
    def parameters(self):
        return {}

    @property
    def required_input_format(self) -> CategoricalFormat:
        return CategoricalFormat.OneHot

    def apply(self, X, prefix:str=None):
        """Concatenate the numerical inputs followed by the categorical inputs.

        ``X`` is sliced as a list: the first ``n_numeric_features`` entries are
        treated as numerical, the remainder as categorical. ``prefix`` is unused.
        """
        spec = self.model_input_specification
        n_numeric = spec.n_numeric_features

        numeric_part = X[:n_numeric]
        categorical_part = X[n_numeric:]

        if spec.categorical_format == CategoricalFormat.Integers:
            warnings.warn("It doesn't make sense to be using integer based inputs without encoding!")
            # Integer inputs are cast to float32 before being concatenated
            # with the numerical tensors.
            as_float = []
            for tensor in categorical_part:
                as_float.append(Lambda(lambda x: tf.cast(x, tf.float32))(tensor))
            categorical_part = as_float

        return Concatenate()(numeric_part + categorical_part)

class EmbedLayerType(Enum):
    """Kind of layer used to embed each categorical feature.

    The values are plain integers. Previously stray trailing commas turned
    ``Dense`` and ``Lookup`` into the one-element tuples ``(0,)`` and ``(1,)``
    while ``Projection`` was the integer ``2``.
    """
    Dense = 0
    Lookup = 1
    Projection = 2

class RecordLevelEmbed(BaseInputEncoding):
    """Embeds a whole flow record at once: all feature tensors are concatenated
    and passed through a single linear Dense layer."""

    def __init__(self, embed_dimension: int, project:bool = False):
        """
        :param embed_dimension: number of units of the embedding Dense layer.
        :param project: when True the Dense layer is created without a bias term.
        """
        super().__init__()

        self.embed_dimension: int = embed_dimension
        self.project: bool = project

    @property
    def name(self):
        return "Record Level Projection" if self.project else "Record Level Embedding"

    @property
    def parameters(self):
        return {"dimensions_per_feature": self.embed_dimension}

    @property
    def required_input_format(self) -> CategoricalFormat:
        return CategoricalFormat.OneHot

    def apply(self, X:List[keras.Input], prefix: str = None):
        """Concatenate ``X`` along the last axis and apply the linear embedding.

        Layer names are prefixed with ``prefix`` (empty when ``None``).
        Requires the model input specification to use one-hot categoricals.
        """
        layer_prefix = "" if prefix is None else prefix

        assert self.model_input_specification.categorical_format == CategoricalFormat.OneHot

        merged = Concatenate(name=f"{layer_prefix}feature_concat", axis=-1)(X)
        embed_layer = Dense(
            self.embed_dimension,
            activation="linear",
            use_bias=not self.project,
            name=f"{layer_prefix}embed",
        )
        return embed_layer(merged)

class CategoricalFeatureEmbed(BaseInputEncoding):
    """Embeds every categorical feature separately, then concatenates the
    embeddings with the untouched numerical features."""

    def __init__(self, embed_layer_type: EmbedLayerType, dimensions_per_feature: int):
        """
        :param embed_layer_type: Dense (linear layer with bias), Projection
            (linear layer without bias) or Lookup (Embedding layer).
        :param dimensions_per_feature: output width of each per-feature embedding.
        """
        super().__init__()

        self.dimensions_per_feature: int = dimensions_per_feature
        self.embed_layer_type: EmbedLayerType = embed_layer_type

    @property
    def name(self):
        labels = (
            (EmbedLayerType.Dense, "Categorical Feature Embed - Dense"),
            (EmbedLayerType.Lookup, "Categorical Feature Embed - Lookup"),
            (EmbedLayerType.Projection, "Categorical Feature Embed - Projection"),
        )
        for layer_type, label in labels:
            if self.embed_layer_type == layer_type:
                return label
        raise RuntimeError()

    @property
    def parameters(self):
        return {"dimensions_per_feature": self.dimensions_per_feature}

    @property
    def required_input_format(self) -> CategoricalFormat:
        if self.embed_layer_type == EmbedLayerType.Lookup:
            return CategoricalFormat.Integers
        return CategoricalFormat.OneHot

    def apply(self, X:List[keras.Input], prefix:str=None):
        """Build the embedding sub-graph.

        ``X`` is sliced as a list: the first ``n_numeric_features`` entries are
        numerical, the rest categorical (one entry per categorical feature
        name). Returns the concatenation of the numerical tensors and the
        per-feature embeddings.

        :raises Exception: if the model input specification has not been set.
        """
        layer_prefix = "" if prefix is None else prefix

        spec = self.model_input_specification
        if spec is None:
            raise Exception("Please call build() before calling apply!")

        n_numeric = spec.n_numeric_features
        numeric_inputs = X[:n_numeric]
        categorical_inputs = X[n_numeric:]

        numeric_block = Concatenate(name=f"{layer_prefix}concat_numeric")(numeric_inputs)

        embedded = []
        for field_index, field_name in enumerate(spec.categorical_feature_names):
            field_tensor = categorical_inputs[field_index]

            if self.embed_layer_type == EmbedLayerType.Lookup:
                assert spec.categorical_format == CategoricalFormat.Integers

                # One extra row on top of the number of levels of this feature.
                lookup = Embedding(
                    input_dim=spec.levels_per_categorical_feature[field_index] + 1,
                    output_dim=self.dimensions_per_feature,
                    input_length=self.sequence_length,
                )
                looked_up = lookup(field_tensor)
                # Reshape the lookup output to (sequence_length, dimensions_per_feature).
                reshaped = Reshape(
                    (self.sequence_length, self.dimensions_per_feature),
                    name=f"{layer_prefix}expand_{field_name}",
                )(looked_up)
                embedded.append(reshaped)
            else:
                assert spec.categorical_format == CategoricalFormat.OneHot

                # Dense keeps its bias; Projection is the same layer without one.
                # '/' is stripped from the feature name used in the layer name.
                linear = Dense(
                    self.dimensions_per_feature,
                    activation="linear",
                    use_bias=(self.embed_layer_type == EmbedLayerType.Dense),
                    name=f"{layer_prefix}embed_{field_name.replace('/', '')}",
                )
                embedded.append(linear(field_tensor))

        categorical_block = Concatenate(name=f"{layer_prefix}concat_categorical")(embedded)

        return Concatenate()([numeric_block, categorical_block])

# Transformer Models

# Decoder Block
# ref: https://github.com/liamdm/FlowTransformer/blob/master/implementations/transformers/basic/decoder_block.py
class TransformerDecoderBlock(Layer):
    """Decoder-style transformer block operating on a single input tensor.

    The same tensor is used as the target sequence and as the "encoder
    output". One MultiHeadAttention layer (``mha``) serves both attention
    steps, and ``dropout2`` / ``layernorm2`` are applied to both the second
    attention step and the feed-forward step.
    """

    def __init__(self, input_dimension:int, inner_dimension:int, num_heads:int, dropout_rate=0.1):
        """
        :param input_dimension: attention key size and output width of the feed-forward network.
        :param inner_dimension: hidden width of the feed-forward network.
        :param num_heads: number of attention heads.
        :param dropout_rate: rate shared by both dropout layers.
        """
        super().__init__()

        self.num_heads = num_heads
        self.input_dimension = input_dimension
        self.inner_dimension = inner_dimension
        self.dropout_rate = dropout_rate

        self.mha = MultiHeadAttention(num_heads=num_heads, key_dim=input_dimension)
        self.dropout1 = Dropout(dropout_rate)
        self.layernorm1 = LayerNormalization(epsilon=1e-6)

        feed_forward_layers = [
            Dense(inner_dimension, activation='relu'),
            Dense(input_dimension),
        ]
        self.ffn = tf.keras.Sequential(feed_forward_layers)
        self.dropout2 = Dropout(dropout_rate)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)

    # noinspection PyMethodOverriding
    # SIAN
    def call(self, inputs, training=True, mask=None):
        """Run self-attention, a second attention step and the feed-forward
        network, each followed by dropout, a residual addition and layer
        normalisation. ``mask`` is accepted but not used."""
        # A single tensor plays both roles.
        sequence = memory = inputs

        # step 1: self attention over the sequence
        self_attended = self.dropout1(self.mha(sequence, sequence), training=training)
        after_self = self.layernorm1(sequence + self_attended)

        # step 2: attention with the normalised sequence as query and `memory` as value
        cross_attended = self.dropout2(self.mha(after_self, memory), training=training)
        after_cross = self.layernorm2(after_self + cross_attended)

        # step 3: position-wise feed-forward network
        projected = self.dropout2(self.ffn(after_cross), training=training)
        return self.layernorm2(after_cross + projected)

# Encoder Block
# ref: https://github.com/liamdm/FlowTransformer/blob/master/implementations/transformers/basic/encoder_block.py
class GPT3Attention(layers.Layer):
    """Hand-written multi-head scaled dot-product attention."""

    def __init__(self, n_heads, d_model, dropout_rate=0.1):
        """
        :param n_heads: number of attention heads.
        :param d_model: width of the q/k/v projections and of the output;
            each head gets ``d_model // n_heads`` units.
        :param dropout_rate: rate of the single dropout layer, which is applied
            to the attention weights and again to the output.
        """
        super().__init__()
        self.n_heads = n_heads
        self.d_model = d_model
        self.depth = d_model // n_heads

        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        self.dropout = tf.keras.layers.Dropout(dropout_rate)
        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        """Reshape ``x`` to (batch, -1, n_heads, depth) and swap axes 1 and 2."""
        per_head = tf.reshape(x, (batch_size, -1, self.n_heads, self.depth))
        return tf.transpose(per_head, perm=[0, 2, 1, 3])

    # noinspection PyMethodOverriding
    def call(self, q, k, v, mask=None):
        """Attend from ``q`` to ``k``/``v``.

        When ``mask`` is given, ``mask * -1e9`` is added to the logits before
        the softmax, so positions where the mask is 1 are suppressed.
        """
        batch_size = tf.shape(q)[0]

        # Project, then split into heads (projection layers are used in q, k, v order).
        q_heads = self.split_heads(self.wq(q), batch_size)
        k_heads = self.split_heads(self.wk(k), batch_size)
        v_heads = self.split_heads(self.wv(v), batch_size)

        # Scaled dot-product attention
        scale = tf.math.sqrt(tf.cast(self.depth, tf.float32))
        logits = tf.matmul(q_heads, k_heads, transpose_b=True) / scale

        if mask is not None:
            logits = logits + (mask * -1e9)

        weights = self.dropout(tf.nn.softmax(logits, axis=-1))

        # Merge the heads back: (batch, heads, seq, depth) -> (batch, seq, d_model)
        context = tf.transpose(tf.matmul(weights, v_heads), perm=[0, 2, 1, 3])
        merged = tf.reshape(context, (batch_size, -1, self.d_model))

        return self.dropout(self.dense(merged))

class MultiHeadAttentionImplementation:
    """Selects which attention layer TransformerEncoderBlock builds.

    Both constants are plain integers. Previously a stray trailing comma made
    ``Keras`` the tuple ``(0,)`` while ``GPT3`` was the integer ``1``.
    """
    # Use the Keras MultiHeadAttention layer.
    Keras = 0
    # Use the hand-written GPT3Attention layer.
    GPT3 = 1

class TransformerEncoderBlock(layers.Layer):
    """Encoder-style transformer block: attention followed by a two-layer
    feed-forward network, each with optional dropout and layer normalisation."""

    def __init__(self, input_dimension:int, inner_dimension:int, num_heads:int, dropout_rate=0.1, use_conv:bool=False, prefix:str=None, attn_implementation:MultiHeadAttentionImplementation = MultiHeadAttentionImplementation.Keras):
        """
        :param input_dimension: output width of the second feed-forward layer.
        :param inner_dimension: width of the first feed-forward layer; also
            the attention key size (Keras) or model width (GPT3).
        :param num_heads: number of attention heads.
        :param dropout_rate: rate of both dropout layers; dropout is skipped
            entirely in ``call`` when this is not greater than 0.
        :param use_conv: use kernel-size-1 Conv1D layers instead of Dense
            layers for the feed-forward network.
        :param prefix: prepended to every sub-layer name (empty when ``None``).
        :param attn_implementation: which attention layer to build.
        """
        layer_prefix = "" if prefix is None else prefix

        super().__init__(name=f"{layer_prefix}transformer_encoder")

        if inner_dimension < input_dimension:
            warnings.warn("Typically inner_dimension should be greater than or equal to the input_dimension!")

        self.attn_implementation = attn_implementation
        self.dropout_rate = dropout_rate

        if attn_implementation == MultiHeadAttentionImplementation.Keras:
            self.attention = layers.MultiHeadAttention(
                num_heads=num_heads, key_dim=inner_dimension, name=f"{layer_prefix}multi_head_attn")
        else:
            # The GPT3 variant is built with its internal dropout disabled.
            self.attention = GPT3Attention(num_heads, inner_dimension, dropout_rate=0.0)

        norm_epsilon = 1e-6

        self.attention_dropout = layers.Dropout(dropout_rate, name=f"{layer_prefix}attention_dropout")
        self.attention_layer_norm = layers.LayerNormalization(
            epsilon=norm_epsilon, name=f"{layer_prefix}attention_layer_norm")

        if use_conv:
            self.feed_forward_0 = Conv1D(
                filters=inner_dimension, kernel_size=1, activation="relu", name=f"{layer_prefix}feed_forward_0")
            self.feed_forward_1 = Conv1D(
                filters=input_dimension, kernel_size=1, activation="relu", name=f"{layer_prefix}feed_forward_1")
        else:
            self.feed_forward_0 = Dense(inner_dimension, activation="relu", name=f"{layer_prefix}feed_forward_0")
            self.feed_forward_1 = Dense(input_dimension, activation="relu", name=f"{layer_prefix}feed_forward_1")

        self.feed_forward_dropout = layers.Dropout(dropout_rate, name=f"{layer_prefix}feed_forward_dropout")
        self.feed_forward_layer_norm = layers.LayerNormalization(
            epsilon=norm_epsilon, name=f"{layer_prefix}feed_forward_layer_norm")

    # noinspection PyMethodOverriding
    # SIAN
    def call(self, inputs, training=True, mask=None):
        """Apply attention and the feed-forward network to ``inputs``.

        ``mask`` is forwarded only to the GPT3 attention implementation.
        The final residual adds the attention output (not the normalised
        intermediate) to the feed-forward output.
        """
        use_dropout = self.dropout_rate > 0

        if self.attn_implementation == MultiHeadAttentionImplementation.Keras:
            attended = self.attention(inputs, inputs)
        else:
            attended = self.attention(inputs, inputs, inputs, mask)

        if use_dropout:
            attention_output = self.attention_dropout(attended, training=training)
        else:
            attention_output = attended

        hidden = self.attention_layer_norm(inputs + attention_output)
        hidden = self.feed_forward_1(self.feed_forward_0(hidden))
        if use_dropout:
            hidden = self.feed_forward_dropout(hidden, training=training)

        return self.feed_forward_layer_norm(attention_output + hidden)

# Basic Transformers
# ref: https://github.com/liamdm/FlowTransformer/blob/master/implementations/transformers/basic_transformers.py
class BasicTransformer(BaseSequential):
    """Stack of ``n_layers`` identical transformer blocks: encoder blocks by
    default, decoder blocks when ``is_decoder`` is True."""

    @property
    def name(self) -> str:
        if self.use_conv:
            return f"Basic Conv Transformer" + (" Decoder" if self.is_decoder else "")
        else:
            return f"Basic Dense Transformer" + (" Decoder" if self.is_decoder else "")

    @property
    def parameters(self) -> dict:
        return {
            "n_layers": self.n_layers,
            "internal_size": self.internal_size,
            "use_conv": self.use_conv,
            "n_heads": self.n_heads,
            "dropout_rate": self.dropout_rate,
            "head_size": self.internal_size
        }

    def __init__(self, n_layers:int, internal_size:int, n_heads:int, use_conv:bool=False, dropout_rate:float=0.1, is_decoder=False):
        """
        :param n_layers: number of stacked blocks.
        :param internal_size: inner dimension handed to every block.
        :param n_heads: number of attention heads per block.
        :param use_conv: use Conv1D feed-forward layers (encoder blocks only).
        :param dropout_rate: dropout rate handed to every block.
        :param is_decoder: stack TransformerDecoderBlock instead of TransformerEncoderBlock.
        """
        super().__init__()
        self.n_layers = n_layers
        self.internal_size = internal_size
        self.use_conv = use_conv
        self.n_heads = n_heads
        self.dropout_rate = dropout_rate
        self.is_decoder = is_decoder

    def apply(self, X, prefix: str = None):
        """Pass ``X`` through the stacked blocks and return the result.

        :param X: input tensor; the size of its last axis becomes each block's input dimension.
        :param prefix: prepended to the encoder block names (empty when ``None``).
        :raises NotImplementedError: when a decoder with ``use_conv`` is requested.
        """
        # Normalise the prefix as the other components do. Without this the
        # default None was interpolated literally, naming layers "Noneblock_0_...".
        if prefix is None:
            prefix = ""

        #window_size = self.sequence_length
        real_size = X.shape[-1]

        m_x = X

        for layer_i in range(self.n_layers):
            if self.is_decoder:
                if self.use_conv:
                    raise NotImplementedError()
                m_x = TransformerDecoderBlock(real_size, self.internal_size, self.n_heads, dropout_rate=self.dropout_rate)(m_x)
            else:
                m_x = TransformerEncoderBlock(real_size, self.internal_size, self.n_heads, dropout_rate=self.dropout_rate, use_conv=self.use_conv, prefix=f"{prefix}block_{layer_i}_")(m_x)

        return m_x

# Named Transformers
# ref: https://github.com/liamdm/FlowTransformer/blob/master/implementations/transformers/named_transformers.py
class GPTSmallTransformer(BaseSequential):
    """Fixed-size stack of 12 decoder blocks (12 heads, internal size 768)."""

    def __init__(self):
        super().__init__()
        self.n_layers = 12
        self.internal_size = 768
        self.n_heads = 12
        # True division: the reported head size is a float.
        self.head_size = self.internal_size / self.n_heads
        self.dropout_rate = 0.02
        self.is_decoder = True

    @property
    def name(self) -> str:
        return "GPT Model"

    @property
    def parameters(self) -> dict:
        reported = ("n_layers", "internal_size", "n_heads", "dropout_rate", "head_size")
        return {key: getattr(self, key) for key in reported}

    def apply(self, X, prefix: str = None):
        """Pass ``X`` through ``n_layers`` decoder blocks; ``prefix`` is unused."""
        #window_size = self.sequence_length
        feature_width = X.shape[-1]

        hidden = X
        for _ in range(self.n_layers):
            block = TransformerDecoderBlock(
                feature_width, self.internal_size, self.n_heads, dropout_rate=self.dropout_rate)
            hidden = block(hidden)

        return hidden


class BERTSmallTransformer(BaseSequential):
    """Fixed-size stack of 12 encoder blocks (12 heads, internal size 768)."""

    @property
    def name(self) -> str:
        return "BERT Model"

    @property
    def parameters(self) -> dict:
        return {
            "n_layers": self.n_layers,
            "internal_size": self.internal_size,
            "n_heads": self.n_heads,
            "dropout_rate": self.dropout_rate,
            "head_size": self.head_size
        }

    def __init__(self):
        super().__init__()
        self.n_layers = 12
        self.internal_size = 768
        self.n_heads = 12
        # True division: the reported head size is a float.
        self.head_size = self.internal_size / self.n_heads
        self.dropout_rate = 0.02
        self.is_decoder = False

    def apply(self, X, prefix: str = None):
        """Pass ``X`` through ``n_layers`` encoder blocks and return the result.

        :param X: input tensor; the size of its last axis becomes each block's input dimension.
        :param prefix: prepended to every block name (empty when ``None``).
        """
        # Honour the caller's prefix, as BasicTransformer does, so block names
        # can be disambiguated. With the default (None or "") the layer names
        # are exactly what they were before.
        if prefix is None:
            prefix = ""

        #window_size = self.sequence_length
        real_size = X.shape[-1]

        m_x = X

        for layer_i in range(self.n_layers):
            m_x = TransformerEncoderBlock(real_size, self.internal_size, self.n_heads, dropout_rate=self.dropout_rate, prefix=f"{prefix}block_{layer_i}_")(m_x)

        return m_x

# Classification Head
# ref: https://github.com/liamdm/FlowTransformer/blob/master/implementations/classification_heads.py
class FlattenClassificationHead(BaseClassificationHead):
    """Classification head that flattens the transformer output."""

    @property
    def name(self) -> str:
        return "Flatten"

    @property
    def parameters(self) -> dict:
        return {}

    def apply(self, X, prefix: str = None):
        """Return ``X`` passed through a Flatten layer named ``<prefix>flatten``."""
        layer_prefix = "" if prefix is None else prefix
        return Flatten(name=f"{layer_prefix}flatten")(X)


class FeaturewiseEmbedding(BaseClassificationHead):
    """Classification head that maps the last axis to one value with a linear
    Dense layer and flattens the result."""

    def __init__(self, project:bool=False):
        """:param project: when True the Dense layer is created without a bias term."""
        super().__init__()
        self.project: bool = project

    @property
    def name(self):
        return "Featurewise Embed - Projection" if self.project else "Featurewise Embed - Dense"

    @property
    def parameters(self):
        return {}

    def apply(self, X, prefix:str=None):
        """Apply the single-unit Dense layer to ``X`` and flatten its output.

        :raises Exception: if the model input specification has not been set.
        """
        layer_prefix = "" if prefix is None else prefix

        if self.model_input_specification is None:
            raise Exception("Please call build() before calling apply!")

        reducer = Dense(
            1,
            activation="linear",
            use_bias=(not self.project),
            name=f"{layer_prefix}featurewise_embed",
        )
        return Flatten()(reducer(X))

class GlobalAveragePoolingClassificationHead(BaseClassificationHead):
    """Classification head that applies 1D global average pooling."""

    @property
    def name(self) -> str:
        return "Global Average Pooling"

    @property
    def parameters(self) -> dict:
        return {}

    def apply(self, X, prefix: str = None):
        """Return ``X`` passed through a GlobalAveragePooling1D layer named
        ``<prefix>global_avg_pooling_1d``."""
        layer_prefix = "" if prefix is None else prefix
        pooling = GlobalAveragePooling1D(name=f"{layer_prefix}global_avg_pooling_1d")
        return pooling(X)


class LastTokenClassificationHead(BaseClassificationHead):
    """Classification head that keeps only the last entry along the
    second-to-last axis of the transformer output."""

    @property
    def name(self) -> str:
        return "Last Token"

    @property
    def parameters(self) -> dict:
        return {}

    def apply(self, X, prefix: str = None):
        """Slice ``X[..., -1, :]`` inside a Lambda layer named ``<prefix>slice_last``."""
        layer_prefix = "" if prefix is None else prefix

        take_last = Lambda(lambda x: x[..., -1, :], name=f"{layer_prefix}slice_last")
        #x = Flatten(name=f"{prefix}flatten_last")(x)
        return take_last(X)


class CLSTokenClassificationHead(LastTokenClassificationHead):
    """Classification head that appends an extra "CLS" row to the sequence
    before the transformer; the inherited ``apply`` then slices the last row."""


    @property
    def name(self) -> str:
        return "CLS Token"

    @property
    def parameters(self) -> dict:
        return {}

    def apply_before_transformer(self, X, prefix: str = None):
        """Append one all-zero row and one indicator column to ``X``.

        The result has one more entry on axis -2 and one more on axis -1 than
        ``X``. The added column is 1 on the appended row and 0 elsewhere.
        Layer names are prefixed with ``prefix`` (empty when ``None``).
        """
        if prefix is None:
            prefix = ""

        window_size = self.sequence_length

        x = X
        # assumes X is (batch, window_size, flow_size) — TODO confirm against the caller
        batch_size = tf.shape(x)[0]
        flow_size = tf.shape(x)[2]

        # Indicator vector of length window_size + 1: zeros with a 1 in the last position.
        cls_token_horizontal_single = np.zeros((window_size + 1,))
        cls_token_horizontal_single[-1] = 1.
        cls_token_horizontal_single = tf.convert_to_tensor(cls_token_horizontal_single, dtype=tf.float32)

        # Broadcast the indicator over the batch, then add a trailing axis:
        # (batch, window_size + 1) -> (batch, window_size + 1, 1).
        cls_token_horizontal = tf.ones((batch_size, window_size + 1,), dtype=tf.float32)
        cls_token_horizontal = tf.multiply(cls_token_horizontal, cls_token_horizontal_single)
        cls_token_horizontal = tf.expand_dims(cls_token_horizontal, axis=-1)

        # The appended row itself: all zeros, shape (batch, 1, flow_size).
        cls_token_vertical = tf.zeros((batch_size, 1, flow_size,), dtype=tf.float32)

        # First add the row (axis -2), then the indicator column (axis -1).
        x = Concatenate(axis=-2, name=f'{prefix}cls_vertical')([x, cls_token_vertical])
        x = Concatenate(axis=-1, name=f'{prefix}cls_horizontal')([x, cls_token_horizontal])

        return x

**Demonstration**

ref:

https://github.com/liamdm/FlowTransformer/blob/master/demonstration.ipynb

https://github.com/liamdm/FlowTransformer/blob/master/FlowTransformer_demo.ipynb

In [None]:
# Working directory for this demonstration; it is passed to load_dataset
# below as cache_path.
demonstration_folder = "demonstration"

if not os.path.exists(demonstration_folder):
    os.mkdir(demonstration_folder)

from framework.dataset_specification import DatasetSpecification
# Column layout of the NetFlow v2 datasets: which fields to load, which of
# them are categorical, and which column holds the class label.
flow_format = DatasetSpecification(
        include_fields=['NUM_PKTS_UP_TO_128_BYTES', 'SRC_TO_DST_SECOND_BYTES', 'OUT_PKTS', 'OUT_BYTES', 'NUM_PKTS_128_TO_256_BYTES', 'DST_TO_SRC_AVG_THROUGHPUT', 'DURATION_IN', 'L4_SRC_PORT', 'ICMP_TYPE', 'PROTOCOL', 'SERVER_TCP_FLAGS', 'IN_PKTS', 'NUM_PKTS_512_TO_1024_BYTES', 'CLIENT_TCP_FLAGS', 'TCP_WIN_MAX_IN', 'NUM_PKTS_256_TO_512_BYTES', 'SHORTEST_FLOW_PKT', 'MIN_IP_PKT_LEN', 'LONGEST_FLOW_PKT', 'L4_DST_PORT', 'MIN_TTL', 'DST_TO_SRC_SECOND_BYTES', 'NUM_PKTS_1024_TO_1514_BYTES', 'DURATION_OUT', 'FLOW_DURATION_MILLISECONDS', 'TCP_FLAGS', 'MAX_TTL', 'SRC_TO_DST_AVG_THROUGHPUT', 'ICMP_IPV4_TYPE', 'MAX_IP_PKT_LEN', 'RETRANSMITTED_OUT_BYTES', 'IN_BYTES', 'RETRANSMITTED_IN_BYTES', 'TCP_WIN_MAX_OUT', 'L7_PROTO', 'RETRANSMITTED_OUT_PKTS', 'RETRANSMITTED_IN_PKTS'],
        categorical_fields=['CLIENT_TCP_FLAGS', 'L4_SRC_PORT', 'TCP_FLAGS', 'ICMP_IPV4_TYPE', 'ICMP_TYPE', 'PROTOCOL', 'SERVER_TCP_FLAGS', 'L4_DST_PORT', 'L7_PROTO'],
        class_column="Attack",
        benign_label="Benign"
    )

from framework.flow_transformer_parameters import FlowTransformerParameters
from framework.flow_transformer import FlowTransformer

# The four pluggable components, all defined in the cells above.
pre_processing = StandardPreProcessing(n_categorical_levels=32)
encoding = RecordLevelEmbed(64)
transformer = BasicTransformer(n_layers=2, internal_size=128, n_heads=2)
classification_head = LastTokenClassificationHead()

# Define the transformer
ft = FlowTransformer(pre_processing=pre_processing,
                     input_encoding=encoding,
                     sequential_model=transformer,
                     classification_head=classification_head,
                     params=FlowTransformerParameters(window_size=8, mlp_layer_sizes=[128], mlp_dropout=0.1))

from framework.enumerations import EvaluationDatasetSampling
from IPython.display import display

# SIAN
# Load NF-UNSW-NB15-v2 (datasets[1]).
# NOTE(review): LastRows with evaluation_percent=0.1 presumably holds out the
# final 10% of rows for evaluation — confirm against the framework.
df = ft.load_dataset("UNSW-NB15",
                    data_path+datasets[1],
                    specification=flow_format,
                    evaluation_dataset_sampling=EvaluationDatasetSampling.LastRows,
                    evaluation_percent=0.1,
                    cache_path=demonstration_folder)

# Show the first 500 pre-processed rows.
display(df.iloc[:500])

Using cache file path: demonstration/UNSW-NB15_0_QdLmZHuh8yOmlGcKBEkf7hepImY0_5EjmvToFWKee8t20u0dFpVzNu4s0.feather
Attempting to read dataset from path /content/drive/MyDrive/FlowTransformer/data/NF-UNSW-NB15-v2/NF-UNSW-NB15-v2.csv...
Set y to = Attack
Converting numerical columns to floats, and removing out of range values...
Applying pre-processing to numerical values
[Numerical 1 / 28] Processing numerical column IN_BYTES...
[Numerical 2 / 28] Processing numerical column DST_TO_SRC_SECOND_BYTES...
[Numerical 3 / 28] Processing numerical column LONGEST_FLOW_PKT...
[Numerical 4 / 28] Processing numerical column FLOW_DURATION_MILLISECONDS...
[Numerical 5 / 28] Processing numerical column TCP_WIN_MAX_IN...
[Numerical 6 / 28] Processing numerical column SHORTEST_FLOW_PKT...
[Numerical 7 / 28] Processing numerical column IN_PKTS...
[Numerical 8 / 28] Processing numerical column NUM_PKTS_1024_TO_1514_BYTES...
[Numerical 9 / 28] Processing numerical column RETRANSMITTED_IN_PKTS...
[Numerica

Unnamed: 0,IN_BYTES,DST_TO_SRC_SECOND_BYTES,LONGEST_FLOW_PKT,FLOW_DURATION_MILLISECONDS,TCP_WIN_MAX_IN,SHORTEST_FLOW_PKT,IN_PKTS,NUM_PKTS_1024_TO_1514_BYTES,RETRANSMITTED_IN_PKTS,SRC_TO_DST_SECOND_BYTES,...,L7_PROTO_23,L7_PROTO_24,L7_PROTO_25,L7_PROTO_26,L7_PROTO_27,L7_PROTO_28,L7_PROTO_29,L7_PROTO_30,L7_PROTO_31,L7_PROTO_32
0,0.127562,0.294070,0.565324,0.0,0.000000,0.440913,0.000000,0.0,0.000000,0.295560,...,False,False,False,False,False,False,False,False,False,False
1,0.323054,0.317800,0.565324,0.0,0.817819,0.440913,0.160324,0.0,0.074172,0.316753,...,False,False,False,False,False,False,False,False,False,False
2,0.358547,0.333895,0.575948,0.0,0.831717,0.440913,0.218877,0.0,0.117560,0.329795,...,False,False,False,False,False,False,False,False,False,False
3,0.380413,0.346818,0.598516,0.0,0.843756,0.440913,0.255508,0.0,0.148344,0.340052,...,False,False,False,False,False,False,False,False,False,False
4,0.402810,0.360391,0.598516,0.0,0.863875,0.440913,0.293311,0.0,0.172222,0.352235,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,0.414039,0.367652,0.598516,0.0,0.872469,0.440913,0.312343,0.0,0.191732,0.358927,...,False,False,False,False,False,False,False,False,False,False
496,0.323054,0.317800,0.565324,0.0,0.817819,0.440913,0.160324,0.0,0.074172,0.316753,...,False,False,False,False,False,False,False,False,False,False
497,0.424305,0.374515,0.598516,0.0,0.880314,0.440913,0.328315,0.0,0.208227,0.365353,...,False,False,False,False,False,False,False,False,False,False
498,0.358547,0.333895,0.575948,0.0,0.831717,0.440913,0.218877,0.0,0.117560,0.329795,...,False,False,False,False,False,False,False,False,False,False


In [None]:
# Build the transformer model
# Build the transformer model (the dataset must have been loaded into `ft` first)
m = ft.build_model()
m.summary()

# Compile the model
# Binary cross-entropy with binary accuracy: this cell trains the two-class
# (benign vs. attack) model. jit_compile=True requests XLA compilation.
m.compile(optimizer="adam", loss='binary_crossentropy', metrics=['binary_accuracy'], jit_compile=True)

In [None]:
# Run the framework's training/evaluation routine on the compiled model.
# early_stopping_patience equals the number of epochs here.
# NOTE(review): the exact contents of the three returned values are defined
# by the framework — confirm before relying on their structure.
(train_results, eval_results, final_epoch) = ft.evaluate(m, batch_size=128, epochs=5, steps_per_epoch=64, early_stopping_patience=5)

Building eval dataset...
Splitting dataset to featurewise...
Evaluation dataset is built!
Positive samples in eval set: 16271
Negative samples in eval set: 222756
Epoch = 0 / 5 (early stop in 5), step = 0, loss = 0.72748, results = [array(0.72747767, dtype=float32), array(0.484375, dtype=float32)] -- elapsed (train): 0.00s
Epoch = 0 / 5 (early stop in 5), step = 14, loss = 0.44843, results = [array(0.44842574, dtype=float32), array(0.78333336, dtype=float32)] -- elapsed (train): 1.13s
Epoch = 0 / 5 (early stop in 5), step = 28, loss = 0.30065, results = [array(0.30065006, dtype=float32), array(0.86880386, dtype=float32)] -- elapsed (train): 2.24s
Epoch = 0 / 5 (early stop in 5), step = 36, loss = 0.25780, results = [array(0.25780153, dtype=float32), array(0.8904139, dtype=float32)] -- elapsed (train): 3.27s
Epoch = 0 / 5 (early stop in 5), step = 48, loss = 0.20583, results = [array(0.20582652, dtype=float32), array(0.91470027, dtype=float32)] -- elapsed (train): 4.34s
Epoch = 0 / 5 (e

**Implementation**

Multi Class Classification
ref: https://github.com/liamdm/FlowTransformer/blob/master/framework/flow_transformer.py#L300

SIAN


In [None]:
from framework.base_classification_head import BaseClassificationHead
from framework.base_input_encoding import BaseInputEncoding
from framework.base_preprocessing import BasePreProcessing
from framework.dataset_specification import DatasetSpecification
from framework.enumerations import EvaluationDatasetSampling, CategoricalFormat
from framework.flow_transformer_parameters import FlowTransformerParameters
from framework.framework_component import FunctionalComponent
from framework.model_input_specification import ModelInputSpecification
from framework.utilities import get_identifier, load_feather_plus_metadata, save_feather_plus_metadata

from keras import Input, Model
from keras.layers import Dense, Dropout

import time
from typing import override

# SIAN
class FlowTransformerMultiClass(FlowTransformer):
    """``FlowTransformer`` variant that predicts one class per label instead of a binary flag.

    Expected call order: ``load_dataset`` -> ``multiclass_encoding`` ->
    ``build_model`` -> ``compile`` the returned model -> ``evaluate``.
    """

    def __init__(self, pre_processing:BasePreProcessing,
                 input_encoding:BaseInputEncoding,
                 sequential_model:FunctionalComponent,
                 classification_head:BaseClassificationHead,
                 params:FlowTransformerParameters,
                 rs:np.random.RandomState=None):
        """Forward all components unchanged to the ``FlowTransformer`` constructor."""
        super().__init__(pre_processing, input_encoding, sequential_model, classification_head, params, rs)

    # SIAN
    def multiclass_encoding(self, labels:List):
        """Replace the raw labels stored in ``self.y`` with integer class indices.

        The index of a label is its position in ``labels``. Besides rewriting
        ``self.y`` this sets ``self.output_size`` (number of classes),
        ``self.label_encode`` (label -> index) and ``self.label_decode``
        (index -> label).

        :param labels: every distinct label that occurs in ``self.y``
        :raises Exception: if no dataset has been loaded yet (``self.y`` is None)
        :raises KeyError: if ``self.y`` contains a value that is not in ``labels``
        """
        if self.y is None:
            raise Exception("Please call load_dataset before calling multiclass_encoding")

        self.output_size = len(labels)
        self.label_encode = {label:index for index, label in enumerate(labels)}
        self.label_decode = {index:label for index, label in enumerate(labels)}
        self.y = np.array([self.label_encode[label] for label in self.y])

    @override
    def build_model(self, prefix:str=None):
        """Assemble the (uncompiled) Keras model with a softmax output layer.

        One ``Input`` is created per numeric feature and per categorical
        feature; they are passed through the input encoding, the sequential
        model, the classification head and the configured MLP layers. The
        final ``Dense`` layer has ``self.output_size`` units.

        :param prefix: optional string prepended to input / layer names
        :return: the Keras ``Model``
        :raises Exception: if no dataset has been loaded yet (``self.X`` is None)
        :raises AttributeError: if ``multiclass_encoding`` has not been called
        """
        if prefix is None:
            prefix = ""

        if self.X is None:
            raise Exception("Please call load_dataset before calling build_model()")

        # output_size only exists after multiclass_encoding(); fail with a clear
        # message instead of an opaque attribute error at the output layer.
        if getattr(self, "output_size", None) is None:
            raise AttributeError("Please call multiclass_encoding before calling build_model()")

        m_inputs = []
        for numeric_feature in self.model_input_spec.numeric_feature_names:
            m_input = Input((self.parameters.window_size, 1), name=f"{prefix}input_{numeric_feature}", dtype="float32")
            m_inputs.append(m_input)

        # integer-encoded categoricals are a single int32 column, one-hot ones
        # are float32 with one column per level
        for categorical_feature_name, categorical_feature_levels in \
            zip(self.model_input_spec.categorical_feature_names, self.model_input_spec.levels_per_categorical_feature):
            m_input = Input(
                (self.parameters.window_size, 1 if self.model_input_spec.categorical_format == CategoricalFormat.Integers else categorical_feature_levels),
                name=f"{prefix}input_{categorical_feature_name}",
                dtype="int32" if self.model_input_spec.categorical_format == CategoricalFormat.Integers else "float32"
            )
            m_inputs.append(m_input)

        self.input_encoding.build(self.parameters.window_size, self.model_input_spec)
        self.sequential_model.build(self.parameters.window_size, self.model_input_spec)
        self.classification_head.build(self.parameters.window_size, self.model_input_spec)

        m_x = self.input_encoding.apply(m_inputs, prefix)

        # in case the classification head needs to add tokens at this stage
        m_x = self.classification_head.apply_before_transformer(m_x, prefix)

        m_x = self.sequential_model.apply(m_x, prefix)
        m_x = self.classification_head.apply(m_x, prefix)

        for layer_i, layer_size in enumerate(self.parameters.mlp_layer_sizes):
            m_x = Dense(layer_size, activation="relu", name=f"{prefix}classification_mlp_{layer_i}_{layer_size}")(m_x)
            m_x = Dropout(self.parameters.mlp_dropout)(m_x) if self.parameters.mlp_dropout > 0 else m_x

        # SIAN: one softmax unit per class instead of a single binary output
        m_x = Dense(self.output_size, activation="softmax", name=f"{prefix}multiclass_classification_out")(m_x)
        m = Model(m_inputs, m_x)
        #m.summary()
        return m

    @override
    def evaluate(self, m:keras.Model, batch_size, early_stopping_patience:int, epochs:int=100, steps_per_epoch:int=128):
        """Train ``m`` batch by batch and report one-vs-rest metrics per label.

        Every training batch holds ``int(0.5 * batch_size)`` non-benign rows
        and the remainder benign rows, all taken from the training split.
        Evaluation runs on the rows outside the training mask, after epoch 6,
        after the last epoch, and when early stopping triggers.

        :param m: compiled Keras model (as returned by ``build_model``)
        :param batch_size: number of windows per training batch
        :param early_stopping_patience: stop once more than this many
            consecutive epochs produced no new minimum batch loss; values
            <= 0 disable early stopping
        :param epochs: maximum number of epochs
        :param steps_per_epoch: number of training batches per epoch
        :return: ``(train_results, eval_results, final_epoch)`` where
            ``train_results`` is a list with one entry per batch (the batch
            results followed by elapsed training time and epoch),
            ``eval_results`` is a DataFrame with one row per evaluated
            (epoch, label) and ``final_epoch`` is the index of the last epoch run
        """
        n_malicious_per_batch = int(0.5 * batch_size)
        n_legit_per_batch = batch_size - n_malicious_per_batch

        # rows closer than window_size to either end of the data are never
        # used as training targets
        selectable_mask = np.zeros(len(self.X), dtype=bool)
        selectable_mask[self.parameters.window_size:-self.parameters.window_size] = True
        train_mask = self.training_mask

        # True for every row whose encoded label is not the benign class
        y_mask = ~(self.y == self.label_encode[str(self.dataset_specification.benign_label)])

        indices_train = np.argwhere(train_mask).reshape(-1)
        malicious_indices_train = np.argwhere(train_mask & y_mask & selectable_mask).reshape(-1)
        legit_indices_train = np.argwhere(train_mask & ~y_mask & selectable_mask).reshape(-1)

        indices_test:np.ndarray = np.argwhere(~train_mask).reshape(-1)

        def get_windows_for_indices(indices:np.ndarray, ordered) -> List[pd.DataFrame]:
            """Return one window (DataFrame of window_size rows) per target index."""
            X: List[pd.DataFrame] = []

            if ordered:
                # we don't really want to include eval samples as part of context, because out of range values might be learned
                # by the model, _but_ we are forced to in the windowed approach, if users haven't just selected the
                # "take last 10%" as eval option. We warn them prior to this though.
                for i1 in indices:
                    X.append(self.X.iloc[(i1 - self.parameters.window_size) + 1:i1 + 1])
            else:
                # Random training rows act as context; the target row always
                # occupies the last slot of its window. The array must stay
                # 2-D (one row of indices per window) for the column
                # assignment and for the per-window lookup below.
                context_indices_batch = self.rs.choice(
                    indices_train, size=(len(indices), self.parameters.window_size), replace=False)
                context_indices_batch[:, -1] = indices

                for window_indices in context_indices_batch:
                    X.append(self.X.iloc[window_indices])

            return X

        # feature name -> column name (or list of one-hot column names), filled lazily on first use
        feature_columns_map = {}

        def samplewise_to_featurewise(X):
            """Turn a list of windows into one array per feature, each with a leading (n_windows, sequence_length) shape."""
            sequence_length = len(X[0])

            combined_df = pd.concat(X)

            featurewise_X = []

            if len(feature_columns_map) == 0:
                for feature in self.model_input_spec.feature_names:
                    if feature in self.model_input_spec.numeric_feature_names or self.model_input_spec.categorical_format == CategoricalFormat.Integers:
                        feature_columns_map[feature] = feature
                    else:
                        # this is a one-hot encoded categorical feature
                        feature_columns_map[feature] = [c for c in X[0].columns if str(c).startswith(feature)]

            for feature in self.model_input_spec.feature_names:
                feature_columns = feature_columns_map[feature]
                combined_values = combined_df[feature_columns].values

                # maybe this can be faster with a reshape but I couldn't get it to work
                combined_values = np.array([combined_values[i:i+sequence_length] for i in range(0, len(combined_values), sequence_length)])
                featurewise_X.append(combined_values)

            return featurewise_X

        print(f"Building eval dataset...")
        eval_X = get_windows_for_indices(indices_test, True)
        print(f"Splitting dataset to featurewise...")
        eval_featurewise_X = samplewise_to_featurewise(eval_X)

        print(f"Evaluation dataset is built!")

        # SIAN
        eval_y = self.y[indices_test]

        # indices_test holds row positions, so its length (not the number of
        # non-zero entries) is the size of the evaluation set
        print("Evaluation dataset size: %d" % len(indices_test))
        for key, value in self.label_encode.items():
            print("Label: %s(%d)" %(key, value))
            eval_P = (eval_y == value)
            n_eval_P = np.count_nonzero(eval_P)
            eval_N = ~eval_P
            n_eval_N = np.count_nonzero(eval_N)
            print(f"Positive samples in eval set: {n_eval_P}")
            print(f"Negative samples in eval set: {n_eval_N}")

        epoch_results = []

        def run_evaluation(epoch):
            """Predict the eval set and append one-vs-rest metrics for every label to epoch_results."""
            # SIAN: the predicted class is the index of the largest softmax output
            pred_y = m.predict(eval_featurewise_X, verbose=True)
            pred_y = pred_y.argmax(axis=1).reshape(-1)

            print(f"Epoch {epoch} yielded predictions: {pred_y.shape}")
            for key, value in self.label_encode.items():
                pred_P = (pred_y == value)
                n_pred_P = np.count_nonzero(pred_P)
                pred_N = ~pred_P
                n_pred_N = np.count_nonzero(pred_N)

                eval_P = (eval_y == value)
                n_eval_P = np.count_nonzero(eval_P)
                eval_N = ~eval_P
                n_eval_N = np.count_nonzero(eval_N)

                TP = np.count_nonzero(pred_P & eval_P)
                FP = np.count_nonzero(pred_P & eval_N)
                TN = np.count_nonzero(pred_N & eval_N)
                FN = np.count_nonzero(pred_N & eval_P)

                sensitivity = TP / (TP + FN) if (TP + FN) > 0 else 0
                specificity = TN / (TN + FP) if (TN + FP) > 0 else 0
                balanced_accuracy = (sensitivity + specificity) / 2

                precision = TP / (TP + FP) if (TP + FP) > 0 else 0
                recall = TP / (TP + FN) if (TP + FN) > 0 else 0

                f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
                print(f"[{key}] -- overall balanced accuracy: {balanced_accuracy * 100:.2f}%, TP = {TP:,} / {n_eval_P:,}, TN = {TN:,} / {n_eval_N:,}")

                epoch_results.append({
                    "epoch": epoch,
                    "label": key,
                    "P": n_eval_P,
                    "N": n_eval_N,
                    "pred_P": n_pred_P,
                    "pred_N": n_pred_N,
                    "TP": TP,
                    "FP": FP,
                    "TN": TN,
                    "FN": FN,
                    "bal_acc": balanced_accuracy,
                    "f1": f1_score
                })

        # SIAN: BatchYielder has its own `self`, so keep the labels in a local
        y = self.y

        class BatchYielder():
            """Produces training batches, either randomly sampled or by walking sequential cursors."""

            def __init__(self, ordered, random, rs):
                self.ordered = ordered
                self.random = random
                self.cursor_malicious = 0
                self.cursor_legit = 0
                self.rs = rs

            def get_batch(self):
                """Return ``(featurewise_X, batch_y)`` for the next training batch."""
                malicious_indices_batch = self.rs.choice(malicious_indices_train, size=n_malicious_per_batch,
                                                         replace=False) \
                    if self.random else \
                    malicious_indices_train[self.cursor_malicious:self.cursor_malicious + n_malicious_per_batch]

                legitimate_indices_batch = self.rs.choice(legit_indices_train, size=n_legit_per_batch, replace=False) \
                    if self.random else \
                    legit_indices_train[self.cursor_legit:self.cursor_legit + n_legit_per_batch]

                indices = np.concatenate([malicious_indices_batch, legitimate_indices_batch])

                # Wrap the cursors so that a full batch always fits; max(1, ...)
                # avoids a modulo by zero when a class has exactly one batch
                # worth of samples.
                self.cursor_malicious = self.cursor_malicious + n_malicious_per_batch
                self.cursor_malicious = self.cursor_malicious % max(1, len(malicious_indices_train) - n_malicious_per_batch)

                self.cursor_legit = self.cursor_legit + n_legit_per_batch
                self.cursor_legit = self.cursor_legit % max(1, len(legit_indices_train) - n_legit_per_batch)

                X = get_windows_for_indices(indices, self.ordered)
                # each x in X contains a dataframe, with window_size rows and all the features of the flows. There are batch_size of these.

                # we have a dataframe containing batch_size x (window_size, features)
                # we actually want a result of features x (batch_size, sequence_length, feature_dimension)
                featurewise_X = samplewise_to_featurewise(X)
                # SIAN: targets are the encoded class indices of the batch rows
                batch_y = y[indices]
                return featurewise_X, batch_y

        batch_yielder = BatchYielder(self.parameters._train_ensure_flows_are_ordered_within_windows, not self.parameters._train_draw_sequential_windows, self.rs)

        # start from +inf so that the first batch loss always counts as a decrease
        min_loss = float("inf")
        iters_since_loss_decrease = 0

        train_results = []
        final_epoch = 0

        last_print = time.time()
        elapsed_time = 0

        for epoch in range(epochs):
            final_epoch = epoch

            has_reduced_loss = False
            for step in range(steps_per_epoch):
                batch_X, batch_y = batch_yielder.get_batch()

                t0 = time.time()
                batch_results = m.train_on_batch(batch_X, batch_y)
                t1 = time.time()

                # the very first step is not timed directly; its duration is
                # approximated by doubling the time of the second step
                if epoch > 0 or step > 0:
                    elapsed_time += (t1 - t0)
                    if epoch == 0 and step == 1:
                        # include time for last "step" that we skipped with step > 0 for epoch == 0
                        elapsed_time *= 2

                # train_on_batch yields a list when metrics are configured and
                # a bare scalar otherwise; normalise so both cases are recorded
                batch_metrics = batch_results if isinstance(batch_results, list) else [batch_results]
                train_results.append(batch_metrics + [elapsed_time, epoch])

                batch_loss = batch_metrics[0]

                if time.time() - last_print > 3:
                    last_print = time.time()
                    early_stop_phrase = "" if early_stopping_patience <= 0 else f" (early stop in {early_stopping_patience - iters_since_loss_decrease:,})"
                    print(f"Epoch = {epoch:,} / {epochs:,}{early_stop_phrase}, step = {step}, loss = {batch_loss:.5f}, results = {batch_results} -- elapsed (train): {elapsed_time:.2f}s")

                if batch_loss < min_loss:
                    has_reduced_loss = True
                    min_loss = batch_loss

            if has_reduced_loss:
                iters_since_loss_decrease = 0
            else:
                iters_since_loss_decrease += 1

            do_early_stop = early_stopping_patience > 0 and iters_since_loss_decrease > early_stopping_patience
            is_last_epoch = epoch == epochs - 1
            # evaluation is expensive, so it only runs at epoch 6, at the end,
            # or right before an early stop
            run_eval = epoch in [6] or is_last_epoch or do_early_stop

            if run_eval:
                run_evaluation(epoch)

            if do_early_stop:
                print(f"Early stopping at epoch: {epoch}")
                break

        eval_results = pd.DataFrame(epoch_results)

        return (train_results, eval_results, final_epoch)


In [None]:
from IPython.display import display

from framework.dataset_specification import DatasetSpecification
from framework.enumerations import EvaluationDatasetSampling
from framework.flow_transformer import FlowTransformer
from framework.flow_transformer_parameters import FlowTransformerParameters

# Folder handed to load_dataset() as its cache location.
implementation_folder = "implementation"
if not os.path.exists(implementation_folder):
    os.mkdir(implementation_folder)

# Columns to use, which of them are categorical, and where the class label lives.
flow_format = DatasetSpecification(
    include_fields=[
        'NUM_PKTS_UP_TO_128_BYTES', 'SRC_TO_DST_SECOND_BYTES', 'OUT_PKTS', 'OUT_BYTES',
        'NUM_PKTS_128_TO_256_BYTES', 'DST_TO_SRC_AVG_THROUGHPUT', 'DURATION_IN', 'L4_SRC_PORT',
        'ICMP_TYPE', 'PROTOCOL', 'SERVER_TCP_FLAGS', 'IN_PKTS',
        'NUM_PKTS_512_TO_1024_BYTES', 'CLIENT_TCP_FLAGS', 'TCP_WIN_MAX_IN', 'NUM_PKTS_256_TO_512_BYTES',
        'SHORTEST_FLOW_PKT', 'MIN_IP_PKT_LEN', 'LONGEST_FLOW_PKT', 'L4_DST_PORT',
        'MIN_TTL', 'DST_TO_SRC_SECOND_BYTES', 'NUM_PKTS_1024_TO_1514_BYTES', 'DURATION_OUT',
        'FLOW_DURATION_MILLISECONDS', 'TCP_FLAGS', 'MAX_TTL', 'SRC_TO_DST_AVG_THROUGHPUT',
        'ICMP_IPV4_TYPE', 'MAX_IP_PKT_LEN', 'RETRANSMITTED_OUT_BYTES', 'IN_BYTES',
        'RETRANSMITTED_IN_BYTES', 'TCP_WIN_MAX_OUT', 'L7_PROTO', 'RETRANSMITTED_OUT_PKTS',
        'RETRANSMITTED_IN_PKTS',
    ],
    categorical_fields=[
        'CLIENT_TCP_FLAGS', 'L4_SRC_PORT', 'TCP_FLAGS', 'ICMP_IPV4_TYPE', 'ICMP_TYPE',
        'PROTOCOL', 'SERVER_TCP_FLAGS', 'L4_DST_PORT', 'L7_PROTO',
    ],
    class_column="Attack",
    benign_label="Benign",
)

# Pipeline components
pre_processing = StandardPreProcessing(n_categorical_levels=32)
encoding = RecordLevelEmbed(64)
transformer = BasicTransformer(n_layers=2, internal_size=128, n_heads=2)
classification_head = LastTokenClassificationHead()

# SIAN: assemble the multi-class transformer from the components above
ft = FlowTransformerMultiClass(
    pre_processing=pre_processing,
    input_encoding=encoding,
    sequential_model=transformer,
    classification_head=classification_head,
    params=FlowTransformerParameters(window_size=8, mlp_layer_sizes=[128], mlp_dropout=0.1),
)

# SIAN: load NF-UNSW-NB15-v2, holding out the last 10% of rows for evaluation
df = ft.load_dataset(
    "UNSW-NB15",
    data_path + datasets[1],
    specification=flow_format,
    evaluation_dataset_sampling=EvaluationDatasetSampling.LastRows,
    evaluation_percent=0.1,
    cache_path=implementation_folder,
)

# SIAN: map every distinct label found in the data to an integer class index
labels = np.unique(ft.y)
ft.multiclass_encoding(labels)

# Build the model and show its layers
m = ft.build_model()
m.summary()

# Integer class targets, hence the sparse categorical cross-entropy loss
m.compile(
    optimizer="adam",
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
    jit_compile=True,
)

train_results, eval_results, final_epoch = ft.evaluate(
    m,
    batch_size=128,
    early_stopping_patience=5,
    epochs=5,
    steps_per_epoch=64,
)

Using cache file path: implementation/UNSW-NB15_0_QdLmZHuh8yOmlGcKBEkf7hepImY0_5EjmvToFWKee8t20u0dFpVzNu4s0.feather
Reading directly from cache implementation/UNSW-NB15_0_QdLmZHuh8yOmlGcKBEkf7hepImY0_5EjmvToFWKee8t20u0dFpVzNu4s0.feather...


Building eval dataset...
Splitting dataset to featurewise...
Evaluation dataset is built!
Evaluation dataset size: 239027
Label: Analysis(0)
Positive samples in eval set: 348
Negative samples in eval set: 238679
Label: Backdoor(1)
Positive samples in eval set: 375
Negative samples in eval set: 238652
Label: Benign(2)
Positive samples in eval set: 222756
Negative samples in eval set: 16271
Label: DoS(3)
Positive samples in eval set: 983
Negative samples in eval set: 238044
Label: Exploits(4)
Positive samples in eval set: 5630
Negative samples in eval set: 233397
Label: Fuzzers(5)
Positive samples in eval set: 3566
Negative samples in eval set: 235461
Label: Generic(6)
Positive samples in eval set: 2822
Negative samples in eval set: 236205
Label: Reconnaissance(7)
Positive samples in eval set: 2267
Negative samples in eval set: 236760
Label: Shellcode(8)
Positive samples in eval set: 249
Negative samples in eval set: 238778
Label: Worms(9)
Positive samples in eval set: 31
Negative sample