In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Hardware resources

**System Information:** We begin by gathering information about the hardware resources available for our experiment, including CPU, RAM, and GPU details.

In [None]:
import os
from psutil import virtual_memory
from tabulate import tabulate

# Function to get CPU information
def get_cpu_info():
    cpu_info = os.popen('lscpu').read()
    return cpu_info

# Function to get RAM information
def get_ram_info():
    ram = virtual_memory()
    total_ram = f"{ram.total / 1e9:.2f} GB"
    available_ram = f"{ram.available / 1e9:.2f} GB"
    return total_ram, available_ram

# Function to get GPU information
def get_gpu_info():
    # Execute nvidia-smi and get its output
    gpu_info = os.popen('nvidia-smi --query-gpu=name,memory.total,memory.used,memory.free --format=csv,noheader,nounits').read().strip()

    # Split the output to get individual GPU details
    details = gpu_info.split(", ")

    # Return GPU name, total, used, and free memory
    return details[0], f"{details[1]} MB", f"{details[2]} MB", f"{details[3]} MB"

# Collect system information
cpu_info = get_cpu_info()
total_ram, available_ram = get_ram_info()
try:
  gpu_name, gpu_total_memory, gpu_used_memory, gpu_free_memory = get_gpu_info()
except:
  gpu_name, gpu_total_memory, gpu_used_memory, gpu_free_memory = 'null',0,0,0

# Extract relevant CPU information
cpu_type = ""
cpu_architecture = ""

for line in cpu_info.splitlines():
    if "Model name:" in line:
        cpu_type = line.split(":")[1].strip()
    elif "Architecture:" in line:
        cpu_architecture = line.split(":")[1].strip()

# Create a table
table = [
    ["CPU Type", cpu_type],
    ["CPU Architecture", cpu_architecture],
    ["Total RAM", total_ram],
    ["Available RAM", available_ram],
    ["GPU Name", gpu_name],
    ["GPU Total Memory", gpu_total_memory],
    ["GPU Used Memory", gpu_used_memory],
    ["GPU Free Memory", gpu_free_memory]
]

# Display the table
print(tabulate(table, headers=["Characteristic", "Value"], tablefmt="pretty"))


+------------------+--------------------------------+
|  Characteristic  |             Value              |
+------------------+--------------------------------+
|     CPU Type     | Intel(R) Xeon(R) CPU @ 2.20GHz |
| CPU Architecture |             x86_64             |
|    Total RAM     |            54.76 GB            |
|  Available RAM   |            52.92 GB            |
|     GPU Name     |            Tesla T4            |
| GPU Total Memory |            15360 MB            |
| GPU Used Memory  |              0 MB              |
| GPU Free Memory  |            15101 MB            |
+------------------+--------------------------------+


# Introduction

In this experiment, we aim to analyze and process multiple IDS-type datasets related to network traffic and evaluate deep learning models on these datasets. The primary goal is to evaluate the aplicability of DL tehniques on the selected network intrusion datasets. To accomplish this, we will perform the following steps:

1. **Data Loading and Preprocessing:** We load various datasets from different sources, including CSV and Parquet files. Each dataset may have different columns and formats. We unify the column names, handle missing values, and preprocess the data to make it suitable for analysis.

2. **Feature Engineering:** We introduce a new feature called "Efficiency" to the CTU-13 dataset, which encapsulates the notion of data transfer efficiency in network traffic. This feature is calculated based on the ratio of source bytes to total bytes per packet and the total number of packets.

3. **Feature Selection:** For each dataset, we select the top 10 features that are deemed most relevant for analysis. We employ feature selection techniques to ensure that the most informative features are considered in model training.

4. **Model Training and Evaluation:** We train machine learning models, including Feedforward Neural Networks (FFN) and TabNet, on the processed datasets. The goal is to obtain relevant comparable metrics that will help us find the most relevant characteristics of a good dataset. We evaluate the models on the labeled selected feature vectores, using metrics such as F1 score and accuracy.

5. **Results and Comparison:** We save the trained models, F1 scores, and accuracy scores for further analysis. 

6. **Conclusion:**
By following these steps, we aim to gain a better understanding of network traffic datasets and assess the suitability of different models for this domain.

In [None]:
import pandas as pd
import pyarrow.parquet as pq
import dask.dataframe as dd
import os
import glob

dfs = {}

# Define the columns for Kyoto2006+ to reduce memory consumption
columns_to_load = ['Duration', 'Same srv rate', 'Srv serror rate', 'Dst host count',
                   'Dst host srv count', 'Dst host srv serror rate', 'Flag',
                   'IDS detection', 'Source Port Number', 'Protocol', 'Label']


# Unify the label column names
new_column_names = {' Label': 'Label', 'label': 'Label', 'Tag': 'Label'}

file_paths = ['/content/drive/MyDrive/KDD99-BM/kdd99_encoded.csv',
              '/content/drive/MyDrive/NSL-KDD-BM/nsl_kdd_encoded.csv',
              '/content/drive/MyDrive/CTU13-BM/ctu13_scaled.csv',
              '/content/drive/MyDrive/ISCXIDS2012/ISCXIDS2012_encoded.csv',
              '/content/drive/MyDrive/CIC-IDS2017-BM/cicids2017_encoded.csv',
              '/content/drive/MyDrive/CSE-CIC-IDS2018-BM/Encoded/',
              '/content/drive/MyDrive/CIDDS-001-BM/cidds001_encoded.csv',
              '/content/drive/MyDrive/CIDDS-002-BM/cidds002_encoded.csv',
              '/content/drive/MyDrive/Kyoto2006+/Kyoto2015_encoded.parquet']

for item in file_paths:
    file_extension = os.path.splitext(item)[1].lower()
    df_name = os.path.splitext(os.path.basename(item))[0]

    if file_extension == '.csv':
        try:
            # Attempt to read as CSV
            df = dd.read_csv(item)
        except (pd.errors.ParserError, UnicodeDecodeError):
            pass  # Continue to the next file if CSV reading fails

    elif file_extension == '.parquet':
        try:
            # Read as Parquet
            df = dd.read_parquet(item, columns=columns_to_load)

        except dd.errors.ParquetError as e:
            print(f"ParquetError: {e} - Skipping {item}")
            continue  # Skip to the next file if Parquet reading also fails

    elif file_extension == '':
      df_name = 'cse-cic-ids2018_encoded'

      # Create an empty DataFrame to store the concatenated CSE-CIC-IDS2018 CSV data
      df = None
      # Concatenate all CSV files in the CSE-CIC-IDS2018 folder into a single DataFrame
      for csv_file in glob.glob(os.path.join(item, '*.csv')):
        try:
            temp = dd.read_csv(csv_file)
            if df is None:
                df = temp
            else:
                df = dd.concat([df, temp])
        except (pd.errors.ParserError, UnicodeDecodeError):
            print(f"Error reading CSV file: {csv_file}")
            continue

    else:
        print(f"Unsupported file type for {item} - Skipping")
        continue  # Skip unsupported file types

    # Print processed dataframe
    print(f"Read df from {df_name}")

    # Rename the columns
    df = df.rename(columns=new_column_names)

    # Print column names
    #print(df.columns)

    # Add the combined DataFrame to the dictionary
    dfs[df_name] = df


Read df from kdd99_encoded
Read df from nsl_kdd_encoded
Read df from ctu13_scaled
Read df from ISCXIDS2012_encoded
Read df from cicids2017_encoded
Read df from cse-cic-ids2018_encoded
Read df from cidds001_encoded
Read df from cidds002_encoded
Read df from Kyoto2015_encoded


In [None]:
for key in dfs.keys():
    # Get the dataframe corresponding to the key
    df = dfs[key]

    # Compute memory usage (Dask computation)
    memory_usage = df.memory_usage(deep=True).sum().compute()

    # Display memory usage
    print(f"DataFrame Name: {key}")
    print(f"Memory Usage (MB): {memory_usage / (1024 ** 2)}")

    # Display data types
    print("Data Types:")
    print(df.dtypes)
    print()


DataFrame Name: kdd99_encoded
Memory Usage (MB): 1008.7814712524414
Data Types:
duration          float64
src_bytes         float64
dst_bytes         float64
land              float64
wrong_fragment    float64
                   ...   
flag_S1             int64
flag_S2             int64
flag_S3             int64
flag_SF             int64
flag_SH             int64
Length: 123, dtype: object

DataFrame Name: nsl_kdd_encoded
Memory Usage (MB): 119.16677856445312
Data Types:
duration          float64
src_bytes         float64
dst_bytes         float64
land              float64
wrong_fragment    float64
                   ...   
flag_S1             int64
flag_S2             int64
flag_S3             int64
flag_SF             int64
flag_SH             int64
Length: 124, dtype: object

DataFrame Name: ctu13_scaled
Memory Usage (MB): 717.4744873046875
Data Types:
dur          float64
proto        float64
dir          float64
state        float64
stos         float64
dtos         float64
tot_pk

In [None]:
# Print out the DataFrames loaded in the memory
%whos DataFrame

Variable   Type         Data/Info
---------------------------------
df         DataFrame    Dask DataFrame Structure:<...>e: rename, 2 graph layers
temp       DataFrame    Dask DataFrame Structure:<...>: read-csv, 1 graph layer


In [None]:
del df
del temp

Given that the CTU-13 dataset has only 9 features we would like to introduse a feature that encapsulates the notion of ***efficiency*** of the traffic to bring insight into our data.

Efficiency here can be conceptualized as the amount of source data transferred per packet, potentially indicating how effectively the network is being utilized for the given protocol.

This approach is encouraged by the several numerical features representing different aspects of network traffic, such as duration, protocol, direction, state, source and destination tos, total packets, total bytes, and source bytes.

Here are the arguments that stand behide introduction of the "Efficiency" feature:

1. **Relevance**: In network data, understanding how much original data (source bytes) is sent compared to the overall traffic (total bytes) can indicate the efficiency of data transmission, where higher source data with fewer total bytes is generally more desirable.
   
2. **Innovation**: While simple ratios are common, this feature could be further refined by considering the protocol type, as different protocols may have different efficiency expectations.

3. **Mathematical Foundation**: The ratio of source bytes to total bytes per packet could be modeled as an efficiency score. However, this ratio might be influenced by outliers or extreme values, so applying a logarithmic transformation can stabilize variance and reduce the impact of outliers.

4. **Benchmarking**: With an equal number of features across datasets, we can more effectively compare model performance and ensure that any performance differences are not due to variations in the number of input signals (features) the model is receiving.

Mathematical representation of the "Efficiency" feature:

$$
\mathcal{E}_{\text{fficiency}} = \log\left( \max\left( \frac{\text{src\_bytes} + \epsilon}{\text{tot\_bytes} + \epsilon} \times \text{tot\_pkts}, \epsilon \right) \right), where\ \epsilon\ is\ a\ small\ constant\ added\ to\ avoid\ division\ by\ zero\ or\ log\ of\ zero,\ set\ to\ \epsilon=1 \times 10^{-1}
$$


This formula calculates the logarithm of the product of the ratio of source bytes to total bytes and the total number of packets. The logarithm helps to scale down the values and to manage the skewness of data distribution.

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif
import numpy as np
import pandas as pd
import dask.dataframe as dd

# Define the function to calculate efficiency for each partition of the Dask DataFrame
def calculate_efficiency(tot_bytes, src_bytes, tot_pkts, epsilon=1e-10):
    # Ensure that the argument of the log is always greater than zero
    argument = (src_bytes + epsilon) / (tot_bytes + epsilon) * tot_pkts
    # Clip the argument to be no less than epsilon to avoid log(0)
    argument = np.clip(argument, epsilon, None)
    return np.log(argument)

# Define a function to select the best 10 features for a dataset
def select_best_features(df):
    y = df['Label']
    X = df.drop(columns='Label')
    selector = SelectKBest(score_func=f_classif, k=10)
    X_selected = selector.fit_transform(X, y)
    selected_features = X.columns[selector.get_support(indices=True)]
    print(f"Selected 10 best features for {df.name}: {selected_features}")
    return pd.DataFrame(X_selected, columns=selected_features), y

# Define the list of datasets and corresponding feature columns based on previous work
best_features = {
    'kdd99_encoded': ['count', 'serror_rate', 'srv_serror_rate', 'same_srv_rate',
                      'dst_host_same_srv_rate', 'dst_host_serror_rate',
                      'dst_host_srv_serror_rate', 'service_private', 'flag_S0', 'flag_SF'],

    'nsl_kdd_encoded':  ['logged_in', 'serror_rate', 'srv_serror_rate', 'same_srv_rate',
                         'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_serror_rate',
                         'dst_host_srv_serror_rate', 'flag_S0', 'flag_SF'],

    'ctu13_scaled':  ['dur', 'proto', 'dir', 'state', 'stos', 'dtos', 'tot_pkts', 'tot_bytes',
                      'src_bytes'],

    'ISCXIDS2012_encoded': ['appName',  'totalDestinationBytes',
                            'totalDestinationPackets', 'totalSourcePackets', 'direction', 'source',
                            'protocolName', 'sourcePort', 'destination', 'destinationPort'],

    'cicids2017_encoded': ['Bwd Packet Length Max', ' Bwd Packet Length Mean',
                          ' Bwd Packet Length Std', ' Fwd IAT Std', ' Max Packet Length',
                           ' Packet Length Mean', ' Packet Length Std', ' Packet Length Variance',
                           ' Average Packet Size', ' Avg Bwd Segment Size'],

    'cse-cic-ids2018_encoded': ['Fwd Pkt Len Mean', 'Fwd Pkt Len Std', 'Fwd Seg Size Avg',
                                'Init Bwd Win Byts', 'Fwd Seg Size Min', 'Protocol', 'Bwd Pkt Len Max',
                                 'Bwd Pkt Len Min', 'Pkt Len Min', 'Dst Port'],

    'cidds001_encoded':  ['duration', 'proto', 'packets', 'bytes', 'tcp_ack', 'tcp_psh',
                           'tcp_rst', 'tcp_syn', 'tcp_fin', 'tos'],

    'cidds002_encoded': ['duration', 'proto', 'packets', 'bytes', 'tcp_ack', 'tcp_psh',
                          'tcp_rst', 'tcp_syn', 'tcp_fin', 'tos'],

     'Kyoto2015_encoded': ['Duration', 'Same srv rate', 'Srv serror rate', 'Dst host count',
                           'Dst host srv count', 'Dst host srv serror rate', 'Flag',
                            'IDS detection', 'Source Port Number', 'Protocol']
    }

# Assuming dfs is a dictionary with dataset names as keys and Dask DataFrames as values
for name, df in dfs.items():
    print(f"Processing DataFrame: '{name}' ")

    if name in best_features:
        if len(best_features[name]) == 10:
          if name == "Kyoto2015_encoded":
            pass
          else:
            best_features[name].append('Label')
            dfs[name] = dd.from_pandas(pd.DataFrame(columns=best_features[name]), npartitions=1)
            dfs[name] = df[best_features[name]]
        elif len(best_features[name]) > 10:
            X, y = select_best_features(df.compute())  # convert to pandas df for feature selection
            dfs[name] = dd.from_pandas(pd.concat([X, y], axis=1), npartitions=df.npartitions)
        else:
            # Calculate the 'Efficiency' feature using a custom function
            df['Efficiency'] = df.map_partitions(
                lambda partition: partition.apply(
                    lambda row: calculate_efficiency(row['tot_bytes'], row['src_bytes'], row['tot_pkts']),
                    axis=1
                ),
                meta=('Efficiency', float)
            )

    else:
        print(f"No feature list found for {name}, skipping...")

    # Compute memory usage (Dask computation)
    memory_usage = df.memory_usage(deep=True).sum().compute()
    print(f"Memory Usage for '{name}' (MB): {memory_usage / (1024 ** 2):.2f}\n")


Processing DataFrame: 'kdd99_encoded' 
Memory Usage for 'kdd99_encoded' (MB): 90.22

Processing DataFrame: 'nsl_kdd_encoded' 
Memory Usage for 'nsl_kdd_encoded' (MB): 10.57

Processing DataFrame: 'ctu13_scaled' 
Memory Usage for 'ctu13_scaled' (MB): 789.22

Processing DataFrame: 'ISCXIDS2012_encoded' 
Memory Usage for 'ISCXIDS2012_encoded' (MB): 136.19

Processing DataFrame: 'cicids2017_encoded' 
Memory Usage for 'cicids2017_encoded' (MB): 211.66

Processing DataFrame: 'cse-cic-ids2018_encoded' 
Memory Usage for 'cse-cic-ids2018_encoded' (MB): 1030.91

Processing DataFrame: 'cidds001_encoded' 
Memory Usage for 'cidds001_encoded' (MB): 365.88

Processing DataFrame: 'cidds002_encoded' 
Memory Usage for 'cidds002_encoded' (MB): 221.79

Processing DataFrame: 'Kyoto2015_encoded' 
Memory Usage for 'Kyoto2015_encoded' (MB): 9699.42



In [None]:
import os

save_folder = '/content/drive/MyDrive/10fDFs'

for key, df in dfs.items():
    save_path = os.path.join(save_folder, f'{key}.parquet')
    df.to_parquet(save_path)


In [2]:
!pip install pytorch-tabnet --quiet
!pip install dask_ml --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.5/44.5 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m148.7/148.7 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.0/81.0 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import os
import dask.dataframe as dd

# Directory containing Parquet files
directory_path = '/content/drive/MyDrive/10fDFs'

# Dictionary to store Dask DataFrames
dfs = {}

# List all Parquet files in the directory
parquet_files = [filename for filename in os.listdir(directory_path) if filename.endswith('.parquet')]

# Loop through Parquet files and read them into the dictionary
for filename in parquet_files:
    # Extract the name without the extension
    name_without_extension = os.path.splitext(filename)[0]

    # Construct the full path to the Parquet file
    file_path = os.path.join(directory_path, filename)

    # Read the Parquet file into a Dask DataFrame
    ddf = dd.read_parquet(file_path)

    # Store the Dask DataFrame in the dictionary
    dfs[name_without_extension] = ddf

    # Print the columns of the Dask DataFrame
    print(f'Columns of {filename}:')
    print(ddf.columns)
    print('-' * 50)  # Add a separator for clarity


Columns of kdd99_encoded.parquet:
Index(['count', 'serror_rate', 'srv_serror_rate', 'same_srv_rate',
       'dst_host_same_srv_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'service_private', 'flag_S0', 'flag_SF',
       'Label'],
      dtype='object')
--------------------------------------------------
Columns of nsl_kdd_encoded.parquet:
Index(['logged_in', 'serror_rate', 'srv_serror_rate', 'same_srv_rate',
       'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'flag_S0', 'flag_SF', 'Label'],
      dtype='object')
--------------------------------------------------
Columns of ctu13_scaled.parquet:
Index(['dur', 'proto', 'dir', 'state', 'stos', 'dtos', 'tot_pkts', 'tot_bytes',
       'src_bytes', 'Label', 'Efficiency'],
      dtype='object')
--------------------------------------------------
Columns of ISCXIDS2012_encoded.parquet:
Index(['appName', 'totalDestinationBytes', 'totalDestinationPackets',
       'to

In [4]:
import dask.dataframe as dd

# Assuming dfs is your dictionary of datasets and 'Kyoto2015_encoded' is a Dask DataFrame
original_df = dfs['Kyoto2015_encoded']

# Sample 1% of the data
sampled_df = original_df.sample(frac=0.01, random_state=42)

# Replace the original dataframe with the sampled dataframe in the dictionary
dfs['Kyoto2015_encoded'] = sampled_df


In [5]:
import dask.dataframe as dd
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch import nn
from torch.optim import Adam
import os
import pickle
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split
import numpy as np
import torch.nn.functional as F

# Define the FFN model architecture using PyTorch
class FFN(nn.Module):
    def __init__(self, input_dim):
        super(FFN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))
        return x

# Define a function to train a model using DataLoader
def train_dl_model(X, y, model, model_name, key, epochs, batch_size, save_path):
    # Split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Convert to PyTorch tensors
    X_train_tensor = torch.tensor(X_train.astype(np.float32))
    y_train_tensor = torch.tensor(y_train.astype(np.float32))
    X_test_tensor = torch.tensor(X_test.astype(np.float32))
    y_test_tensor = torch.tensor(y_test.astype(np.float32))

    # Create TensorDatasets and DataLoaders
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor.view(-1, 1))
    test_dataset = TensorDataset(X_test_tensor, y_test_tensor.view(-1, 1))
    train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

    # Set the model to training mode
    model.train()
    optimizer = Adam(model.parameters(), lr=0.001)

    for epoch in range(epochs):
        for batch_idx, (data, targets) in enumerate(train_loader):
            # Forward pass
            data, targets = data.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(data)

            # Compute loss
            loss = F.binary_cross_entropy(outputs, targets)
            # Backward pass
            loss.backward()
            optimizer.step()

            # Print loss every 100 batches
            if (batch_idx + 1) % 100 == 0:
                print(f'Epoch [{epoch+1}/{epochs}], Batch [{batch_idx+1}], Loss: {loss.item():.4f}')

    # Set the model to evaluation mode
    model.eval()
    # Predict and evaluate using F1 score and accuracy
    y_pred = get_ffn_predictions(test_loader, model)
    f1 = f1_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred.round())
    print(f"F1 Score for {model_name} on '{key}': {f1:.4f}")
    print(f"Accuracy for {model_name} on '{key}': {accuracy:.4f}")

    # Save the model
    torch.save(model.state_dict(), save_path)

    # Return both F1 and accuracy scores
    return f1, accuracy

# Define a function to get predictions from the model
def get_ffn_predictions(dataloader, model):
    model.eval()  # Set model to evaluation mode
    all_predictions = []
    with torch.no_grad():
        for x, _ in dataloader:
            x = x.to(device)
            predictions = model(x)
            predicted_labels = torch.round(predictions)
            all_predictions.append(predicted_labels.cpu().numpy())
    return np.vstack(all_predictions)

# Setup device for GPU usage
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device} device.")

# Define the save path for models and scores
save_dir = '/content/drive/MyDrive/DLModels/model_saves'
os.makedirs(save_dir, exist_ok=True)

# Define the batch_size and epochs
batch_size = 32
epochs = 3

for key, ddf in dfs.items():
    # Correctly compute input_dim, handling Dask objects
    input_dim = ddf.shape[1] - 1 if isinstance(ddf.shape[1], int) else ddf.shape[1].compute() - 1

    ffn_model = FFN(input_dim).to(device)

    memory_usage = ddf.memory_usage(deep=True).sum().compute() / (1024 ** 2)  # in MB
    print(f"Processing DataFrame: '{key}'\nMemory Usage for '{key}' (MB): {memory_usage:.2f}")

    if memory_usage > 500:
        print(f"Dataset '{key}' is larger than 500 MB, will process in batches.")
        # Implement batch processing if needed

    # Convert Dask DataFrame to Pandas DataFrame since PyTorch doesn't work directly with Dask
    df = ddf.compute()
    X = df.drop('Label', axis=1).values
    y = df['Label'].values

    # Train model and get F1 and accuracy scores
    f1_score_value, accuracy_score_value = train_dl_model(
        X, y, ffn_model, 'FFN_Model', key, epochs, batch_size, os.path.join(save_dir, f'{key}_ffn_model.pth')
    )

    # Save the F1 and accuracy scores in a dictionary
    scores = {'f1': f1_score_value, 'accuracy': accuracy_score_value}
    with open(os.path.join(save_dir, f'{key}_scores.pkl'), 'wb') as f:
        pickle.dump(scores, f)

    print(f"Model, F1 score, and accuracy for '{key}' saved successfully.")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch [2/3], Batch [3600], Loss: 0.0850
Epoch [2/3], Batch [3700], Loss: 0.0272
Epoch [2/3], Batch [3800], Loss: 0.3228
Epoch [2/3], Batch [3900], Loss: 0.0135
Epoch [2/3], Batch [4000], Loss: 0.1065
Epoch [2/3], Batch [4100], Loss: 0.0704
Epoch [2/3], Batch [4200], Loss: 0.0484
Epoch [2/3], Batch [4300], Loss: 0.0787
Epoch [2/3], Batch [4400], Loss: 0.0680
Epoch [2/3], Batch [4500], Loss: 0.0934
Epoch [2/3], Batch [4600], Loss: 0.0426
Epoch [2/3], Batch [4700], Loss: 0.0807
Epoch [2/3], Batch [4800], Loss: 0.0952
Epoch [2/3], Batch [4900], Loss: 0.0524
Epoch [2/3], Batch [5000], Loss: 0.0681
Epoch [2/3], Batch [5100], Loss: 0.0484
Epoch [2/3], Batch [5200], Loss: 0.0363
Epoch [2/3], Batch [5300], Loss: 0.0224
Epoch [2/3], Batch [5400], Loss: 0.0602
Epoch [2/3], Batch [5500], Loss: 0.0550
Epoch [2/3], Batch [5600], Loss: 0.0331
Epoch [2/3], Batch [5700], Loss: 0.0999
Epoch [2/3], Batch [5800], Loss: 0.1352
Epoch [2/3], Ba

In [6]:
import os
import pickle
import torch
import warnings
from dask_ml.model_selection import train_test_split
from pytorch_tabnet.tab_model import TabNetClassifier
import dask.array as da
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

# Suppress warnings
warnings.filterwarnings('ignore')

# Define the save path for models, F1 scores, and checkpoints
save_dir = '/content/drive/MyDrive/DLModels/model_saves'
checkpoint_dir = '/content/drive/MyDrive/DLModels/checkpoints'
os.makedirs(save_dir, exist_ok=True)
os.makedirs(checkpoint_dir, exist_ok=True)

# Parameters
BATCH_SIZE = 15358  # Batch size for training
EPOCHS = 3
TARGET = 'Label'

# Function to train TabNet model using Dask dataframe
def train_tabnet_model(ddf, key, save_dir, checkpoint_dir):
    # Check if a checkpoint exists
    checkpoint_path = os.path.join(checkpoint_dir, f'{key}_tabnet_checkpoint.zip')
    if os.path.exists(checkpoint_path):
        print(f'Loading checkpoint for dataset "{key}"')
        tabnet_model = TabNetClassifier()
        tabnet_model.load_model(checkpoint_path)
    else:
        tabnet_model = TabNetClassifier(optimizer_params=dict(lr=2e-2),
                                        scheduler_params={"step_size":10, "gamma":0.9},
                                        scheduler_fn=torch.optim.lr_scheduler.StepLR,
                                        mask_type='entmax',
                                        device_name='cuda')

    # Convert the Dask DataFrame to Dask Arrays
    X_ddf = ddf[ddf.columns.difference([TARGET])]
    y_ddf = ddf[TARGET]
    X = X_ddf.to_dask_array(lengths=True)
    y = y_ddf.to_dask_array(lengths=True)

    # Split the Dask arrays into training and validation sets
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

    # Compute the validation set in advance
    X_valid_computed, y_valid_computed = X_valid.compute(), y_valid.compute()

    # Initialize a list to store accuracy scores
    accuracy_scores = []

    # Training loop
    for epoch in range(EPOCHS):
        # Shuffle indices for the training data
        shuffled_indices = da.random.permutation(y_train.shape[0])

        # Iterate over batches
        for i in range(0, len(y_train), BATCH_SIZE):
            # Calculate batch indices
            indices = shuffled_indices[i:i + BATCH_SIZE].compute()
            X_batch = X_train[indices].compute()
            y_batch = y_train[indices].compute()

            # Train the model on this batch
            tabnet_model.fit(X_batch, y_batch, max_epochs=1, eval_metric=['auc'])

        # Predict and calculate the accuracy score at the end of each epoch
        y_pred = tabnet_model.predict(X_valid_computed)
        score = accuracy_score(y_valid_computed, y_pred)
        accuracy_scores.append(score)  # Append accuracy score to the list
        print(f'Epoch {epoch+1}/{EPOCHS}, Accuracy: {score:.4f}')

        # Save a checkpoint of the model
        tabnet_model.save_model(checkpoint_path)

    # Save the final model
    save_path = os.path.join(save_dir, f'{key}_tabnet_model.zip')
    tabnet_model.save_model(save_path)

    # Calculate and save the F1 score
    f1 = f1_score(y_valid_computed, y_pred, average='binary')
    print(f'F1 Score for TabNet on dataset "{key}": {f1:.4f}')

    # Save the F1 score and accuracy scores
    metrics_save_path = os.path.join(save_dir, f'{key}_tabnet_metrics.pkl')
    with open(metrics_save_path, 'wb') as f:
        pickle.dump({'f1': f1, 'accuracy': accuracy_scores}, f)

# Assuming 'dfs' is a dictionary of datasets, process each dataset
for key, ddf in dfs.items():
    print(f'Processing dataset "{key}"')
    if key == "Kyoto2015_encoded":
      train_tabnet_model(ddf, key, save_dir, checkpoint_dir)

print("All models have been trained and saved.")


Processing dataset "kdd99_encoded"
Processing dataset "nsl_kdd_encoded"
Processing dataset "ctu13_scaled"
Processing dataset "ISCXIDS2012_encoded"
Processing dataset "cicids2017_encoded"
Processing dataset "cse-cic-ids2018_encoded"
Processing dataset "cidds001_encoded"
Processing dataset "cidds002_encoded"
Processing dataset "Kyoto2015_encoded"
epoch 0  | loss: 0.31808 |  0:00:00s
epoch 0  | loss: 0.31568 |  0:00:00s
epoch 0  | loss: 0.31159 |  0:00:00s
epoch 0  | loss: 0.3198  |  0:00:00s
epoch 0  | loss: 0.3161  |  0:00:00s
epoch 0  | loss: 0.30738 |  0:00:00s
epoch 0  | loss: 0.32724 |  0:00:00s
epoch 0  | loss: 0.31377 |  0:00:00s
epoch 0  | loss: 0.31232 |  0:00:00s
epoch 0  | loss: 0.31164 |  0:00:00s
epoch 0  | loss: 0.31352 |  0:00:00s
epoch 0  | loss: 0.31874 |  0:00:00s
epoch 0  | loss: 0.31144 |  0:00:00s
epoch 0  | loss: 0.3084  |  0:00:00s
epoch 0  | loss: 0.3168  |  0:00:00s
epoch 0  | loss: 0.30442 |  0:00:00s
epoch 0  | loss: 0.31542 |  0:00:00s
epoch 0  | loss: 0.32629

In [7]:
import os
import pickle
import pandas as pd

# Directory where the .pkl files are saved
results_dir = '/content/drive/MyDrive/DLModels/model_saves'

# Function to extract model name from the file name
def get_model_name(file_name):
    if 'tabnet' in file_name:
        return 'TabNet'
    else:
        return 'FFN'

# List all .pkl files in the directory that are pickle files
score_pkl_files = [f for f in os.listdir(results_dir) if f.endswith('.pkl')]

# Load the results from each .pkl file and create a combined list
results = []
for file_name in score_pkl_files:
    with open(os.path.join(results_dir, file_name), 'rb') as file:
        scores = pickle.load(file)
        dataset_name = file_name.split('_')[0]  # Assuming the dataset name is at the start of the file name
        model_name = get_model_name(file_name)

        try:
          # Extract F1 score and accuracy, handle the case where accuracy is a list
          f1_score = scores['f1']
          accuracy = max(scores['accuracy']) if isinstance(scores['accuracy'], list) else scores['accuracy']

          results.append((model_name, dataset_name, 'F1 Score', f1_score))
          results.append((model_name, dataset_name, 'Accuracy', accuracy))
        except:
          pass

# Create a DataFrame
results_df = pd.DataFrame(results, columns=['Model', 'Dataset', 'Metric', 'Value'])

# Pivot the DataFrame to have one row per dataset and model, and columns for each metric
results_df_pivoted = results_df.pivot_table(index=['Model', 'Dataset'], columns='Metric', values='Value', aggfunc='first').reset_index()

# Sort by Model and Dataset for readability
results_df_pivoted.sort_values(by=['Model', 'Dataset'], inplace=True)

# Reset index after sorting
results_df_pivoted.reset_index(drop=True, inplace=True)

# Display the pivoted DataFrame
print(results_df_pivoted)


Metric   Model          Dataset  Accuracy  F1 Score
0          FFN      ISCXIDS2012  0.991841  0.882332
1          FFN        Kyoto2015  0.958650  0.978498
2          FFN       cicids2017  0.964763  0.887688
3          FFN         cidds001  0.960476  0.979647
4          FFN         cidds002  0.999669  0.999834
5          FFN  cse-cic-ids2018  0.971457  0.871721
6          FFN            ctu13  0.976128  0.084161
7          FFN            kdd99  0.992181  0.983696
8          FFN              nsl  0.939229  0.934654
9       TabNet      ISCXIDS2012  0.967161  0.000000
10      TabNet        Kyoto2015  0.942747  0.970479
11      TabNet       cicids2017  0.830788  0.000094
12      TabNet         cidds001  0.953648  0.976219
13      TabNet         cidds002  0.999128  0.999564
14      TabNet  cse-cic-ids2018  0.889401  0.000000
15      TabNet            ctu13  0.975295  0.000000
16      TabNet            kdd99  0.944228  0.291046
17      TabNet              nsl  0.819752  0.761540
