# Prototyping Notebook

This notebook begins prototyping the IDS: it implements the DNN models and the FL server and client logic, and incorporates the PETs and XAI components.

## General

This section performs data loading, exploration, and preprocessing.

### Setup


In [None]:
# general
import os
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# DL
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

# PETs
from opacus import PrivacyEngine

# XAI
import shap
from captum.attr import IntegratedGradients
from captum.attr import LayerConductance
from captum.attr import NeuronConductance

"""
# FL
import flwr
from flwr.client import Client, ClientApp, NumPyClient
from flwr.common import Metrics, Context
from flwr.server import ServerApp, ServerConfig, ServerAppComponents
from flwr.server.strategy import FedAvg
from flwr.simulation import run_simulation

NUM_CLIENTS = 10
BATCH_SIZE = 32
"""

# Number of passes over the training data used by the training loops below.
epochs = 5
# Run on the GPU when PyTorch reports one as available; otherwise use the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


### Data Preprocessing

- Load data
- Analyse
  - Display shape, nulls, and dtypes
  - Analyse categorical and numerical splits
  - Detect sparsity and constant columns
  - Detect correlations and important features
  - Identify outliers or skewed distributions
  - Analyse target distributions
- Preprocess
  - Remove redundant columns
  - Remove duplicate values
  - Remove sparse columns (those with a high proportion of zero values)
  - Encode categorical columns
  - Scale numerical columns
  - Transform to tensor
  - Wrap with DataLoader


#### Helper Functions

In [None]:
def summarise_df(df: pd.DataFrame):
    """Print a structural overview of ``df``.

    Printed in order: the shape, the number of columns per dtype, the null
    count of every column that contains nulls, the columns with at most one
    distinct value, the full ``DataFrame.info`` listing, and
    ``describe(include='all')``.

    Args:
        df: The frame to summarise. It is only read, never modified.
    """
    print(f"Shape: {df.shape}")

    dtype_counts = df.dtypes.value_counts()
    print("\nColumn types:\n", dtype_counts)

    # Count nulls once and report only the columns that actually have some.
    null_counts = df.isnull().sum()
    print("\nMissing values per column:\n", null_counts[null_counts > 0])

    # nunique() ignores nulls, so an all-null column (0 distinct values) is listed too.
    distinct_counts = df.nunique()
    print(f"\nConstant columns:\n{distinct_counts[distinct_counts <= 1]}\n")

    df.info(verbose=True, show_counts=True, max_cols=None)
    print(df.describe(include='all'))
    
def show_target_distribution(df: pd.DataFrame, target_col='Attack_type'):
    """Print and plot how often each value of ``target_col`` occurs.

    The counts are printed and drawn as a horizontal count plot ordered by
    frequency. If ``df`` also has an ``Attack_label`` column, a second count
    plot of that column is shown and both sets of counts are printed.

    Args:
        df: Frame containing ``target_col`` (and optionally ``Attack_label``).
        target_col: Name of the column whose distribution is shown.
    """
    class_counts = df[target_col].value_counts()
    print(class_counts)

    # Most frequent value first, matching the order of value_counts().
    sns.countplot(data=df, y=target_col, order=class_counts.index)
    plt.title(f"Distribution of {target_col}")
    plt.tight_layout()
    plt.show()

    if 'Attack_label' not in df.columns:
        return

    sns.countplot(data=df, x='Attack_label')
    plt.title("Binary Attack Label Distribution")
    plt.tight_layout()
    plt.show()
    # The target counts are printed again so they sit next to the label counts.
    print(class_counts)
    print(df['Attack_label'].value_counts())

def plot_correlation_heatmap(df: pd.DataFrame, threshold: float = 0.9):
    """Report strongly correlated numeric feature pairs and plot the heatmap.

    Args:
        df: Frame to analyse; only its numeric columns are used.
        threshold: A pair is reported when the absolute value of its
            correlation is strictly greater than this value.
    """
    corr = df.select_dtypes(include='number').corr()

    # Keep only the strict upper triangle so each pair is listed once and the
    # diagonal (a column against itself) is left out.
    upper_triangle = np.triu(np.ones(corr.shape), k=1).astype(bool)
    high_corr = (corr.where(upper_triangle)
                 .stack()
                 .reset_index()
                 .rename(columns={0: 'correlation'})
                 .query('abs(correlation) > @threshold'))
    # Report the threshold that was actually applied rather than a fixed 0.9.
    print(f"Highly correlated features (>|{threshold}|):\n", high_corr)

    # plot
    plt.figure(figsize=(12, 10))
    sns.heatmap(corr, cmap='coolwarm', center=0)
    plt.title("Feature Correlation Heatmap")
    plt.show()

def visualise_df(df: pd.DataFrame):
    """Run the full inspection of ``df``: summary, target distribution, correlations."""
    inspection_steps = (
        summarise_df,
        show_target_distribution,
        plot_correlation_heatmap,
    )
    # Each step is called with its default arguments, in this order.
    for step in inspection_steps:
        step(df)


#### Data Loading & Inspection

In [None]:
# Alternative dataset files, kept for quick switching; only one path is active.
# data_path = 'dataset/edge-iiotset/eval/DNN-EdgeIIoT-dataset.csv'
# data_path = 'dataset\ciciot2023\MERGED_CSV\Merged01.csv'
data_path = 'dataset/edge-iiotset/eval/ML-EdgeIIoT-dataset.csv'

# low_memory=False makes pandas read the file in one pass instead of in chunks.
df = pd.read_csv(
    data_path,
    low_memory=False,
    encoding='utf-8',
)

visualise_df(df)


#### Dropping Irrelevant Columns

In [None]:
# Columns judged not useful, kept as two groups so that each drop is reported
# separately.
original_drop_columns = [
    "frame.time",
    "ip.src_host",
    "ip.dst_host",
    "arp.src.proto_ipv4",
    "arp.dst.proto_ipv4",
    "http.file_data",
    "http.request.full_uri",
    "icmp.transmit_timestamp",
    "http.request.uri.query",
    "tcp.options",
    "tcp.payload",
    "tcp.srcport",
    "tcp.dstport",
    "udp.port",
    "mqtt.msg",
]
additional_drop_columns = ["icmp.unused", "http.tls_port", "dns.qry.type"]

# Shape before and after, to confirm how many columns were removed.
print(df.shape)
for drop_group in (original_drop_columns, additional_drop_columns):
    print(f"Dropping {len(drop_group)} columns: ", drop_group)
    df.drop(drop_group, axis=1, inplace=True)
print(df.shape)

# Count the rows holding at least one NaN, then discard them.
rows_with_nan = df.isna().any(axis=1).sum()
print(rows_with_nan, "rows with at least one NaN to remove")

df = df.dropna(axis=0, how='any')


#### Sparse Columns

In [None]:
# Accumulators filled while scanning the columns:
#   sparse_columns  - (column, zero ratio) pairs for columns removed as too sparse
#   numerical_col   - names of columns treated as numerical
#   categorical_col - names of columns treated as categorical
sparse_columns, numerical_col, categorical_col = [], [], []

def convert_to_float(value):
    """Coerce a single cell value to ``float``.

    Zero-like values (``0``, ``0.0``, ``'0'``, ``'0.0'``) and ``None`` become
    ``0.0``. Anything else goes through ``float()``; a value that ``float()``
    rejects yields ``None`` instead of raising.
    """
    zero_like = [0, 0.0, '0', '0.0', None]
    if value not in zero_like:
        try:
            return float(value)
        except (ValueError, TypeError):
            # not interpretable as a number
            return None
    return float(0)

for col in df.columns:
    # apply the conversion function to all columns (the label columns included)
    df[col] = df[col].apply(convert_to_float)
    if col in ('Attack_label', 'Attack_type'):
        # label columns are never dropped or classified as features
        continue
    try:
        # calculate sparsity: share of rows whose value is exactly 0
        zero_ratio = df[col].eq(0).sum() / len(df)
        if zero_ratio >= 0.90:
            sparse_columns.append((col, zero_ratio))
            df.drop(col, axis=1, inplace=True)
            continue

        # computed once and reused by both checks below
        distinct_values = df[col].nunique()
        # remove constant columns
        if distinct_values == 1:
            print("dropping constant column: ", col)
            df.drop(col, axis=1, inplace=True)
        # classify as numerical or categorical for later use
        elif df[col].dtype == object or distinct_values <= 10:
            categorical_col.append(col)
        else:
            numerical_col.append(col)
    except Exception as exc:
        # Still best-effort: a problematic column is skipped, but it is now
        # reported, and KeyboardInterrupt/SystemExit are no longer swallowed.
        print(f"skipping column {col!r}: {exc}")
        continue

# Display the results
print("Sparse columns: ", len(sparse_columns))
for col, ratio in sparse_columns:
    print(f"{col}: {ratio:.2%} zeros")

print("New DF Shape: " , df.shape)


In [None]:
# Show the value frequencies of every column classified as categorical.
for cat_name in categorical_col:
    frequencies = df[cat_name].value_counts()
    print(frequencies)
    

#### Drop Duplicate Rows


In [None]:
# Shape before and after, to confirm how many rows were removed.
print(df.shape)

duplicate_count = df.duplicated().sum()
print(duplicate_count, "fully duplicate rows to remove")

# Rows are compared on all columns; the first occurrence of each is kept.
df.drop_duplicates(keep="first", subset=None, inplace=True)
print(df.shape)


#### Feature Correlation with Label

In [None]:
# display the correlation of all columns with attack_label
# Sorted with the highest correlation first. The result is not assigned: it is
# left as the cell's final bare expression, presumably so the notebook renders it.
df.corrwith(df['Attack_label']).sort_values(ascending=False)


#### Remove Category Column

In [None]:
# Only the binary label is kept as the target, so the attack category column
# is removed from the frame.
df.drop(['Attack_type'], axis=1, inplace=True)

# Target vector, and every remaining column as the feature matrix.
labels = df['Attack_label']
features = df.drop('Attack_label', axis=1)


### Tensorise and Wrap with DataLoader

In [None]:
batch_size = 64
# 80/20 split expressed as absolute row counts.
train_size = int(len(df) * 0.8)
test_size = len(df) - train_size

# tensorise: long class labels and float32 features
labels = torch.tensor(labels.values, dtype=torch.long)
features = torch.tensor(features.values, dtype=torch.float32)

# `inputs` receives the held-out (test) features.
train_features, inputs, train_labels, test_labels = train_test_split(
    features,
    labels,
    random_state=42,
    test_size=test_size,
)

# Wrap each split in a dataset and a shuffling loader.
train_dataset = TensorDataset(train_features, train_labels)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)

test_dataset = TensorDataset(inputs, test_labels)
test_loader = DataLoader(test_dataset, shuffle=True, batch_size=batch_size)


## Phase 1 - Centralised DL


### Model

#### DNN


In [None]:
class DNN(nn.Module):
    def __init__(self, input_dim, hidden_dim=128, output_dim=2):
        """
        Fully connected classifier.

        Two hidden blocks (Linear -> ReLU -> Dropout(0.15)) of width
        ``hidden_dim`` followed by a linear output layer of width
        ``output_dim``.

        Version 1: 03/07
        - Basic Fully Connected Neural Network
        """
        super().__init__()

        layers = []
        width_in = input_dim
        # Two identical hidden blocks; only the first one sees the raw input width.
        for _ in range(2):
            layers.extend([
                nn.Linear(width_in, hidden_dim),
                nn.ReLU(),
                nn.Dropout(p=0.15),
            ])
            width_in = hidden_dim
        layers.append(nn.Linear(width_in, output_dim))

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the output of the final linear layer for input ``x``."""
        logits = self.model(x)
        return logits


#### CNN

In [None]:
class CNN(nn.Module):
    def __init__(self, input_dim=None):
        """
        1D convolutional classifier over a flat feature vector.

        Two Conv1d -> ReLU -> MaxPool1d stages (32 then 64 channels) feed a
        three-layer fully connected classifier with two outputs.

        Args:
            input_dim: Number of features per sample. When given, the first
                linear layer is sized to the flattened convolution output,
                ``64 * (input_dim // 4)``. When omitted, the layer keeps its
                earlier fixed size of 128 inputs, which only matches samples
                with 8 to 11 features.

        Raises:
            ValueError: If ``input_dim`` is below 4, because the two pooling
                stages would leave nothing to classify.

        Version 2: 07/07
        - Refined boilerplate model
        """
        super(CNN, self).__init__()

        conv_channels = 64
        if input_dim is None:
            # Backward-compatible default: the previously hard-coded size.
            flat_features = 128
        else:
            # The convolutions (kernel 3, padding 1) keep the length; each
            # pooling stage (kernel 2, stride 2) halves it, rounding down.
            pooled_len = (input_dim // 2) // 2
            if pooled_len < 1:
                raise ValueError(
                    f"input_dim must be at least 4, got {input_dim}"
                )
            flat_features = conv_channels * pooled_len

        self.feature_extractor = nn.Sequential(
            nn.Conv1d(1, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2),

            nn.Conv1d(32, conv_channels, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2)
        )

        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(flat_features, 64),
            nn.ReLU(),
            nn.Dropout(p=0.15),

            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Dropout(p=0.15),

            nn.Linear(32, 2)
        )

    def forward(self, x):
        """Classify a batch ``x`` of shape (batch, features); returns (batch, 2)."""
        x = x.unsqueeze(1) # add channel dimension
        x = self.feature_extractor(x)
        return self.classifier(x)


### Training and Testing Functions


In [None]:
def train(dataloader, model: nn.Module, loss_fn, optimiser):
    """Run one pass over ``dataloader``, updating ``model`` after every batch.

    Args:
        dataloader: Iterable of ``(X, y)`` batches with a ``dataset`` attribute.
        model: Network to optimise; it is switched to training mode first.
        loss_fn: Callable taking ``(predictions, targets)`` and returning the loss.
        optimiser: Optimiser whose ``zero_grad``/``step`` apply the update.

    On every 100th batch the current loss is printed together with
    ``(batch index + 1) * current batch size`` out of the dataset size.
    """
    total_samples = len(dataloader.dataset)
    print(f"Training model {model.__class__.__name__}")
    model.train()

    for step, (X, y) in enumerate(dataloader):
        X = X.to(device)
        y = y.to(device)

        # Forward pass and prediction error
        loss = loss_fn(model(X), y)

        # Backward pass and parameter update
        optimiser.zero_grad()
        loss.backward()
        optimiser.step()

        if step % 100 != 0:
            continue
        loss_value = loss.item()
        seen = (step + 1) * len(X)
        print(f"loss: {loss_value:>7f}  [{seen:>5d}/{total_samples:>5d}]")
            

In [None]:
def test(dataloader, model, loss_fn):
    """Evaluate ``model`` on ``dataloader`` and print accuracy and mean loss.

    Args:
        dataloader: Iterable of ``(X, y)`` batches with a ``dataset`` attribute.
        model: Network to evaluate; it is switched to evaluation mode first.
        loss_fn: Callable taking ``(predictions, targets)`` and returning the loss.

    Accuracy is the number of correct predictions divided by the dataset size;
    the loss is averaged over the number of batches.
    """
    n_samples = len(dataloader.dataset)
    n_batches = len(dataloader)
    model.eval()

    loss_sum = 0
    hits = 0
    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for X, y in dataloader:
            X = X.to(device)
            y = y.to(device)
            pred = model(X)
            loss_sum += loss_fn(pred, y).item()
            # The predicted class is the index of the largest output.
            matches = pred.argmax(1) == y
            hits += matches.type(torch.float).sum().item()

    avg_loss = loss_sum / n_batches
    accuracy = hits / n_samples
    print(f"Test Error: \n Accuracy: {(100*accuracy):>0.1f}%, Avg loss: {avg_loss:>8f} \n")


In [None]:
# Centralised baseline: fully connected network sized to the feature count.
dnn = DNN(input_dim=features.shape[1], hidden_dim=128, output_dim=2).to(device)

criterion = nn.CrossEntropyLoss()
optimiser = torch.optim.Adam(dnn.parameters(), lr=0.001)

# One training pass followed by one evaluation pass per epoch.
for epoch in range(epochs):
    train(train_loader, dnn, criterion, optimiser)
    test(test_loader, dnn, criterion)


In [None]:
# Centralised baseline: 1D convolutional network built with its default sizes.
cnn = CNN().to(device)

criterion = nn.CrossEntropyLoss()
optimiser = torch.optim.Adam(cnn.parameters(), lr=0.001)

# One training pass followed by one evaluation pass per epoch.
for epoch in range(epochs):
    train(train_loader, cnn, criterion, optimiser)
    test(test_loader, cnn, criterion)


## Phase 2 - PETs

This part runs on every device that trains a local model, because it is integrated directly into the model training loop.

### Differential Privacy


In [None]:
# Differential-privacy settings passed to `make_private` in the cells below:
# the gradient norm bound and the noise multiplier.
max_grad_norm = 1
noise_multiplier = 0.2

# DP engine; the same instance is used for both private training runs below.
privacy_engine = PrivacyEngine()



In [None]:
# Fresh DNN with its loss function and optimiser
model = DNN(input_dim=features.shape[1], hidden_dim=128, output_dim=2).to(device)
criterion = nn.CrossEntropyLoss()
optimiser = torch.optim.Adam(model.parameters(), lr=0.001)

# Wrap the model, optimiser, loss and training loader with the privacy engine.
# With grad_sample_mode="ghost" the call returns all four wrapped objects.
dnn_gc, optimiser_gc, criterion_gc, train_loader_gc = privacy_engine.make_private(
    module=model,
    optimizer=optimiser,
    criterion=criterion,
    data_loader=train_loader,
    max_grad_norm=max_grad_norm,
    noise_multiplier=noise_multiplier,
    grad_sample_mode="ghost",
)

# Train on the wrapped loader; evaluate on the untouched test loader.
for epoch in range(epochs):
    train(train_loader_gc, dnn_gc, criterion_gc, optimiser_gc)
    test(test_loader, dnn_gc, criterion_gc)


In [None]:
# Fresh CNN with its loss function and optimiser
model = CNN().to(device)
criterion = nn.CrossEntropyLoss()
optimiser = torch.optim.Adam(model.parameters(), lr=0.001)

# Wrap the model, optimiser, loss and training loader with the privacy engine.
# With grad_sample_mode="ghost" the call returns all four wrapped objects.
cnn_gc, optimiser_gc, criterion_gc, train_loader_gc = privacy_engine.make_private(
    module=model,
    optimizer=optimiser,
    criterion=criterion,
    data_loader=train_loader,
    max_grad_norm=max_grad_norm,
    noise_multiplier=noise_multiplier,
    grad_sample_mode="ghost",
)

# Train on the wrapped loader; evaluate on the untouched test loader.
for epoch in range(epochs):
    train(train_loader_gc, cnn_gc, criterion_gc, optimiser_gc)
    test(test_loader, cnn_gc, criterion_gc)
