# Prototyping Notebook

This notebook aims to begin prototyping the IDS by implementing the DNN models and the FL server and client logic, and by incorporating the PETs and XAI components.

## General

This section performs data loading, exploring, and preprocessing.

### Imports


In [None]:
# general
import os
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# DL
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

# PETs
from opacus import PrivacyEngine

# XAI
import shap
from captum.attr import IntegratedGradients
from captum.attr import LayerConductance
from captum.attr import NeuronConductance

"""
# FL
import flwr
from flwr.client import Client, ClientApp, NumPyClient
from flwr.common import Metrics, Context
from flwr.server import ServerApp, ServerConfig, ServerAppComponents
from flwr.server.strategy import FedAvg
from flwr.simulation import run_simulation

NUM_CLIENTS = 10
BATCH_SIZE = 64
"""

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Number of train/test passes performed by each training loop in this notebook.
epochs = 5


### Helper Functions

In [None]:
# remove columns that are not useful
def custom_drop_cols(df: pd.DataFrame, drop_cols: list):
    """Drop ``drop_cols`` from ``df`` in place and return the same frame.

    The shape before dropping and the list of columns being removed are
    printed first. Because the drop is in place, the caller's DataFrame is
    modified; the return value is that same object.
    """
    shape_before = df.shape
    print(shape_before)
    print(f"Dropping {len(drop_cols)} columns: ", drop_cols)

    df.drop(columns=drop_cols, inplace=True)
    return df


#### EDA


In [None]:
def summarise_df(df: pd.DataFrame):
    """Print a structural overview of ``df``.

    Reports, in order: the shape, how many columns there are of each dtype,
    the columns that contain missing values (with their counts), the columns
    with at most one distinct value, the verbose ``DataFrame.info`` listing,
    and ``describe(include='all')``. Nothing is returned.
    """
    print(f"Shape: {df.shape}")
    print("\nColumn types:\n", df.dtypes.value_counts())

    # Only columns that actually have missing values are listed.
    null_counts = df.isnull().sum()
    print("\nMissing values per column:\n",
          null_counts[null_counts > 0])

    # nunique() ignores NaN, so an all-NaN column (0 unique values) is
    # reported here as well.
    unique_counts = df.nunique()
    print(f"\nConstant columns:\n{unique_counts[unique_counts <= 1]}\n")

    df.info(verbose=True, show_counts=True, max_cols=None)
    print(df.describe(include='all'))


In [None]:
def show_target_distribution(df: pd.DataFrame, target_col='Attack_type'):
    """Print and plot the class distribution of ``target_col``.

    A horizontal count plot is drawn with classes ordered by frequency.
    When ``df`` also has an ``Attack_label`` column, a second count plot of
    that column is drawn and the value counts of both columns are printed.
    """
    print(df[target_col].value_counts())

    # Order the bars from most to least frequent class.
    class_order = df[target_col].value_counts().index
    sns.countplot(data=df, y=target_col, order=class_order)
    plt.title(f"Distribution of {target_col}")
    plt.tight_layout()
    plt.show()

    if 'Attack_label' not in df.columns:
        return

    sns.countplot(data=df, x='Attack_label')
    plt.title("Binary Attack Label Distribution")
    plt.tight_layout()
    plt.show()
    print(df[target_col].value_counts())
    print(df['Attack_label'].value_counts())



In [None]:
def plot_correlation_heatmap(df: pd.DataFrame, threshold: float = 0.9):
    """Report strongly correlated numeric feature pairs and plot the heatmap.

    Args:
        df: Data to analyse; only numeric columns take part in the
            correlation matrix.
        threshold: A pair is reported when the absolute value of its
            correlation is strictly greater than this value.

    Prints the reported pairs (one row per unordered pair) and shows a
    heatmap of the full correlation matrix. Returns nothing.
    """
    corr = df.select_dtypes(include='number').corr()
    if corr.empty:
        # Nothing to correlate; the heatmap cannot be drawn on an empty matrix.
        print("No numeric columns available for a correlation heatmap.")
        return

    # Keep the strict upper triangle only, so every pair appears once and
    # the diagonal (self-correlation) is excluded.
    upper_triangle = np.triu(np.ones(corr.shape), k=1).astype(bool)
    pairs = (corr.where(upper_triangle)
             .stack()
             .reset_index()
             .rename(columns={0: 'correlation'}))
    high_corr = pairs[pairs['correlation'].abs() > threshold]
    # Report the threshold actually used instead of a hard-coded 0.9.
    print(f"Highly correlated features (>|{threshold}|):\n", high_corr)

    # plot
    plt.figure(figsize=(12, 10))
    sns.heatmap(corr, cmap='coolwarm', center=0)
    plt.title("Feature Correlation Heatmap")
    plt.show()


#### Training and Testing Functions

In [None]:
def train(dataloader, model: nn.Module, loss_fn, optimiser):
    """Run one training pass of ``model`` over ``dataloader``.

    Each batch is moved to the notebook-level ``device``, pushed through the
    model, and used for one optimiser step. The batch loss and a running
    sample counter are printed for every 100th batch (starting with the
    first). Nothing is returned.
    """
    total_samples = len(dataloader.dataset)
    print(f"Training model {model.__class__.__name__}")
    model.train()

    for step, (inputs, targets) in enumerate(dataloader):
        inputs = inputs.to(device)
        targets = targets.to(device)

        # Forward pass and loss
        outputs = model(inputs)
        loss = loss_fn(outputs, targets)

        # Backward pass and parameter update
        optimiser.zero_grad()
        loss.backward()
        optimiser.step()

        # Progress is only reported on every 100th batch.
        if step % 100 != 0:
            continue
        loss_value = loss.item()
        seen = (step + 1) * len(inputs)
        print(f"loss: {loss_value:>7f}  [{seen:>5d}/{total_samples:>5d}]")


def test(dataloader, model, loss_fn):
    """Evaluate ``model`` on ``dataloader`` and print accuracy and mean loss.

    The model is switched to eval mode and gradients are disabled. The
    reported loss is the mean of the per-batch losses; the accuracy is the
    share of samples whose arg-max prediction equals the label. Nothing is
    returned.
    """
    sample_count = len(dataloader.dataset)
    batch_count = len(dataloader)
    model.eval()

    loss_sum = 0
    hits = 0
    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss_sum += loss_fn(outputs, targets).item()
            matches = outputs.argmax(1) == targets
            hits += matches.type(torch.float).sum().item()

    avg_loss = loss_sum / batch_count
    accuracy = hits / sample_count
    print(
        f"Test Error: \n Accuracy: {(100*accuracy):>0.1f}%, Avg loss: {avg_loss:>8f} \n")


### Data Preprocessing

- Load data
- Analyse
  - Display shape, nulls, and dtypes
  - Analyse categorical and numerical splits
  - Detect sparsity and constant columns
  - Detect correlations and important features
  - Identify outliers or skewed distributions
  - Analyse target distributions
- Preprocess
  - Remove redundant columns
  - Remove duplicate values
  - Remove high 0 value columns
  - Encode categorical columns
  - Scale numerical columns
  - Transform to tensor
  - Wrap with Dataloader


#### Loading & Inspection


In [None]:
# Dataset selection: exactly one `data_path` assignment should be active.
# Edge-IIoTset (the ML variant is the one currently loaded)
# data_path = 'dataset/edge-iiotset/eval/DNN-EdgeIIoT-dataset.csv'
data_path = 'dataset/edge-iiotset/eval/ML-EdgeIIoT-dataset.csv'

# CICIoT2023 alternative (disabled)
# data_path = 'dataset\ciciot2023\MERGED_CSV\Merged01.csv'
# low_memory=False makes pandas read the file in one pass rather than in
# chunks, so each column gets a single inferred dtype.
df = pd.read_csv(data_path, encoding='utf-8', low_memory=False)

# Initial inspection: textual summary followed by the correlation heatmap.
summarise_df(df)
plot_correlation_heatmap(df)


#### Dropping Irrelevant Columns

In [None]:
# Earlier drop list. It is not referenced by any code visible in this
# notebook; only `safe_to_drop_cols` below is applied.
original_drop_columns = [
    "frame.time",
    "ip.src_host",
    "ip.dst_host",
    "arp.src.proto_ipv4",
    "arp.dst.proto_ipv4",
    "http.file_data",
    "http.request.full_uri",
    "icmp.transmit_timestamp",
    "http.request.uri.query",
    "tcp.options",
    "tcp.payload",
    "tcp.srcport",
    "tcp.dstport",
    "udp.port",
    "mqtt.msg"
]

# new version of cols to drop
# Compared with the list above this keeps "http.request.uri.query",
# "tcp.srcport", "tcp.dstport" and "udp.port", and additionally drops
# "icmp.unused", "mqtt.msg_decoded_as" and the multi-class "Attack_type"
# column (the binary "Attack_label" column is used as the target later on).
safe_to_drop_cols = [
    "frame.time",
    "ip.src_host",
    "ip.dst_host",
    "arp.src.proto_ipv4",
    "arp.dst.proto_ipv4",
    "http.file_data",
    "http.request.full_uri",
    "icmp.transmit_timestamp",
    "tcp.options",
    "tcp.payload",
    "mqtt.msg",
    "icmp.unused",
    "mqtt.msg_decoded_as",
    "Attack_type"
]

# Drops in place; the returned frame is the same object as `df`.
df = custom_drop_cols(df, safe_to_drop_cols)


#### Sparse Columns

In [None]:
# Accumulators filled by the column loop further down in this cell.
# (column name, zero ratio) for every column dropped as sparse
sparse_columns = []
# identify numerical and categorical columns
# NOTE: nothing in this cell appends to `numerical_features` (that branch
# is commented out); the list is rebuilt after encoding.
numerical_features = []
categorical_features = []

def convert_to_float(value):
    """Return ``value`` as a float when possible, otherwise unchanged.

    Values equal to ``0``, ``0.0``, ``'0'`` or ``'0.0'`` map straight to
    ``0.0``. Anything else goes through ``float()``; values that ``float()``
    rejects (for example non-numeric strings or ``None``) are returned as-is.
    """
    zero_like = (0, 0.0, '0', '0.0')
    if value in zero_like:
        return 0.0

    try:
        converted = float(value)
    except (ValueError, TypeError):
        # Not parseable as a number: keep the original value.
        return value
    return converted

# Snapshot the column names up front: sparse columns are dropped from `df`
# inside the loop, so the live `df.columns` changes while we iterate.
for col in list(df.columns):
    # apply the conversion function to all columns (including the label)
    df[col] = df[col].apply(convert_to_float)
    if col == 'Attack_label':
        # the target column is never dropped or classified as a feature
        continue
    try:
        # calculate sparsity: share of rows whose value equals 0
        zero_ratio = df[col].eq(0).sum() / len(df)
        if zero_ratio >= 0.95:
            sparse_columns.append((col, zero_ratio))
            df.drop(col, axis=1, inplace=True)
            continue

        unique_count = df[col].nunique()
        if unique_count == 1:
            # Reported only: the drop below is commented out, so the
            # message must not claim the column was removed.
            print("constant column (not dropped): ", col)
            # df.drop(col, axis=1, inplace=True)
        # classify as numerical or categorical for later use
        elif df[col].dtype == object or unique_count <= 10:
            categorical_features.append(col)
        # else:
        #     numerical_features.append(col)
    except Exception as exc:
        # Best effort: skip a problematic column, but say so. Catching
        # Exception (not a bare except) lets a kernel interrupt through.
        print(f"Skipping column {col!r}: {type(exc).__name__}: {exc}")
        continue

# Display the results
print("Sparse columns: ", len(sparse_columns))
for col, ratio in sparse_columns:
    print(f"{col}: {ratio:.2%} zeros")

print("New DF Shape: " , df.shape)


In [None]:
# One line per categorical feature: its name and its number of distinct values.
print("Categorical features\tNumber of unique values")
for feature_name in categorical_features:
    distinct_values = df[feature_name].nunique()
    print(feature_name, "\t", distinct_values)
    

#### Separate Features and Labels

In [None]:
# The binary 'Attack_label' column is the target; every other remaining
# column is a model input.
labels = df['Attack_label']
features = df.drop(columns=['Attack_label'])


### Split into Training and Testing


In [None]:
# train and test set
# 80/20 split, stratified on the label so both parts keep the same class
# ratio; random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42, stratify=labels
)


### Encode Data


In [None]:
# import encoder
import category_encoders as ce

# Cast the categorical columns to str so every distinct value is treated
# as one category by the encoders.
X_train[categorical_features] = X_train[categorical_features].astype(str)
X_test[categorical_features] = X_test[categorical_features].astype(str)

# low_cardinality_features = ['tcp.connection.fin', 'tcp.connection.rst', 'tcp.connection.syn','tcp.flags', 'tcp.flags.ack']
low_cardinality_features = [col for col in categorical_features if X_train[col].nunique() <= 10]
# Every other categorical column is count-encoded. Deriving this list (rather
# than hard-coding 'tcp.srcport') avoids a KeyError when that column is
# missing and stops any other high-cardinality column being left as strings.
high_cardinality_features = [col for col in categorical_features if col not in low_cardinality_features]
print("Low cardinality features:\n",low_cardinality_features)

# One-hot encode the low-cardinality columns. The encoder is fitted on the
# training split only and then applied to the test split.
onehot_encoder = ce.OneHotEncoder()
train_encoded = onehot_encoder.fit_transform(X_train[low_cardinality_features])
test_encoded = onehot_encoder.transform(X_test[low_cardinality_features])

# Replace the raw columns with their one-hot expansion.
X_train.drop(low_cardinality_features, axis=1, inplace=True)
X_test.drop(low_cardinality_features, axis=1, inplace=True)

X_train = pd.concat([X_train, train_encoded], axis=1)
X_test = pd.concat([X_test, test_encoded], axis=1)

# Count-encode the high-cardinality columns (again fitted on train only).
# NOTE(review): categories that occur only in the test split are handled by
# CountEncoder's `handle_unknown` default - confirm it does not yield NaN.
count_encoder = ce.CountEncoder()
if high_cardinality_features:
    X_train[high_cardinality_features] = count_encoder.fit_transform(X_train[high_cardinality_features])
    X_test[high_cardinality_features] = count_encoder.transform(X_test[high_cardinality_features])

# Backward-compatible alias: `encoder` used to end up bound to the CountEncoder.
encoder = count_encoder


In [None]:
# Inspect the encoded splits: shape of the train split, then the test split.
for split_frame in (X_train, X_test):
    print("New DF Shape: " , split_frame.shape)

# Column names and dtypes after encoding (training split).
print("Encoded dataset:")
print(X_train.columns)
print("Data types of X_train:\n", X_train.dtypes)


In [None]:
# Columns to scale: every float64/int64 column of the encoded training
# split, in column order.
numerical_features = X_train.select_dtypes(include=['float64', 'int64']).columns.tolist()
print("Numerical features:\n", numerical_features)

# Scaling below needs at least one numeric column.
assert numerical_features


### Scale Data


In [None]:
# import minmax scaler
from sklearn.preprocessing import MinMaxScaler

# The scaler is fitted on the training split only and then applied to the
# test split, so no test statistics are used when fitting.
scaler = MinMaxScaler()
scaled_train = scaler.fit_transform(X_train[numerical_features])
scaled_test = scaler.transform(X_test[numerical_features])

# Write the scaled values back into the numeric columns of each split.
X_train[numerical_features] = scaled_train
X_test[numerical_features] = scaled_test


### Tensorise and Wrap with DataLoader

In [None]:
# Take the split sizes from the actual split instead of re-deriving them
# from a hard-coded 0.8 ratio that would silently drift from the split cell.
train_size = len(X_train)
test_size = len(X_test)
batch_size = 64

print(f"Data types: {X_train.dtypes}")

# tensorise data: float32 features, int64 class-index labels
# (CrossEntropyLoss expects integer class indices as targets)
train_features = torch.tensor(X_train.values, dtype=torch.float32)
train_labels = torch.tensor(y_train.values, dtype=torch.long)

test_features = torch.tensor(X_test.values, dtype=torch.float32)
test_labels = torch.tensor(y_test.values, dtype=torch.long)

train_dataset = TensorDataset(train_features, train_labels)
test_dataset = TensorDataset(test_features, test_labels)

# Shuffle only the training data. Evaluation does not benefit from shuffling,
# and a fixed order keeps the per-batch averaged test loss reproducible.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


## Phase 1 - Centralised DL


### Model

#### DNN


In [None]:
class DNN(nn.Module):
    """Fully connected classifier with two hidden layers.

    Version 1: 03/07
    - Basic Fully Connected Neural Network

    Layout: Linear -> ReLU -> Dropout(0.15), twice, followed by a final
    Linear layer that produces ``output_dim`` raw scores (no softmax).
    """

    def __init__(self, input_dim, hidden_dim=128, output_dim=2):
        super().__init__()
        # The layers live in a single Sequential named `model`, so the
        # state_dict keys are `model.<index>.*`.
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(p=0.15),

            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(p=0.15),

            nn.Linear(hidden_dim, output_dim),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the raw class scores for the batch ``x``."""
        scores = self.model(x)
        return scores


#### CNN

In [None]:
class CNN(nn.Module):
    """1-D convolutional binary classifier for flat feature vectors.

    Version 2: 07/07
    - Refined boilerplate model

    Two Conv1d -> ReLU -> MaxPool1d stages feed a three-layer dense
    classifier that outputs two raw class scores.

    Args:
        input_dim: Number of features per sample. When given, the first
            dense layer is sized from it at construction time. When left
            as ``None`` (the default, and the previous behaviour) an
            ``nn.LazyLinear`` infers the size on the first forward pass.

    Raises:
        ValueError: If ``input_dim`` is too small to survive both pooling
            stages (fewer than 4 features).
    """

    def __init__(self, input_dim=None):
        super().__init__()
        self.feature_extractor = nn.Sequential(
            nn.Conv1d(1, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2),

            nn.Conv1d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2)
        )

        if input_dim is None:
            # Size unknown: defer the weight shape to the first forward pass.
            first_dense = nn.LazyLinear(64)
        else:
            # The convolutions (kernel 3, padding 1) keep the length; each
            # pooling stage (kernel 2, stride 2) halves it, rounding down.
            pooled_length = (input_dim // 2) // 2
            if pooled_length < 1:
                raise ValueError(
                    f"input_dim must be at least 4, got {input_dim}")
            first_dense = nn.Linear(64 * pooled_length, 64)

        self.classifier = nn.Sequential(
            nn.Flatten(),
            first_dense,
            nn.ReLU(),
            nn.Dropout(p=0.15),

            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Dropout(p=0.15),

            nn.Linear(32, 2)
        )

    def forward(self, x):
        """Return raw class scores for ``x`` of shape (batch, features)."""
        x = x.unsqueeze(1) # add channel dimension
        x = self.feature_extractor(x)
        return self.classifier(x)


In [None]:
# Baseline (non-private) DNN: one input unit per encoded feature column.
dnn = DNN(input_dim=X_train.shape[1], hidden_dim=128, output_dim=2).to(device)

criterion = nn.CrossEntropyLoss()
optimiser = torch.optim.Adam(dnn.parameters(), lr=0.001)

# One training pass followed by one evaluation pass per epoch.
for epoch in range(epochs):
    train(train_loader, dnn, criterion, optimiser)
    test(test_loader, dnn, criterion)


In [None]:
# Baseline (non-private) CNN.
cnn = CNN()
cnn.to(device)

# CNN() uses an nn.LazyLinear whose weights are only created on the first
# forward pass. Do a single-sample dry run before building the optimiser,
# as the PyTorch lazy-module documentation recommends, so the optimiser is
# created over fully initialised parameters.
with torch.no_grad():
    cnn(train_features[:1].to(device))

criterion = nn.CrossEntropyLoss()
optimiser = torch.optim.Adam(cnn.parameters(), lr=0.001)

# One training pass followed by one evaluation pass per epoch.
for i in range(epochs):
    train(train_loader, cnn, criterion, optimiser)
    test(test_loader, cnn, criterion)


### XAI for Default Models


In [None]:
# SHAP explanation of the baseline DNN.
dnn.eval()

# First 100 training rows are the background sample for the explainer.
background = train_features[:100].to(device)
explainer = shap.GradientExplainer(dnn, background)

# The next 100 *training* rows are the samples being explained
# (the variable is called `test_data`, but it is sliced from the train set).
test_data = train_features[100:200].to(device)
shap_values = explainer.shap_values(test_data)

# Hand the plot a NumPy array rather than a torch tensor.
test_data = test_data.cpu().numpy()

shap.summary_plot(
    shap_values,
    test_data,
    class_names=["Benign", "Malicious"],
    feature_names=X_train.columns,
)


In [None]:
# Pairwise correlation and summary statistics of the encoded, scaled
# training features.
print("Feature correlation:", X_train.corr(), sep="\n")
print("\nFeature statistics:", X_train.describe(), sep="\n")


## Phase 2 - PETs

This phase would run on every device that trains a local model, because the privacy mechanism is integrated directly into the model training loop.

### Differential Privacy


In [None]:
# noise multiplier & max grad norm
# Both values are passed to `privacy_engine.make_private(...)` in the
# training cells of this phase.
noise_multiplier = 0.2
max_grad_norm = 1

# DP
# A single engine instance; both the DNN and the CNN cells of this phase
# call make_private on it.
# NOTE(review): sharing one engine presumably means one shared privacy
# accountant across both models - confirm before reporting epsilon.
privacy_engine = PrivacyEngine()



In [None]:
# Fresh DNN, loss and optimiser for the differentially private run.
model = DNN(input_dim=X_train.shape[1], hidden_dim=128, output_dim=2).to(device)
criterion = nn.CrossEntropyLoss()
optimiser = torch.optim.Adam(model.parameters(), lr=0.001)

# Wrap model, optimiser, loss and loader with Opacus. With
# grad_sample_mode="ghost" make_private also returns a wrapped criterion,
# hence the four return values.
dnn_gc, optimiser_gc, criterion_gc, train_loader_gc = privacy_engine.make_private(
    module=model,
    optimizer=optimiser,
    criterion=criterion,
    data_loader=train_loader,
    noise_multiplier=noise_multiplier,
    max_grad_norm=max_grad_norm,
    grad_sample_mode="ghost",
)

# Train on the private loader; evaluate on the ordinary test loader.
for epoch in range(epochs):
    train(train_loader_gc, dnn_gc, criterion_gc, optimiser_gc)
    test(test_loader, dnn_gc, criterion_gc)


In [None]:
# init model, optimiser & loss function
model = CNN()
model.to(device)

# CNN() uses an nn.LazyLinear whose weights are only created on the first
# forward pass. Do a single-sample dry run first, so that both the optimiser
# and the Opacus wrapper below receive a fully initialised module (the
# PyTorch lazy-module docs recommend a dry run before attaching optimisers).
with torch.no_grad():
    model(train_features[:1].to(device))

criterion = nn.CrossEntropyLoss()
optimiser = torch.optim.Adam(model.parameters(), lr=0.001)

# Wrap model, optimiser, loss and loader with Opacus. With
# grad_sample_mode="ghost" make_private also returns a wrapped criterion.
# NOTE(review): this reuses the `privacy_engine` that already wrapped the
# DNN - confirm whether a separate engine is wanted per model.
cnn_gc, optimiser_gc, criterion_gc, train_loader_gc = privacy_engine.make_private(
    module=model,
    optimizer=optimiser,
    criterion=criterion,
    data_loader=train_loader,
    noise_multiplier=noise_multiplier,
    max_grad_norm=max_grad_norm,
    grad_sample_mode="ghost"
)

# training loop
for i in range(epochs):
    train(train_loader_gc, cnn_gc, criterion_gc, optimiser_gc)
    test(test_loader, cnn_gc, criterion_gc)


## Phase 3 - XAI

This phase would be completed on the central server to evaluate the explainability of the global model.


### Create Non-DP Model Copy

Opacus DP may be incompatible with SHAP explainers.


In [None]:
def make_model_copy(new_model, old_model, target_device=None):
    """Load the weights of ``old_model`` into ``new_model`` and return it.

    Args:
        new_model: Freshly constructed module with the same architecture
            as the trained model.
        old_model: Trained model. If it exposes a ``_module`` attribute
            (as the wrapped models returned by ``make_private`` do), the
            weights are read from that inner module; otherwise they are
            read from ``old_model`` itself.
        target_device: Device to move the copy to. Defaults to the
            notebook-level ``device``.

    Returns:
        ``new_model``, in eval mode, on the target device.
    """
    # Unwrap when possible so the state_dict keys match the bare
    # architecture; plain modules are used as they are.
    source = getattr(old_model, "_module", old_model)
    new_model.load_state_dict(source.state_dict())
    new_model.eval()
    new_model.to(device if target_device is None else target_device)
    return new_model


In [None]:
# Plain (non-Opacus) DNN carrying the weights of the DP-trained model.
new_model = make_model_copy(
    DNN(X_train.shape[1], hidden_dim=128, output_dim=2),
    dnn_gc,
)
# new_model = make_model_copy(CNN(), cnn_gc)

# Background sample for the explainers: the first 100 training rows.
background = train_features[:100].to(device)


### SHAP


In [None]:
# create SHAP explainer
explainer = shap.GradientExplainer(new_model, background)

# calculate shap_values with test_features (first 100 test rows)
explain_batch = test_features[:100]
shap_values = explainer.shap_values(explain_batch.to(device))

# calculate shap summary plot
# The plot gets a NumPy array, not a torch tensor: a tensor living on the
# GPU cannot be converted implicitly. This matches the earlier SHAP cell.
shap.summary_plot(
    shap_values,
    explain_batch.cpu().numpy(),
    class_names=["Benign", "Malicious"],
    feature_names=X_train.columns,
)


### Captum


In [None]:
# Integrated Gradients attribution for the non-DP copy of the model.
ig = IntegratedGradients(new_model)

# Attribute the output at index 0 for the first 100 test rows, using the
# 100-row `background` batch as baselines (one baseline row per input row).
# `delta` holds the convergence deltas returned by Captum.
attr, delta = ig.attribute(
    test_features[:100].to(device),
    baselines=background,
    target=0,
    return_convergence_delta=True,
)


In [None]:
# Helper method to print importances and visualize distribution
def visualize_importances(feature_names, importances, title="Average Feature Importances", plot=True, axis_title="Features"):
    """Print one importance value per feature and optionally draw a bar chart.

    ``importances[i]`` is reported for ``feature_names[i]``, formatted to
    three decimals. With ``plot=True`` a bar chart is created on a new
    figure; the function does not call ``plt.show()`` itself.
    """
    print(title)
    for idx, name in enumerate(feature_names):
        print(name, ": ", '%.3f' % (importances[idx]))

    positions = np.arange(len(feature_names))
    if not plot:
        return

    plt.figure(figsize=(12, 6))
    plt.bar(positions, importances, align='center')
    plt.xticks(positions, feature_names, wrap=True)
    plt.xlabel(axis_title)
    plt.title(title)


# `attr` is a torch tensor with one column per model input, so convert it to
# NumPy before averaging over the samples. The labels come from
# X_train.columns (the encoded feature set the model was trained on), not
# from the pre-encoding `features` frame, whose columns do not match.
visualize_importances(X_train.columns, np.mean(attr.detach().cpu().numpy(), axis=0))
