In [1]:
import numpy as np
import pandas as pd
import os
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import xgboost as xgb
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

import sys
sys.path.append("..")

# Load data

In [2]:
# Relative path to the dataset CSV (resolved against the kernel's working directory)
file_path = "../data/DNN-EdgeIIoT-dataset.csv"

In [3]:
# Read the whole CSV into memory. low_memory=False makes pandas parse the file in a
# single pass instead of in chunks, so each column gets one consistently inferred dtype.
df = pd.read_csv(file_path, low_memory=False)

In [4]:
df.shape

(2219201, 63)

In [5]:
df.head()

Unnamed: 0,frame.time,ip.src_host,ip.dst_host,arp.dst.proto_ipv4,arp.opcode,arp.hw.size,arp.src.proto_ipv4,icmp.checksum,icmp.seq_le,icmp.transmit_timestamp,...,mqtt.proto_len,mqtt.protoname,mqtt.topic,mqtt.topic_len,mqtt.ver,mbtcp.len,mbtcp.trans_id,mbtcp.unit_id,Attack_label,Attack_type
0,2021 11:44:10.081753000,192.168.0.128,192.168.0.101,0,0.0,0.0,0,0.0,0.0,0.0,...,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0,Normal
1,2021 11:44:10.162218000,192.168.0.101,192.168.0.128,0,0.0,0.0,0,0.0,0.0,0.0,...,4.0,MQTT,0,0.0,4.0,0.0,0.0,0.0,0,Normal
2,2021 11:44:10.162271000,192.168.0.128,192.168.0.101,0,0.0,0.0,0,0.0,0.0,0.0,...,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0,Normal
3,2021 11:44:10.162641000,192.168.0.128,192.168.0.101,0,0.0,0.0,0,0.0,0.0,0.0,...,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0,Normal
4,2021 11:44:10.166132000,192.168.0.101,192.168.0.128,0,0.0,0.0,0,0.0,0.0,0.0,...,0.0,0,Temperature_and_Humidity,24.0,0.0,0.0,0.0,0.0,0,Normal


# Data exploration and processing

In [6]:
# Check for missing values: number of NaN entries in each column
df.isna().sum()

frame.time            0
ip.src_host           0
ip.dst_host           0
arp.dst.proto_ipv4    0
arp.opcode            0
                     ..
mbtcp.len             0
mbtcp.trans_id        0
mbtcp.unit_id         0
Attack_label          0
Attack_type           0
Length: 63, dtype: int64

In [7]:
# Check for duplicate rows: count the rows that exactly repeat an earlier row
df.duplicated().sum()

np.int64(815)

In [None]:
# Remove exact duplicate rows (the first occurrence is kept) and renumber the
# index so it is contiguous again, then show the resulting shape.
df = df.drop_duplicates(keep="first").reset_index(drop=True)
df.shape

In [None]:
# Check the cardinality of features: number of distinct values in each column
df.nunique()

In [None]:
# Drop columns with only one unique value: a constant column cannot help a
# classifier separate the classes.
# nunique() scans every column of the frame, so it is computed once and reused
# instead of being evaluated twice inside the same expression.
unique_counts = df.nunique()
cols_one_unique_val = unique_counts[unique_counts == 1].index.tolist()
df = df.drop(columns=cols_one_unique_val)
df.shape

In [None]:
df.columns

In [None]:
# Timestamps, IP addresses and ports are removed for better generalization.
identifier_cols = [
    # timestamps
    "frame.time", "icmp.transmit_timestamp",
    # IP addresses
    "ip.src_host", "ip.dst_host", "arp.src.proto_ipv4", "arp.dst.proto_ipv4",
    # ports
    "tcp.srcport", "tcp.dstport", "udp.port",
]
df = df.drop(columns=identifier_cols)
df.shape

In [None]:
# Further columns to remove: judged not useful for classification or far too
# complex to encode (large text fields, complex structures, noise).
text_heavy_cols = [
    "http.file_data",
    "http.request.uri.query",
    "http.request.full_uri",
    "tcp.payload",
    "tcp.options",
    "mqtt.msg",
]
df = df.drop(labels=text_heavy_cols, axis=1)
df.shape

In [None]:
df.head()

In [None]:
# Check data types of features: shows which of the remaining columns are numeric
# and which are not
df.dtypes

# Check class distribution

In [None]:
# Bar chart of the Attack_label column: one bar per label value, annotated with
# the number of rows carrying that label.
vc_labels = df["Attack_label"].value_counts()

plt.figure(figsize=(15, 5))
sns.barplot(
    x=vc_labels.index, y=vc_labels.values,
    hue=vc_labels.index, palette="Set2", edgecolor="black",
)

# Print each count just above the top of its bar
for label, count in vc_labels.items():
    plt.text(x=label, y=count + 5, s=str(count), ha='center', va='bottom')

plt.title("Distribution of Attack Labels")
plt.xlabel("Attack Label")
plt.ylabel("Count")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Bar chart of the Attack_type column: one bar per attack type, annotated with
# the absolute number of rows of that type.
vc_attack_types = df["Attack_type"].value_counts()

plt.figure(figsize=(15, 5))
sns.barplot(
    x=vc_attack_types.index, y=vc_attack_types.values,
    hue=vc_attack_types.index, palette="Set2", edgecolor="black",
)

# Print each count just above the top of its bar
for attack_type, count in vc_attack_types.items():
    plt.text(x=attack_type, y=count + 5, s=str(count), ha='center', va='bottom')

plt.title("Distribution of Attack Types")
plt.xlabel("Attack Type")
plt.ylabel("Count")
plt.xticks(rotation=90)
plt.show()

In [None]:
# Same chart as the absolute attack-type distribution, but each bar shows the
# type's share of all rows (value_counts(normalize=True)), labelled as a percentage.
vc_attack_types_normalized = df["Attack_type"].value_counts(normalize=True)

plt.figure(figsize=(15, 5))
sns.barplot(
    x=vc_attack_types_normalized.index, y=vc_attack_types_normalized.values,
    hue=vc_attack_types_normalized.index, palette="Set2", edgecolor="black",
)

# Print each share just above the top of its bar
for attack_type, share in vc_attack_types_normalized.items():
    plt.text(
        x=attack_type, y=share + 0.01, s=f"{share:.2%}",
        ha='center', va='bottom',
    )

plt.title("Normalized Distribution of Attack Types")
plt.xlabel("Attack Type")
plt.ylabel("Normalized Count")
plt.xticks(rotation=90)
plt.show()

# Feature engineering

In [None]:
# Detach the two target columns from the frame; everything left becomes the
# feature matrix.
# NOTE: pop() removes the columns from df itself, so this cell cannot be re-run
# without first re-creating df.
y_binary, y_multi = (df.pop(target) for target in ("Attack_label", "Attack_type"))
X = df.copy(deep=True)

In [None]:
X.dtypes

In [None]:
# Separate numerical and categorical features for further analysis
num_features_df = X.select_dtypes(include=[np.number])
cat_features_df = X.select_dtypes(exclude=[np.number])

In [None]:
cat_features_df.head()

In [None]:
# Check most frequent values in categorical features
for col in cat_features_df.columns:
    print(cat_features_df[col].value_counts().head(10))
    print("\n\n")

In [None]:
# The strings "0.0" and "0" act as placeholders in the categorical columns;
# map both to a single "unknown" category. DataFrame.replace returns a new
# frame, so the frame cat_features_df was selected from is not touched.
cat_features_df = cat_features_df.replace(["0.0", "0"], "unknown")

In [None]:
cat_features_df.head(50)

In [None]:
# One-hot encode the categorical columns into a dense array.
# handle_unknown='ignore': a category not seen during fit is encoded as all zeros
# at transform time instead of raising.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_cat_features = encoder.fit_transform(cat_features_df)

# Wrap the array in a DataFrame that carries the generated "<column>_<category>"
# names and the original row index, so it can later be aligned with the numeric
# features.
onehot_column_names = encoder.get_feature_names_out(cat_features_df.columns)
encoded_cat_features_df = pd.DataFrame(
    data=encoded_cat_features,
    index=cat_features_df.index,
    columns=onehot_column_names,
)

In [None]:
encoded_cat_features_df.head()

In [None]:
# Replace special characters in column names (xgboost does not accept them in
# feature names): every "[", "]", "<" and ">" becomes "_".
# The substitution is applied to every name unconditionally. The earlier
# membership guard tested only three of the four characters in the pattern, so a
# name whose sole special character was ">" was left unchanged; sub() is already
# a no-op for names without a match, which makes the guard unnecessary.
regex = re.compile(r"[\[\]<>]")
encoded_cat_features_df.columns = [
    regex.sub("_", str(col)) for col in encoded_cat_features_df.columns
]

In [None]:
# Rebuild the feature matrix: numeric columns first, one-hot columns after them,
# joined side by side on the shared row index.
feature_parts = [num_features_df, encoded_cat_features_df]
X = pd.concat(feature_parts, axis="columns")
X.shape

# Train-test split binary

In [None]:
# Hold out 20% of the rows as a test set for the binary task. Stratifying on
# y_binary keeps the label proportions the same in both parts; the fixed seed
# makes the split reproducible.
binary_split_options = dict(test_size=0.2, random_state=42, stratify=y_binary)
X_train, X_test, y_train_binary, y_test_binary = train_test_split(
    X, y_binary, **binary_split_options
)

# Binary classification with Random Forest

In [None]:
# # Random Forest Classifier
# rf_classifier = RandomForestClassifier(random_state=42)
# rf_classifier.fit(X_train, y_train_binary)

In [None]:
# y_pred_binary = rf_classifier.predict(X_test)

# print(classification_report(y_test_binary, y_pred_binary))

In [None]:
# sns.heatmap(
#     confusion_matrix(y_test_binary, y_pred_binary),
#     annot=True, fmt='d', cmap='Blues',
#     xticklabels=rf_classifier.classes_,
#     yticklabels=rf_classifier.classes_
# )
# plt.title("Confusion Matrix - Random Forest Classifier")
# plt.xlabel("Predicted")
# plt.ylabel("Actual")
# plt.show()

In [None]:
# # Plot feature importances
# rf_feature_importances = rf_classifier.feature_importances_.argsort()[::-1][:15]
# plt.figure(figsize=(15, 5))
# plt.bar(range(len(rf_feature_importances)), rf_classifier.feature_importances_[rf_feature_importances], align="center")
# plt.xticks(range(len(rf_feature_importances)), X_train.columns[rf_feature_importances], rotation=90)
# plt.title("Random Forest Classifier - Feature Importances")
# plt.show()

# Binary classification with XGBoost

In [None]:
# # XGBoost Classifier
# xgb_classifier = xgb.XGBClassifier(random_state=42)
# xgb_classifier.fit(X_train, y_train_binary)

In [None]:
# y_pred_binary = xgb_classifier.predict(X_test)

# print(classification_report(y_test_binary, y_pred_binary))

In [None]:
# sns.heatmap(
#     confusion_matrix(y_test_binary, y_pred_binary),
#     annot=True, fmt='d', cmap='Blues',
#     xticklabels=xgb_classifier.classes_,
#     yticklabels=xgb_classifier.classes_
# )
# plt.title("Confusion Matrix - XGBoost Classifier")
# plt.xlabel("Predicted")
# plt.ylabel("Actual")
# plt.show()

In [None]:
# # Plot feature importances
# xgb_feature_importances = xgb_classifier.feature_importances_.argsort()[::-1][:15]
# plt.figure(figsize=(15, 5))
# plt.bar(range(len(xgb_feature_importances)), xgb_classifier.feature_importances_[xgb_feature_importances], align="center")
# plt.xticks(range(len(xgb_feature_importances)), X_train.columns[xgb_feature_importances], rotation=90)
# plt.title("XGBoost Classifier - Feature Importances")
# plt.show()

# Train-test split multi

In [None]:
# Encode the multi-class target (Attack_type) as integer class indices.
# The model and the metrics need ONE label per sample: a softmax objective
# predicts a single class index, and classification_report cannot compare a 2-D
# one-hot indicator matrix against 1-D predictions. The target is therefore
# encoded as a 1-D array of codes instead of one-hot rows.
# The encoder is still fitted because encoder_multi.categories_[0] supplies the
# class names; code i corresponds to encoder_multi.categories_[0][i].
encoder_multi = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoder_multi.fit(y_multi.values.reshape(-1, 1))
y_multi_encoded = pd.Categorical(
    y_multi, categories=encoder_multi.categories_[0]
).codes.astype(np.int64)

# Stratified train-test split for multi-class classification
X_train, X_test, y_train_multi, y_test_multi = train_test_split(
    X, y_multi_encoded, test_size=0.2, random_state=42, stratify=y_multi_encoded
)


In [None]:
# Multi-class XGBoost model: softmax objective with one class per distinct
# Attack_type value, fitted on the training split and then used to predict the
# test split.
n_classes = y_multi.nunique()
xgb_multi_params = {
    "objective": 'multi:softmax',
    "num_class": n_classes,
    "random_state": 42,
}
xgb_classifier_multi = xgb.XGBClassifier(**xgb_multi_params)
xgb_classifier_multi.fit(X_train, y_train_multi)
y_pred_multi = xgb_classifier_multi.predict(X_test)

In [None]:
# Per-class precision / recall / F1 for the multi-class model; the class names
# are taken from the categories learned by the target encoder.
class_names = encoder_multi.categories_[0].tolist()
multi_report = classification_report(
    y_test_multi, y_pred_multi, target_names=class_names
)
print(multi_report)