# Data Exploration

This notebook explores the dataset and, using insights gained from the exploratory data analysis (EDA), experiments with preprocessing steps.


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split


In [None]:
def summarise_df(df: pd.DataFrame) -> None:
    """Print a structural and statistical overview of ``df``.

    Reports, in order: the shape, the number of columns per dtype, the
    columns holding at most one distinct non-null value, the full
    ``DataFrame.info`` listing and ``DataFrame.describe`` over all columns.

    Args:
        df: The frame to summarise. It is only read, never modified.
    """
    print(f"Shape: {df.shape}")
    print("\nColumn types:\n", df.dtypes.value_counts())
    # Count distinct values once; this is a full pass over every column and
    # was previously evaluated twice inside the f-string.
    unique_counts = df.nunique()
    print(f"\nConstant columns:\n{unique_counts[unique_counts <= 1]}\n")
    df.info(verbose=True, show_counts=True, max_cols=None)
    print(df.describe(include='all'))

def print_unique_values(df: pd.DataFrame):
    """Print the value counts of every low-cardinality feature column.

    The columns ``Attack_label`` and ``Attack_type`` are skipped, as is any
    column with more than 10 distinct non-null values.

    Args:
        df: The frame whose columns are inspected. It is only read.
    """
    for col in df.columns:
        if col in ('Attack_label', 'Attack_type'):
            continue
        # Compute the counts once and reuse them for both the size check and
        # the printout.
        counts = df[col].value_counts()
        if len(counts) > 10:
            continue
        print("Unique values of ", col, ":", counts)

def show_target_distribution(df: pd.DataFrame):
    """Plot and print how often each ``Attack_label`` value occurs in ``df``."""
    target = 'Attack_label'

    # Bar chart with one bar per label value
    sns.countplot(data=df, x=target)
    plt.title("Binary Attack Label Distribution")
    plt.tight_layout()
    plt.show()

    # The same distribution as exact counts
    label_counts = df[target].value_counts()
    print(label_counts)


def plot_correlation_heatmap(df: pd.DataFrame, threshold: float = 0.9):
    """Print strongly correlated feature pairs and plot the correlation heatmap.

    Args:
        df: Data to analyse; only its numeric columns are used.
        threshold: Absolute correlation above which a pair of features is
            reported.
    """
    corr = df.select_dtypes(include='number').corr()

    # Keep only the strict upper triangle so each pair is listed once and the
    # diagonal (self-correlation) is left out.
    upper_triangle = np.triu(np.ones(corr.shape, dtype=bool), k=1)
    pairs = (corr.where(upper_triangle)
             .stack()
             .reset_index()
             .rename(columns={0: 'correlation'}))
    high_corr = pairs[pairs['correlation'].abs() > threshold]
    # Report the threshold actually in use; the header used to say 0.9 even
    # when a different threshold was passed.
    print(f"Highly correlated features (>|{threshold}|):\n", high_corr)

    # plot
    plt.figure(figsize=(10, 8))
    sns.heatmap(corr, cmap='coolwarm', center=0)
    plt.title("Feature Correlation Heatmap")
    plt.show()


In [None]:
# Root directory of the dataset; the sub-directories below hang off it
path = "./dataset/edge-iiotset/"

normal_path = f"{path}normal/"
attack_path = f"{path}attack/"
eval_path = f"{path}eval/"
# output_path = "./dataset/edge-iiotset/eda_output/"


## Read, Load, Examine


In [None]:
# df = pd.read_csv(eval_path+'DNN-EdgeIIoT-dataset.csv', encoding='utf-8', low_memory=False)
dataset_file = eval_path + 'ML-EdgeIIoT-dataset.csv'
df = pd.read_csv(dataset_file, encoding='utf-8', low_memory=False)

# visualise_df(df)
# First look at the raw data: correlations, overall summary, then the
# value counts of the low-cardinality columns.
plot_correlation_heatmap(df)
summarise_df(df)
print_unique_values(df)


## Global Preprocessing 

This section drops irrelevant columns and removes duplicate rows.

### Data Cleaning


In [None]:
# Remove fully identical rows, keeping the first occurrence of each
df = df.drop_duplicates(keep='first')


### Safe Columns to Drop


In [None]:
# Columns removed on the basis of domain knowledge: they are considered safe
# to drop because they do not contribute to the analysis.
safe_to_drop_cols = [
    "frame.time",
    "ip.src_host",
    "ip.dst_host",
    "icmp.unused",
    "icmp.transmit_timestamp",
    "http.file_data",
    "http.request.full_uri",
    "tcp.options",
    "tcp.payload",
    "arp.src.proto_ipv4",
    "arp.dst.proto_ipv4",
    "mqtt.msg",
    "mqtt.msg_decoded_as",
    "Attack_type",
]

# Work on a separate frame so the de-duplicated df stays available
dropped_df = df.drop(columns=safe_to_drop_cols)


In [None]:
# Re-run the exploration helpers on the reduced frame, in the same order:
# summary, low-cardinality value counts, then the label distribution.
for examine in (summarise_df, print_unique_values, show_target_distribution):
    examine(dropped_df)


### Constant Columns


In [None]:
from collections import defaultdict

# A column is constant when it holds exactly one distinct non-null value
distinct_counts = dropped_df.nunique()
constant_columns = distinct_counts[distinct_counts == 1].index.tolist()

print(f"Constant columns: {constant_columns}")


In [None]:
# Accumulate, over every CSV in the normal and attack directories, how often
# each constant column is zero / non-zero and how often each value occurs.
zero_counts = defaultdict(int)
non_zero_counts = defaultdict(int)
value_counts = {col: defaultdict(int) for col in constant_columns}
total_rows = 0

# The loop uses its own names (data_dir, part_df) so it no longer overwrites
# the notebook-level `path` and the main `df`.
for data_dir in [normal_path, attack_path]:
    for file_name in os.listdir(data_dir):
        if not file_name.endswith(".csv"):
            continue
        part_df = pd.read_csv(os.path.join(data_dir, file_name),
                              usecols=constant_columns,
                              encoding='utf-8', low_memory=False)
        total_rows += len(part_df)
        for col in constant_columns:
            is_zero = part_df[col] == 0
            zero_counts[col] += is_zero.sum()
            # Everything that is not exactly 0 (including NaN) counts as non-zero
            non_zero_counts[col] += (~is_zero).sum()
            for val, count in part_df[col].value_counts(dropna=True).items():
                value_counts[col][val] += count


In [None]:
# One summary line per constant column, based on the accumulated counts
for col in constant_columns:
    total_zero = zero_counts[col]
    total_non_zero = non_zero_counts[col]
    seen_values = value_counts[col]

    total_unique = len(seen_values)
    # Share of all rows read in which the column is exactly 0
    sparsity = total_zero / total_rows
    # Share of rows taken by the single most frequent value
    most_frequent = max(seen_values.values())
    dominant_ratio = most_frequent / (total_zero + total_non_zero)

    print(f"{col} — Unique values: {total_unique}, 0-based sparsity: {sparsity:.8%} of {total_rows}, Dominant ratio: {dominant_ratio:.2%}")


### Sparse Columns

In [None]:
# A column counts as sparse when at least this fraction of its values is 0
SPARSITY_THRESHOLD = 0.99

# .mean() of the boolean mask is the fraction of zeros. The previous .sum()
# produced a row count, so any column with a single zero passed ">= 0.99".
sparse_columns = [
    col for col in dropped_df.columns
    if col not in constant_columns
    and col != "Attack_label"
    and dropped_df[col].eq(0).mean() >= SPARSITY_THRESHOLD
]

print(f"Sparse columns: {sparse_columns}")


In [None]:
# Accumulate, over every CSV in the normal and attack directories, how often
# each sparse column is zero / non-zero and how often each value occurs.
zero_counts = defaultdict(int)
non_zero_counts = defaultdict(int)
value_counts = {col: defaultdict(int) for col in sparse_columns}
total_rows = 0

# The loop uses its own names (data_dir, part_df) so it no longer overwrites
# the notebook-level `path` and the main `df`.
for data_dir in [normal_path, attack_path]:
    for file_name in os.listdir(data_dir):
        if not file_name.endswith(".csv"):
            continue
        part_df = pd.read_csv(os.path.join(data_dir, file_name),
                              usecols=sparse_columns,
                              encoding='utf-8', low_memory=False)
        total_rows += len(part_df)
        for col in sparse_columns:
            is_zero = part_df[col] == 0
            zero_counts[col] += is_zero.sum()
            # Everything that is not exactly 0 (including NaN) counts as non-zero
            non_zero_counts[col] += (~is_zero).sum()
            for val, count in part_df[col].value_counts(dropna=True).items():
                value_counts[col][val] += count


In [None]:
# One summary line per sparse column, based on the accumulated counts
for col in sparse_columns:
    total_zero = zero_counts[col]
    total_non_zero = non_zero_counts[col]
    seen_values = value_counts[col]

    total_unique = len(seen_values)
    # Share of all rows read in which the column is exactly 0
    sparsity = total_zero / total_rows
    # Share of rows taken by the single most frequent value
    most_frequent = max(seen_values.values())
    dominant_ratio = most_frequent / (total_zero + total_non_zero)

    print(f"{col} — Unique values: {total_unique}, 0-based sparsity: {sparsity:.8%} of {total_rows}, Dominant ratio: {dominant_ratio:.2%}")


## Splitting to Training and Testing


In [None]:
# Hold out 20% of the rows for testing. The split is stratified on
# Attack_label and uses a fixed seed so it is reproducible.
train_df, test_df = train_test_split(
    dropped_df,
    test_size=0.2,
    random_state=42,
    stratify=dropped_df['Attack_label'],
)


## Statistical Preprocessing 

In [None]:
# compile list of numerical and categorical features
# categorical if non-numeric or <= 10 unique values

categorical_features = []
numerical_features = []

# Exclude the label by name, so the result does not depend on Attack_label
# happening to be the last column.
feature_columns = [col for col in train_df.columns if col != 'Attack_label']

for col in feature_columns:
    column = train_df[col]
    # A dtype check (rather than == 'object') also catches text columns that
    # pandas stores with a dedicated string dtype.
    is_text_like = not pd.api.types.is_numeric_dtype(column)
    if is_text_like or column.nunique(dropna=False) <= 10:
        categorical_features.append(col)
    else:
        numerical_features.append(col)

print("Categorical features:", categorical_features)
print("Numerical features:", numerical_features)


In [None]:
# For each categorical feature, print the mean Attack_label per category
# value, sorted ascending.
# NOTE(review): this reads dropped_df, i.e. the rows later held out in
# test_df are included — confirm that is intended.
for col in categorical_features:
    attack_rate_by_value = dropped_df.groupby(col)['Attack_label'].mean()
    print(attack_rate_by_value.sort_values())


In [None]:
def preserve_and_clean(df, columns):
    """Return a copy of ``df`` with ``columns`` cast to stripped strings.

    Every value in the listed columns is converted with ``astype(str)`` and
    has surrounding whitespace removed. All other columns are left as they
    are.

    Args:
        df: Source frame. It is not modified; callers must use the return
            value.
        columns: Names of the columns to convert.

    Returns:
        A new DataFrame with the converted columns.
    """
    # Work on a copy so the caller's frame (possibly a slice of another
    # frame) is never written to.
    cleaned = df.copy()
    for col in columns:
        cleaned[col] = cleaned[col].astype(str).str.strip()
    return cleaned

# Normalise the categorical columns of both splits, train first then test
train_df, test_df = (
    preserve_and_clean(split, categorical_features)
    for split in (train_df, test_df)
)


### Scaling


In [None]:
# scale numerical features
from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature min/max from the training split only, then apply the
# same scaling to both splits.
scaler = MinMaxScaler()
scaler.fit(train_df[numerical_features])
for split in (train_df, test_df):
    split[numerical_features] = scaler.transform(split[numerical_features])


### Encoding


In [None]:
# encode categorical features
from sklearn.preprocessing import OneHotEncoder

# Fit a single encoder on all categorical columns of the training split.
# The encoder needs the 2-D frame; DataFrame.apply would hand it one 1-D
# column at a time and refit it for every column.
# handle_unknown="ignore" keeps transform from raising on categories that
# only occur in the test split.
onehot = OneHotEncoder(handle_unknown="ignore")
onehot.fit(train_df[categorical_features])
encoded_columns = onehot.get_feature_names_out(categorical_features)


def _one_hot_encode(frame):
    """Return ``frame`` with its categorical columns replaced by one-hot columns."""
    # Keep the encoder output sparse: high-cardinality columns would make a
    # dense matrix very large.
    encoded = pd.DataFrame.sparse.from_spmatrix(
        onehot.transform(frame[categorical_features]),
        index=frame.index,
        columns=encoded_columns,
    )
    # The one-hot columns are appended after the remaining original columns
    return pd.concat([frame.drop(columns=categorical_features), encoded], axis=1)


train_df = _one_hot_encode(train_df)
test_df = _one_hot_encode(test_df)


In [None]:
# Inspect the training split after the preprocessing cells above have run
summarise_df(train_df)


## Write to File


In [None]:
# save into preprocessed/
# train_df.to_csv("./dataset/edge-iiotset/preprocessed/train.csv", index=False)
# test_df.to_csv("./dataset/edge-iiotset/preprocessed/test.csv", index=False)
