In [3]:
# IoT Network Attack Detection Dataset Balancing Pipeline

import dask.dataframe as dd
from dask_ml.preprocessing import StandardScaler
from sklearn.utils import resample, shuffle
from imblearn.over_sampling import SMOTE
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import gc
import os

In [9]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [10]:
# === SETTINGS === #
PREPROCESSED_PATH = "/content/drive/MyDrive/Dataset/CICIoT2023_preprocessed.parquet"
SAVE_DIR = "/content/drive/MyDrive/Dataset"

# === LOAD PREPROCESSED DATA === #
print("\n[1] Loading preprocessed dataset...")
df = dd.read_parquet(PREPROCESSED_PATH)
df = df.persist()
print("Rows:", len(df))
print("Columns:", df.columns.tolist())


[1] Loading preprocessed dataset...
Rows: 46686545
Columns: ['flow_duration', 'Header_Length', 'Protocol Type', 'Duration', 'Rate', 'Srate', 'Drate', 'fin_flag_number', 'syn_flag_number', 'rst_flag_number', 'psh_flag_number', 'ack_flag_number', 'ack_count', 'syn_count', 'fin_count', 'urg_count', 'rst_count', 'HTTP', 'HTTPS', 'DNS', 'SSH', 'TCP', 'UDP', 'ARP', 'ICMP', 'IPv', 'LLC', 'Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number', 'Magnitue', 'Radius', 'Covariance', 'Variance', 'Weight', 'label']


In [11]:
# ==================== 2-CLASS BALANCING ==================== #
try:
    print("\n[2] Starting 2-class balancing...")

    df_2class = df.assign(label=df['label'].map(lambda x: 'Attack' if x != 'BenignTraffic' else 'Benign', meta=('label', 'str')))
    label_counts = df_2class['label'].value_counts().compute()
    print("Label counts before balancing:\n", label_counts)

    target = 8450

    def sample_class(df, label, n):
        return resample(df[df['label'] == label].compute(), n_samples=n, random_state=42).reset_index(drop=True)

    benign_df = sample_class(df_2class, 'Benign', target)
    attack_df = sample_class(df_2class, 'Attack', target)

    balanced_2class_df = shuffle(pd.concat([benign_df, attack_df], ignore_index=True), random_state=42)
    print(f"→ Final balanced shape: {balanced_2class_df.shape}")

    dd.from_pandas(balanced_2class_df, npartitions=1).to_parquet(f"{SAVE_DIR}/CICIoT2023_balanced_2class.parquet")
    print("[✓] Saved balanced 2-class dataset.")

    del df_2class, benign_df, attack_df, balanced_2class_df
    gc.collect()

except Exception as e:
    print("[✗] Error in 2-class balancing:", e)


[2] Starting 2-class balancing...
Label counts before balancing:
 label
Attack    45588350
Benign     1098195
Name: count, dtype: int64
→ Final balanced shape: (16900, 41)
[✓] Saved balanced 2-class dataset.


In [19]:
print("\n[8-CLASS SMOTE + UNDERSAMPLING] Starting process...")

TARGET_SAMPLES = 33800
SAVE_PATH = f"{SAVE_DIR}/CICIoT2023_balanced_8class_strict.parquet"
TEMP_PATH = f"{SAVE_DIR}/temp_8class"
os.makedirs(TEMP_PATH, exist_ok=True)

category_map = {
    'DDoS': ['DDoS-RSTFINFlood', 'DDoS-ICMP_Flood', 'DDoS-SynonymousIP_Flood', 'DDoS-SYN_Flood',
             'DDoS-PSHACK_Flood', 'DDoS-TCP_Flood', 'DDoS-UDP_Flood', 'DDoS-ACK_Fragmentation',
             'DDoS-ICMP_Fragmentation', 'DDoS-UDP_Fragmentation', 'DDoS-HTTP_Flood', 'DDoS-SlowLoris'],
    'DoS': ['DoS-TCP_Flood', 'DoS-UDP_Flood', 'DoS-SYN_Flood', 'DoS-HTTP_Flood'],
    'Reconnaissance': ['Recon-PortScan', 'Recon-OSScan', 'Recon-HostDiscovery', 'Recon-PingSweep', 'VulnerabilityScan'],
    'Web-Based': ['XSS', 'SqlInjection', 'CommandInjection', 'Uploading_Attack', 'BrowserHijacking'],
    'Brute Force': ['DictionaryBruteForce', 'Backdoor_Malware'],
    'Spoofing': ['MITM-ArpSpoofing', 'DNS_Spoofing'],
    'Mirai': ['Mirai-greeth_flood', 'Mirai-udpplain', 'Mirai-greip_flood'],
    'Benign': ['BenignTraffic']
}

def map_label(label):
    for group, members in category_map.items():
        if label in members:
            return group
    return 'Other'

df_8class = df.assign(label=df['label'].map(map_label, meta=('label', 'str')))
label_counts = df_8class['label'].value_counts().compute()
print("Label counts before balancing:\n", label_counts)

# Lists to hold partial data
under_chunks = []
X_smote_list = []
y_smote_list = []

unique_labels = df_8class['label'].unique().compute()

for label in tqdm(unique_labels):
    class_df = df_8class[df_8class['label'] == label].compute()
    print(f"\n→ {label}: {len(class_df)} samples")

    if len(class_df) >= TARGET_SAMPLES:
        reduced = resample(class_df, n_samples=TARGET_SAMPLES, random_state=42)
        under_chunks.append(reduced)
    elif len(class_df) > 1:
        X_smote_list.append(class_df.drop(columns='label'))
        y_smote_list.append(class_df['label'])
    else:
        print(f"⚠️ Skipping '{label}' — Not enough samples to apply SMOTE.")

# Save intermediate SMOTE-ready data to Drive
X_smote_temp_path = os.path.join(TEMP_PATH, "X_smote.parquet")
y_smote_temp_path = os.path.join(TEMP_PATH, "y_smote.parquet")

print("\n[ℹ️] Saving SMOTE data temporarily to Google Drive...")
pd.concat(X_smote_list, ignore_index=True).to_parquet(X_smote_temp_path)
pd.concat(y_smote_list, ignore_index=True).to_frame().to_parquet(y_smote_temp_path)

# Clear memory
del X_smote_list, y_smote_list, df_8class
gc.collect()

# Load back for SMOTE
print("[ℹ️] Loading back SMOTE data for resampling...")
X_smote = pd.read_parquet(X_smote_temp_path)
y_smote = pd.read_parquet(y_smote_temp_path).squeeze()

print("[ℹ️] Applying SMOTE...")
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_smote, y_smote)
smote_df = pd.DataFrame(X_res, columns=X_smote.columns)
smote_df['label'] = y_res

# Merge and shuffle
final_df = shuffle(pd.concat(under_chunks + [smote_df], ignore_index=True), random_state=42)
print(f"\n✅ Final balanced dataset shape: {final_df.shape}")

# Save final dataset to Drive
dd.from_pandas(final_df, npartitions=4).to_parquet(SAVE_PATH)
print(f"[✓] Successfully saved to: {SAVE_PATH}")

# Cleanup
del final_df, smote_df, under_chunks, X_smote, y_smote
gc.collect()



[8-CLASS SMOTE + UNDERSAMPLING] Starting process...
Label counts before balancing:
 label
Brute Force          16282
DDoS              33984533
Benign             1098195
Spoofing            486503
DoS                8090737
Mirai              2634124
Reconnaissance      354560
Web-Based            21611
Name: count, dtype: int64


  0%|          | 0/8 [00:00<?, ?it/s]


→ Brute Force: 16282 samples

→ DDoS: 33984533 samples

→ Benign: 1098195 samples

→ Spoofing: 486503 samples

→ DoS: 8090737 samples

→ Mirai: 2634124 samples

→ Reconnaissance: 354560 samples

→ Web-Based: 21611 samples

[ℹ️] Saving SMOTE data temporarily to Google Drive...
[ℹ️] Loading back SMOTE data for resampling...
[ℹ️] Applying SMOTE...

✅ Final balanced dataset shape: (246022, 41)
[✓] Successfully saved to: /content/drive/MyDrive/CICIoT2023/CICIoT2023_balanced_8class_strict.parquet


50

In [18]:
# ✅ Set your save directory in Google Drive
SAVE_DIR = "/content/drive/MyDrive/CICIoT2023"
os.makedirs(SAVE_DIR, exist_ok=True)
SAVE_PATH = f"{SAVE_DIR}/CICIoT2023_balanced_34class_strict.parquet"

print("\n[34-CLASS SMOTE + UNDERSAMPLING] Starting process...")

TARGET_SAMPLES = 84500
df_34class = df.copy()
label_counts = df_34class['label'].value_counts().compute()
print("Label counts before processing:\n", label_counts)

# Store undersampled and SMOTE-ready data separately
under_chunks = []
over_X = []
over_y = []

unique_labels = df_34class['label'].unique().compute()

for label in tqdm(unique_labels):
    class_df = df_34class[df_34class['label'] == label].compute()
    print(f"\n→ {label}: {len(class_df)} samples")

    if len(class_df) > TARGET_SAMPLES:
        reduced = resample(class_df, n_samples=TARGET_SAMPLES, random_state=42)
        under_chunks.append(reduced)
    elif len(class_df) > 1:
        over_X.append(class_df.drop(columns='label'))
        over_y.append(class_df['label'])
    else:
        print(f"⚠️ Skipping '{label}' — Not enough samples to apply SMOTE.")

# ✅ Apply SMOTE
print("\n[ℹ️] Applying SMOTE to minority classes...")
X_concat = pd.concat(over_X, ignore_index=True)
y_concat = pd.concat(over_y, ignore_index=True)

smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_concat, y_concat)
smote_df = pd.DataFrame(X_res, columns=X_concat.columns)
smote_df['label'] = y_res

# ✅ Combine and save
final_df = shuffle(pd.concat(under_chunks + [smote_df], ignore_index=True), random_state=42)
print(f"\n✅ Final balanced dataset shape: {final_df.shape}")

dd.from_pandas(final_df, npartitions=4).to_parquet(SAVE_PATH)
print(f"[✓] Successfully saved to: {SAVE_PATH}")

# ✅ Clean up memory
del df_34class, over_X, over_y, X_concat, y_concat, smote_df, final_df
gc.collect()


[34-CLASS SMOTE + UNDERSAMPLING] Starting process...
Label counts before processing:
 label
DDoS-HTTP_Flood              28790
Backdoor_Malware              3218
Recon-PingSweep               2262
Uploading_Attack              1252
Mirai-udpplain              890576
DDoS-RSTFINFlood           4045282
DDoS-PSHACK_Flood          4094751
SqlInjection                  5245
MITM-ArpSpoofing            307593
DDoS-SynonymousIP_Flood    3598138
Mirai-greip_flood           751682
DDoS-TCP_Flood             4497665
DoS-SYN_Flood              2028834
DictionaryBruteForce         13064
BenignTraffic              1098195
DDoS-SYN_Flood             4059188
CommandInjection              5409
DDoS-UDP_Fragmentation      286925
DoS-HTTP_Flood               71864
DDoS-ICMP_Flood            7200488
DoS-UDP_Flood              3318594
DDoS-SlowLoris               23426
Recon-HostDiscovery         134377
XSS                           3846
DNS_Spoofing                178910
DDoS-UDP_Flood             54122

  0%|          | 0/34 [00:00<?, ?it/s]


→ DDoS-HTTP_Flood: 28790 samples

→ Recon-PingSweep: 2262 samples

→ Backdoor_Malware: 3218 samples

→ Uploading_Attack: 1252 samples

→ Mirai-udpplain: 890576 samples

→ DDoS-RSTFINFlood: 4045282 samples

→ DDoS-PSHACK_Flood: 4094751 samples

→ SqlInjection: 5245 samples

→ MITM-ArpSpoofing: 307593 samples

→ DDoS-SynonymousIP_Flood: 3598138 samples

→ Mirai-greip_flood: 751682 samples

→ DDoS-TCP_Flood: 4497665 samples

→ DoS-SYN_Flood: 2028834 samples

→ DictionaryBruteForce: 13064 samples

→ BenignTraffic: 1098195 samples

→ DDoS-SYN_Flood: 4059188 samples

→ CommandInjection: 5409 samples

→ DDoS-UDP_Fragmentation: 286925 samples

→ DoS-HTTP_Flood: 71864 samples

→ DDoS-ICMP_Flood: 7200488 samples

→ DoS-UDP_Flood: 3318594 samples

→ DDoS-SlowLoris: 23426 samples

→ Recon-HostDiscovery: 134377 samples

→ XSS: 3846 samples

→ DNS_Spoofing: 178910 samples

→ DDoS-UDP_Flood: 5412287 samples

→ VulnerabilityScan: 37382 samples

→ DDoS-ICMP_Fragmentation: 452489 samples

→ Recon-PortS

187