In [3]:
!pip install imbalanced-learn

Collecting imbalanced-learn
  Downloading imbalanced_learn-0.13.0-py3-none-any.whl.metadata (8.8 kB)
Collecting sklearn-compat<1,>=0.1 (from imbalanced-learn)
  Downloading sklearn_compat-0.1.3-py3-none-any.whl.metadata (18 kB)
Downloading imbalanced_learn-0.13.0-py3-none-any.whl (238 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/238.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━[0m [32m204.8/238.4 kB[0m [31m5.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m238.4/238.4 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sklearn_compat-0.1.3-py3-none-any.whl (18 kB)
Installing collected packages: sklearn-compat, imbalanced-learn
Successfully installed imbalanced-learn-0.13.0 sklearn-compat-0.1.3


In [4]:
# 1. Setup: Import Libraries and Mount Google Drive
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder # Explicitly import LabelEncoder

# Attach Google Drive to the Colab filesystem so the dataset CSVs are readable.
# force_remount=True asks Colab to mount again even if a mount already exists.
from google.colab import drive
drive.mount(mountpoint='/content/drive', force_remount=True)

# 2. Load DataFrames and Align Columns
print("🔄 Loading datasets...")
data_path = '/content/drive/MyDrive/Colab Notebooks/datasets/'
file1_path = os.path.join(data_path, 'DNN-EdgeIIoT-dataset.csv')
file2_path = os.path.join(data_path, 'ML-EdgeIIoT-dataset.csv')

# low_memory=False lets pandas infer each column's dtype from the whole file
# rather than chunk by chunk.
df1 = pd.read_csv(file1_path, low_memory=False)
df2 = pd.read_csv(file2_path, low_memory=False)

# Keep only the columns present in both files, in df1's column order.
# A set intersection would return them in an arbitrary order that can change
# between kernel restarts (str hashing is randomised per process), which would
# silently reorder the feature columns produced by every later step.
common_cols = [col for col in df1.columns if col in df2.columns]

# Stack both files on the shared columns and remove exact duplicate rows.
df = pd.concat(
    [df1[common_cols], df2[common_cols]], ignore_index=True
).drop_duplicates()
print(f"✅ Data loaded and duplicates dropped. Shape: {df.shape}")

# 3. Drop Unnecessary/Identifier Columns
# Columns the notebook treats as identifiers or otherwise not useful as features.
columns_to_drop = [
    'frame.time',
    'ip.src_host',
    'ip.dst_host',
    'arp.src.proto_ipv4',
    'arp.dst.proto_ipv4',
    'http.file_data',
    'http.request.uri.query',
    'http.referer',
    'http.request.full_uri',
    'tcp.options',
    'tcp.payload',
    'dns.qry.name',
    'dns.qry.name.len',
    'mqtt.msg',
    'mqtt.topic',
    'mbtcp.trans_id',
]

# errors='ignore' skips any listed name that is not present in df.
df = df.drop(columns=columns_to_drop, errors='ignore')
print(f"✅ Dropped identifier columns. New shape: {df.shape}")

# 4. Separate Features (X) and Target (y)
# y is the multi-class 'Attack_type' column. Both label columns are removed
# from X; 'Attack_label' is dropped only when it exists in df.
y = df['Attack_type'].copy()
X = df.drop(columns=['Attack_type', 'Attack_label'], errors='ignore')
print(f"✅ Features (X) and target (y) separated.")

# 5. Encode the Target Variable (y)
print("\n--- Encoding Target Variable 'Attack_type' ---")
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Build a class-name -> integer-code lookup from the fitted encoder and show it.
mapping = {
    name: code
    for name, code in zip(
        label_encoder.classes_,
        label_encoder.transform(label_encoder.classes_),
    )
}
print("Attack Type Label Mapping:")
for attack_name, code in mapping.items():
    print(f"  - '{attack_name}' -> {code}")

# 6. Define Preprocessing Pipeline with ColumnTransformer
print("\n--- Defining Preprocessing Logic ---")

# Sort the feature columns into categorical vs numeric based on their content.
# An object-dtype column with at most 20 distinct non-null values is treated as
# categorical; every other object column is coerced to numbers in place
# (entries that cannot be parsed become NaN). Non-object columns are numeric.
object_cols = [col for col in X.columns if X[col].dtype == 'object']
categorical_features = [
    col for col in object_cols if X[col].nunique(dropna=True) <= 20
]
categorical_lookup = set(categorical_features)

for col in object_cols:
    if col not in categorical_lookup:
        X[col] = pd.to_numeric(X[col], errors='coerce')

# Everything that is not categorical is numeric; X's column order is preserved.
numeric_features = [col for col in X.columns if col not in categorical_lookup]

print(f"Identified {len(numeric_features)} numeric features.")
print(f"Identified {len(categorical_features)} categorical features.")

# Numeric branch: fill missing values with the column median, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: replace missing values with the literal 'missing', then
# one-hot encode into a dense array. Categories unseen at fit time are ignored
# at transform time instead of raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each column list to its branch; columns in neither list pass through
# unchanged.
preprocessor = ColumnTransformer(
    [
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='passthrough',
)

print("✅ Preprocessing pipeline defined successfully.")

# 7. Split Data into Train/Validation/Test (70/15/15)
print("\n--- Splitting Data (70% train, 15% val, 15% test) ---")

# Stage one: hold out 30% of the rows, stratified on the encoded label.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y_encoded,
    test_size=0.30,
    stratify=y_encoded,
    random_state=42,
)

# Stage two: halve the held-out rows into validation and test, again stratified.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.50,
    stratify=y_temp,
    random_state=42,
)
print("✅ Data split complete.")

# 8. Apply the Preprocessing Pipeline
print("\n--- Applying Preprocessing Pipeline to Data Splits ---")

# The preprocessor is fitted on the training split only; validation and test
# are transformed with those already-fitted parameters.
X_train_processed = preprocessor.fit_transform(X_train)
X_val_processed = preprocessor.transform(X_val)
X_test_processed = preprocessor.transform(X_test)

# Wrap the transformed arrays in DataFrames labelled with the output feature names.
new_cols = preprocessor.get_feature_names_out()
X_train_scaled, X_val_scaled, X_test_scaled = (
    pd.DataFrame(matrix, columns=new_cols)
    for matrix in (X_train_processed, X_val_processed, X_test_processed)
)

print(f"✅ Scaling and encoding complete. Final shapes:")
print(f"  - X_train_scaled: {X_train_scaled.shape}")
print(f"  - X_val_scaled:   {X_val_scaled.shape}")
print(f"  - X_test_scaled:  {X_test_scaled.shape}")


# 9. Apply SMOTE on Training Data Only
def _print_class_counts(class_ids, class_counts):
    """Print one '  - <class name>: <count> samples' line per encoded class id.

    Class names are recovered from the integer ids with the notebook's fitted
    `label_encoder`.
    """
    for cls_idx, count in zip(class_ids, class_counts):
        cls_name = label_encoder.inverse_transform([cls_idx])[0]
        print(f"  - {cls_name}: {count} samples")


print("\n--- Applying SMOTE for Class Imbalance on Training Set ---")
print("Class distribution before SMOTE:")
unique_train, counts_train = np.unique(y_train, return_counts=True)
_print_class_counts(unique_train, counts_train)

# Only the training split is resampled; validation and test are left as they are.
# NOTE(review): plain SMOTE treats the one-hot columns like continuous features
# when synthesising samples — consider SMOTENC if that matters; confirm.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

print("\nClass distribution after SMOTE:")
unique_res, counts_res = np.unique(y_train_resampled, return_counts=True)
_print_class_counts(unique_res, counts_res)

# Closing recap of the variables this cell leaves in the notebook namespace.
print(f"\n✅ Preprocessing Complete!")
print(
    "You now have:",
    "  • X_train_resampled   (resampled & scaled training features)",
    "  • y_train_resampled   (resampled training labels)",
    "  • X_val_scaled        (scaled validation features)",
    "  • y_val               (validation labels)",
    "  • X_test_scaled       (scaled test features)",
    "  • y_test              (test labels)",
    sep="\n",
)

Mounted at /content/drive
🔄 Loading datasets...
✅ Data loaded and duplicates dropped. Shape: (2218387, 63)
✅ Dropped identifier columns. New shape: (2218387, 47)
✅ Features (X) and target (y) separated.

--- Encoding Target Variable 'Attack_type' ---
Attack Type Label Mapping:
  - 'Backdoor' -> 0
  - 'DDoS_HTTP' -> 1
  - 'DDoS_ICMP' -> 2
  - 'DDoS_TCP' -> 3
  - 'DDoS_UDP' -> 4
  - 'Fingerprinting' -> 5
  - 'MITM' -> 6
  - 'Normal' -> 7
  - 'Password' -> 8
  - 'Port_Scanning' -> 9
  - 'Ransomware' -> 10
  - 'SQL_injection' -> 11
  - 'Uploading' -> 12
  - 'Vulnerability_scanner' -> 13
  - 'XSS' -> 14

--- Defining Preprocessing Logic ---
Identified 41 numeric features.
Identified 4 categorical features.
✅ Preprocessing pipeline defined successfully.

--- Splitting Data (70% train, 15% val, 15% test) ---
✅ Data split complete.

--- Applying Preprocessing Pipeline to Data Splits ---
✅ Scaling and encoding complete. Final shapes:
  - X_train_scaled: (1552870, 78)
  - X_val_scaled:   (332758

In [5]:
import joblib

# Define the path on your Google Drive where you want to save the files
save_path = '/content/drive/MyDrive/Colab Notebooks/datasets/processed/'

# Create the directory if it doesn't exist
os.makedirs(save_path, exist_ok=True)

print(f"💾 Saving processed data to {save_path}...")

# Everything a later training/inference notebook needs, keyed by file stem.
# The fitted preprocessor is stored next to the label encoder so new raw data
# can be transformed with the same fitted imputation/scaling/encoding that
# produced the saved splits.
artifacts = {
    'X_train_resampled': X_train_resampled,
    'y_train_resampled': y_train_resampled,
    'X_val_scaled': X_val_scaled,
    'y_val': y_val,
    'X_test_scaled': X_test_scaled,
    'y_test': y_test,
    'label_encoder': label_encoder,
    'preprocessor': preprocessor,
}

# Each object goes to '<name>.joblib'.
# NOTE: joblib files are pickle-based — only load them from trusted locations.
for artifact_name, artifact in artifacts.items():
    joblib.dump(artifact, os.path.join(save_path, f'{artifact_name}.joblib'))

print("✅ All processed data files have been saved successfully.")

💾 Saving processed data to /content/drive/MyDrive/Colab Notebooks/datasets/processed/...
✅ All processed data files have been saved successfully.
