In [2]:
!pip install tensorflow


Collecting tensorflow
  Downloading tensorflow-2.19.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Downloading flatbuffers-25.2.10-py2.py3-none-any.whl.metadata (875 bytes)
Collecting google-pasta>=0.1.1 (from tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-18.1.1-py2.py3-none-manylinux2010_x86_64.whl.metadata (5.2 kB)
Collecting tensorboard~=2.19.0 (from tensorflow)
  Downloading tensorboard-2.19.0-py3-none-any.whl.metadata (1.8 kB)
Collecting tensorflow-io-gcs-filesystem>=0.23.1 (from tensorflow)
  Downloading tensorflow_io_gcs_filesystem-0.37.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (14 kB)
Collecting wheel<1.0,>=0.23.0 (from astunparse>=1.6.0->tensorflow

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical # Required if using categorical_crossentropy
from tensorflow.keras.callbacks import EarlyStopping
import os

# Ensure TensorFlow is installed: pip install tensorflow

# Define the path to the dataset and output
DATASET_PATH = "/content/drive/MyDrive/Colab Notebooks/datasets/ML-EdgeIIoT-dataset.csv"
OUTPUT_PATH = "/content/drive/MyDrive/Colab Notebooks/results"
os.makedirs(OUTPUT_PATH, exist_ok=True)

print(f"Loading dataset: {DATASET_PATH}")
df = pd.read_csv(DATASET_PATH, low_memory=False)
print("Dataset loaded successfully.")

# Preprocessing
print(f"Original dataset shape: {df.shape}")
df.replace([float("inf"), float("-inf")], pd.NA, inplace=True)
df.dropna(inplace=True)
print(f"Dataset shape after dropping NA: {df.shape}")

if df.empty:
    print("Dataset is empty after dropping NA values. Exiting.")
    exit()

# Feature Selection (using the same features as Random Forest/SVM for consistency)
drop_cols = ["frame.time", "ip.src_host", "ip.dst_host", "arp.src.proto_ipv4",
             "arp.dst.proto_ipv4", "http.file_data", "http.request.full_uri",
             "icmp.transmit_timestamp", "tcp.options", "tcp.payload",
             "mqtt.conack.flags", "mqtt.msg", "mqtt.protoname", "mqtt.topic",
             "mqtt.uuid", "mqtt.conflags",
             # "Attack_label", "Attack_type", "Label", # Target and related labels handled separately
             "icmp.unused", "http.request.method", "http.referer", "http.request.version",
             "dns.qry.name", "dns.resp.name", "tcp.flags", "udp.port", "tcp.port",
             "mqtt.conack.flags_tree", # "mqtt.protoname", "mqtt.topic", "mqtt.uuid", "mqtt.conflags", # Duplicates
             "tcp.options.mss", "tcp.window_size", "tcp.hdr_len", "tcp.seq", "tcp.ack",
             "ip.src", "ip.dst", "arp.opcode", "arp.hw.type", "arp.src.hw_mac", #"arp.src.proto_ipv4",
             "arp.dst.hw_mac", #"arp.dst.proto_ipv4",
             "icmp.type", "icmp.code", "icmp.checksum",
             "icmp.ident", "icmp.seq_le", "udp.srcport", "udp.dstport", "udp.checksum",
             "dns.id", "dns.flags.response", "dns.flags.opcode", "dns.flags.authoritative",
             "dns.flags.truncated", "dns.flags.recursion_desired", "dns.flags.recursion_available",
             "dns.flags.z", "dns.flags.authenticated", "dns.flags.checking_disabled", "dns.flags.rcode",
             "dns.count.queries", "dns.count.answers", "dns.count.auth_rr", "dns.count.add_rr",
             "mqtt.clientid", "mqtt.qos", "mqtt.retain", "mqtt.dupflag", "mqtt.sessionpresent",
             "mqtt.proto_len", "mqtt.topic_len", "mqtt.ver", "mqtt.willmsg", "mqtt.willtopic",
             #"mqtt.retain", "mqtt.dup", # Duplicates
             "mqtt.msgtype", "mqtt.kalive", "mqtt.msgid", "mqtt.password",
             "mqtt.username", "mqtt.client_id_len", #"mqtt.topic_len", # Duplicate
             "mqtt.topic_val", "mqtt.msg_len",
             "mqtt.payload", "mqtt.ciphersuite", "mqtt.pk_id", "mqtt.reason_code", "mqtt.session_expiry_interval",
             "mqtt.will_flag", "mqtt.will_qos", "mqtt.will_retain", "mqtt.will_message_len", "mqtt.will_message",
             "mqtt.will_topic_len", "mqtt.will_topic", "mqtt.var_header.length", #"mqtt.var_header.qos",
             #"mqtt.var_header.retain", "mqtt.var_header.dup", # Duplicates
             "mqtt.var_header.message_identifier",
             "mqtt.var_header.topic_name_length", "mqtt.var_header.topic_name", "mqtt.var_header.packet_identifier",
             "mqtt.var_header.properties.message_expiry_interval", "mqtt.var_header.properties.content_type",
             "mqtt.var_header.properties.correlation_data", "mqtt.var_header.properties.payload_format_indicator",
             "mqtt.var_header.properties.request_response_information", "mqtt.var_header.properties.response_topic",
             "mqtt.var_header.properties.session_expiry_interval", "mqtt.var_header.properties.subscription_identifier",
             "mqtt.var_header.properties.topic_alias", "mqtt.var_header.properties.user_property",
             "mqtt.var_header.properties.will_delay_interval", "mqtt.var_header.properties.will_payload_format_indicator",
             "mqtt.var_header.properties.will_content_type", "mqtt.var_header.properties.will_response_topic",
             "mqtt.var_header.properties.will_correlation_data", "mqtt.var_header.properties.will_user_property",
             "mqtt.var_header.properties.will_subscription_identifier", "mqtt.var_header.properties.will_topic_alias",
             "mqtt.var_header.properties.will_retained_message", "mqtt.var_header.properties.will_message_expiry_interval",
             "mqtt.var_header.properties.will_content_type_len", "mqtt.var_header.properties.will_content_type_val",
             "mqtt.var_header.properties.will_response_topic_len", "mqtt.var_header.properties.will_response_topic_val",
             "mqtt.var_header.properties.will_correlation_data_len", "mqtt.var_header.properties.will_correlation_data_val",
             "mqtt.var_header.properties.will_user_property_len", "mqtt.var_header.properties.will_user_property_val",
             "mqtt.var_header.properties.will_subscription_identifier_len", "mqtt.var_header.properties.will_subscription_identifier_val",
             "mqtt.var_header.properties.will_topic_alias_len", "mqtt.var_header.properties.will_topic_alias_val",
             "mqtt.var_header.properties.will_retained_message_len", "mqtt.var_header.properties.will_retained_message_val",
             "mqtt.var_header.properties.will_message_expiry_interval_len", "mqtt.var_header.properties.will_message_expiry_interval_val"
             ]
# Remove duplicates from drop_cols to be safe
drop_cols = sorted(list(set(drop_cols)))

if "Attack_label" in df.columns:
    y = df["Attack_label"]
    # Drop target and related/auxiliary labels first
    X_candidate_features = df.drop(columns=["Attack_label", "Attack_type", "Label"], errors="ignore")
    # Drop other specified columns
    X_candidate_features = X_candidate_features.drop(columns=[col for col in drop_cols if col in X_candidate_features.columns], errors="ignore")
    # Select only numeric features
    X = X_candidate_features.select_dtypes(include=np.number)
else:
    print("Target variable Attack_label not found. Exiting.")
    exit()

if X.empty:
    print("No numeric features available after selection. Exiting.")
    exit()

print(f"Number of features before selection: {len(X_candidate_features.columns)}")
print(f"Selected features for training ({len(X.columns)}): {X.columns.tolist()}")

# Data Splitting
if y.empty or len(y.unique()) < 2:
    print("Target variable y is empty or has only one class. Stratified splitting not possible. Exiting.")
    exit()

num_classes = len(y.unique())
print(f"Number of unique classes in target variable: {num_classes}")

# Ensure y is integer type for sparse_categorical_crossentropy or if using to_categorical
y = y.astype(int)

X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=(0.15/0.85), random_state=42, stratify=y_train_val)

# Feature Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Reshape data for LSTM: (samples, timesteps, features)
# Here, we treat each sample as a sequence of 1 timestep
X_train_lstm = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_val_lstm = X_val.reshape((X_val.shape[0], 1, X_val.shape[1]))
X_test_lstm = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

print(f"Training set shape: {X_train.shape}, LSTM-ready: {X_train_lstm.shape}")
print(f"Validation set shape: {X_val.shape}, LSTM-ready: {X_val_lstm.shape}")
print(f"Test set shape: {X_test.shape}, LSTM-ready: {X_test_lstm.shape}")

# LSTM Model Definition
model_lstm = Sequential([
    LSTM(64, input_shape=(X_train_lstm.shape[1], X_train_lstm.shape[2]), return_sequences=True), # Or False if next layer is not recurrent
    Dropout(0.2),
    LSTM(32),
    Dropout(0.2),
    Dense(num_classes, activation="softmax")
])

# Compile the model
model_lstm.compile(optimizer="adam",
                   loss="sparse_categorical_crossentropy", # Use if y is integer-encoded labels
                   metrics=["accuracy"])

model_lstm.summary()

# Callbacks
early_stopping = EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)

# Train the LSTM model
print("Training LSTM model...")
history_lstm = model_lstm.fit(X_train_lstm, y_train,
                            epochs=50,  # Adjust as needed
                            batch_size=64, # Adjust as needed
                            validation_data=(X_val_lstm, y_val),
                            callbacks=[early_stopping],
                            verbose=1)
print("LSTM Model training complete.")

# Evaluate on validation set (already done by EarlyStopping if restore_best_weights=True)
val_loss_lstm, val_accuracy_lstm = model_lstm.evaluate(X_val_lstm, y_val, verbose=0)
print(f"LSTM Validation Accuracy: {val_accuracy_lstm:.4f}")

# Evaluate on test set
print("Evaluating LSTM on test set...")
test_loss_lstm, test_accuracy_lstm = model_lstm.evaluate(X_test_lstm, y_test, verbose=0)
print(f"LSTM Test Accuracy: {test_accuracy_lstm:.4f}")

y_test_pred_lstm_probs = model_lstm.predict(X_test_lstm)
y_test_pred_lstm = np.argmax(y_test_pred_lstm_probs, axis=1)

print("LSTM Test Classification Report:")
# Ensure target names are strings if y_test.unique() are numbers
target_names_lstm = [str(i) for i in sorted(y_test.unique())]
print(classification_report(y_test, y_test_pred_lstm, target_names=target_names_lstm))

# Save the results
results_summary_lstm = f"LSTM Model Results:\n"
results_summary_lstm += f"Validation Accuracy: {val_accuracy_lstm:.4f}\n"
results_summary_lstm += f"Test Accuracy: {test_accuracy_lstm:.4f}\n\n"
results_summary_lstm += f"Test Classification Report:\n{classification_report(y_test, y_test_pred_lstm, target_names=target_names_lstm)}\n"

output_file_path_lstm = os.path.join(OUTPUT_PATH, "lstm_results.txt")
with open(output_file_path_lstm, "w") as f:
    f.write(results_summary_lstm)

print(f"LSTM Results saved to {output_file_path_lstm}")

# Save the model (optional)
# model_lstm.save(os.path.join(OUTPUT_PATH, "lstm_model.h5"))
# print(f"LSTM model saved to {os.path.join(OUTPUT_PATH, 'lstm_model.h5')}")

print("LSTM script finished successfully.")



Loading dataset: /content/drive/MyDrive/Colab Notebooks/datasets/ML-EdgeIIoT-dataset.csv
Dataset loaded successfully.
Original dataset shape: (157800, 63)
Dataset shape after dropping NA: (157800, 63)
Number of features before selection: 30
Selected features for training (27): ['arp.hw.size', 'http.content_length', 'http.response', 'http.tls_port', 'tcp.ack_raw', 'tcp.checksum', 'tcp.connection.fin', 'tcp.connection.rst', 'tcp.connection.syn', 'tcp.connection.synack', 'tcp.dstport', 'tcp.flags.ack', 'tcp.len', 'udp.stream', 'udp.time_delta', 'dns.qry.qu', 'dns.qry.type', 'dns.retransmission', 'dns.retransmit_request', 'dns.retransmit_request_in', 'mqtt.conflag.cleansess', 'mqtt.hdrflags', 'mqtt.len', 'mqtt.msg_decoded_as', 'mbtcp.len', 'mbtcp.trans_id', 'mbtcp.unit_id']
Number of unique classes in target variable: 2
Training set shape: (110460, 27), LSTM-ready: (110460, 1, 27)
Validation set shape: (23670, 27), LSTM-ready: (23670, 1, 27)
Test set shape: (23670, 27), LSTM-ready: (23670,

  super().__init__(**kwargs)


Training LSTM model...
Epoch 1/50
[1m1726/1726[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - accuracy: 0.8805 - loss: 0.3332 - val_accuracy: 0.9028 - val_loss: 0.2288
Epoch 2/50
[1m1726/1726[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.9027 - loss: 0.2254 - val_accuracy: 0.8983 - val_loss: 0.2156
Epoch 3/50
[1m1726/1726[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.9057 - loss: 0.2135 - val_accuracy: 0.9006 - val_loss: 0.2083
Epoch 4/50
[1m1726/1726[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.9050 - loss: 0.2065 - val_accuracy: 0.9158 - val_loss: 0.1827
Epoch 5/50
[1m1726/1726[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.9254 - loss: 0.1748 - val_accuracy: 0.9448 - val_loss: 0.1379
Epoch 6/50
[1m1726/1726[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.9441 - loss: 0.1363 - val_accuracy: 0.9444 - val_loss: 

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical # Required if using categorical_crossentropy
from tensorflow.keras.callbacks import EarlyStopping
import os

# Ensure TensorFlow is installed: pip install tensorflow

# Define the path to the dataset and output
DATASET_PATH = "/content/drive/MyDrive/Colab Notebooks/datasets/ML-EdgeIIoT-dataset.csv"
OUTPUT_PATH = "/content/drive/MyDrive/Colab Notebooks/results"
os.makedirs(OUTPUT_PATH, exist_ok=True)

print(f"Loading dataset: {DATASET_PATH}")
df = pd.read_csv(DATASET_PATH, low_memory=False)
print("Dataset loaded successfully.")

# Preprocessing
print(f"Original dataset shape: {df.shape}")
df.replace([float("inf"), float("-inf")], pd.NA, inplace=True)
df.dropna(inplace=True)
print(f"Dataset shape after dropping NA: {df.shape}")

if df.empty:
    print("Dataset is empty after dropping NA values. Exiting.")
    exit()

# Feature Selection (using the same features as Random Forest/SVM for consistency)
drop_cols = ["frame.time", "ip.src_host", "ip.dst_host", "arp.src.proto_ipv4",
             "arp.dst.proto_ipv4", "http.file_data", "http.request.full_uri",
             "icmp.transmit_timestamp", "tcp.options", "tcp.payload",
             "mqtt.conack.flags", "mqtt.msg", "mqtt.protoname", "mqtt.topic",
             "mqtt.uuid", "mqtt.conflags",
             # "Attack_label", "Attack_type", "Label", # Target and related labels handled separately
             "icmp.unused", "http.request.method", "http.referer", "http.request.version",
             "dns.qry.name", "dns.resp.name", "tcp.flags", "udp.port", "tcp.port",
             "mqtt.conack.flags_tree", # "mqtt.protoname", "mqtt.topic", "mqtt.uuid", "mqtt.conflags", # Duplicates
             "tcp.options.mss", "tcp.window_size", "tcp.hdr_len", "tcp.seq", "tcp.ack",
             "ip.src", "ip.dst", "arp.opcode", "arp.hw.type", "arp.src.hw_mac", #"arp.src.proto_ipv4",
             "arp.dst.hw_mac", #"arp.dst.proto_ipv4",
             "icmp.type", "icmp.code", "icmp.checksum",
             "icmp.ident", "icmp.seq_le", "udp.srcport", "udp.dstport", "udp.checksum",
             "dns.id", "dns.flags.response", "dns.flags.opcode", "dns.flags.authoritative",
             "dns.flags.truncated", "dns.flags.recursion_desired", "dns.flags.recursion_available",
             "dns.flags.z", "dns.flags.authenticated", "dns.flags.checking_disabled", "dns.flags.rcode",
             "dns.count.queries", "dns.count.answers", "dns.count.auth_rr", "dns.count.add_rr",
             "mqtt.clientid", "mqtt.qos", "mqtt.retain", "mqtt.dupflag", "mqtt.sessionpresent",
             "mqtt.proto_len", "mqtt.topic_len", "mqtt.ver", "mqtt.willmsg", "mqtt.willtopic",
             #"mqtt.retain", "mqtt.dup", # Duplicates
             "mqtt.msgtype", "mqtt.kalive", "mqtt.msgid", "mqtt.password",
             "mqtt.username", "mqtt.client_id_len", #"mqtt.topic_len", # Duplicate
             "mqtt.topic_val", "mqtt.msg_len",
             "mqtt.payload", "mqtt.ciphersuite", "mqtt.pk_id", "mqtt.reason_code", "mqtt.session_expiry_interval",
             "mqtt.will_flag", "mqtt.will_qos", "mqtt.will_retain", "mqtt.will_message_len", "mqtt.will_message",
             "mqtt.will_topic_len", "mqtt.will_topic", "mqtt.var_header.length", #"mqtt.var_header.qos",
             #"mqtt.var_header.retain", "mqtt.var_header.dup", # Duplicates
             "mqtt.var_header.message_identifier",
             "mqtt.var_header.topic_name_length", "mqtt.var_header.topic_name", "mqtt.var_header.packet_identifier",
             "mqtt.var_header.properties.message_expiry_interval", "mqtt.var_header.properties.content_type",
             "mqtt.var_header.properties.correlation_data", "mqtt.var_header.properties.payload_format_indicator",
             "mqtt.var_header.properties.request_response_information", "mqtt.var_header.properties.response_topic",
             "mqtt.var_header.properties.session_expiry_interval", "mqtt.var_header.properties.subscription_identifier",
             "mqtt.var_header.properties.topic_alias", "mqtt.var_header.properties.user_property",
             "mqtt.var_header.properties.will_delay_interval", "mqtt.var_header.properties.will_payload_format_indicator",
             "mqtt.var_header.properties.will_content_type", "mqtt.var_header.properties.will_response_topic",
             "mqtt.var_header.properties.will_correlation_data", "mqtt.var_header.properties.will_user_property",
             "mqtt.var_header.properties.will_subscription_identifier", "mqtt.var_header.properties.will_topic_alias",
             "mqtt.var_header.properties.will_retained_message", "mqtt.var_header.properties.will_message_expiry_interval",
             "mqtt.var_header.properties.will_content_type_len", "mqtt.var_header.properties.will_content_type_val",
             "mqtt.var_header.properties.will_response_topic_len", "mqtt.var_header.properties.will_response_topic_val",
             "mqtt.var_header.properties.will_correlation_data_len", "mqtt.var_header.properties.will_correlation_data_val",
             "mqtt.var_header.properties.will_user_property_len", "mqtt.var_header.properties.will_user_property_val",
             "mqtt.var_header.properties.will_subscription_identifier_len", "mqtt.var_header.properties.will_subscription_identifier_val",
             "mqtt.var_header.properties.will_topic_alias_len", "mqtt.var_header.properties.will_topic_alias_val",
             "mqtt.var_header.properties.will_retained_message_len", "mqtt.var_header.properties.will_retained_message_val",
             "mqtt.var_header.properties.will_message_expiry_interval_len", "mqtt.var_header.properties.will_message_expiry_interval_val"
             ]
# Remove duplicates from drop_cols to be safe
drop_cols = sorted(list(set(drop_cols)))

if "Attack_label" in df.columns:
    y = df["Attack_label"]
    # Drop target and related/auxiliary labels first
    X_candidate_features = df.drop(columns=["Attack_label", "Attack_type", "Label"], errors="ignore")
    # Drop other specified columns
    X_candidate_features = X_candidate_features.drop(columns=[col for col in drop_cols if col in X_candidate_features.columns], errors="ignore")
    # Select only numeric features
    X = X_candidate_features.select_dtypes(include=np.number)
else:
    print("Target variable Attack_label not found. Exiting.")
    exit()

if X.empty:
    print("No numeric features available after selection. Exiting.")
    exit()

print(f"Number of features before selection: {len(X_candidate_features.columns)}")
print(f"Selected features for training ({len(X.columns)}): {X.columns.tolist()}")

# Data Splitting
if y.empty or len(y.unique()) < 2:
    print("Target variable y is empty or has only one class. Stratified splitting not possible. Exiting.")
    exit()

num_classes = len(y.unique())
print(f"Number of unique classes in target variable: {num_classes}")

# Ensure y is integer type for sparse_categorical_crossentropy or if using to_categorical
y = y.astype(int)

X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=(0.15/0.85), random_state=42, stratify=y_train_val)

# Feature Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

print(f"Training set shape: {X_train.shape}")
print(f"Validation set shape: {X_val.shape}")
print(f"Test set shape: {X_test.shape}")

# MLP Model Definition
model_mlp = Sequential([
    Dense(128, activation="relu", input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(64, activation="relu"),
    Dropout(0.3),
    Dense(32, activation="relu"),
    Dropout(0.2),
    Dense(num_classes, activation="softmax")
])

# Compile the model
model_mlp.compile(optimizer="adam",
                  loss="sparse_categorical_crossentropy", # Use if y is integer-encoded labels
                  metrics=["accuracy"])

model_mlp.summary()

# Callbacks
early_stopping = EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)

# Train the MLP model
print("Training MLP model...")
history_mlp = model_mlp.fit(X_train, y_train,
                            epochs=100, # Adjust as needed
                            batch_size=64, # Adjust as needed
                            validation_data=(X_val, y_val),
                            callbacks=[early_stopping],
                            verbose=1)
print("MLP Model training complete.")

# Evaluate on validation set (already done by EarlyStopping if restore_best_weights=True)
val_loss_mlp, val_accuracy_mlp = model_mlp.evaluate(X_val, y_val, verbose=0)
print(f"MLP Validation Accuracy: {val_accuracy_mlp:.4f}")

# Evaluate on test set
print("Evaluating MLP on test set...")
test_loss_mlp, test_accuracy_mlp = model_mlp.evaluate(X_test, y_test, verbose=0)
print(f"MLP Test Accuracy: {test_accuracy_mlp:.4f}")

y_test_pred_mlp_probs = model_mlp.predict(X_test)
y_test_pred_mlp = np.argmax(y_test_pred_mlp_probs, axis=1)

print("MLP Test Classification Report:")
# Ensure target names are strings if y_test.unique() are numbers
target_names_mlp = [str(i) for i in sorted(y_test.unique())]
print(classification_report(y_test, y_test_pred_mlp, target_names=target_names_mlp))

# Save the results
results_summary_mlp = f"MLP Model Results:\n"
results_summary_mlp += f"Validation Accuracy: {val_accuracy_mlp:.4f}\n"
results_summary_mlp += f"Test Accuracy: {test_accuracy_mlp:.4f}\n\n"
results_summary_mlp += f"Test Classification Report:\n{classification_report(y_test, y_test_pred_mlp, target_names=target_names_mlp)}\n"

output_file_path_mlp = os.path.join(OUTPUT_PATH, "mlp_results.txt")
with open(output_file_path_mlp, "w") as f:
    f.write(results_summary_mlp)

print(f"MLP Results saved to {output_file_path_mlp}")

# Save the model (optional)
# model_mlp.save(os.path.join(OUTPUT_PATH, "mlp_model.h5"))
# print(f"MLP model saved to {os.path.join(OUTPUT_PATH, 'mlp_model.h5')}")

print("MLP script finished successfully.")



Loading dataset: /content/drive/MyDrive/Colab Notebooks/datasets/ML-EdgeIIoT-dataset.csv
Dataset loaded successfully.
Original dataset shape: (157800, 63)
Dataset shape after dropping NA: (157800, 63)
Number of features before selection: 30
Selected features for training (27): ['arp.hw.size', 'http.content_length', 'http.response', 'http.tls_port', 'tcp.ack_raw', 'tcp.checksum', 'tcp.connection.fin', 'tcp.connection.rst', 'tcp.connection.syn', 'tcp.connection.synack', 'tcp.dstport', 'tcp.flags.ack', 'tcp.len', 'udp.stream', 'udp.time_delta', 'dns.qry.qu', 'dns.qry.type', 'dns.retransmission', 'dns.retransmit_request', 'dns.retransmit_request_in', 'mqtt.conflag.cleansess', 'mqtt.hdrflags', 'mqtt.len', 'mqtt.msg_decoded_as', 'mbtcp.len', 'mbtcp.trans_id', 'mbtcp.unit_id']
Number of unique classes in target variable: 2
Training set shape: (110460, 27)
Validation set shape: (23670, 27)
Test set shape: (23670, 27)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Training MLP model...
Epoch 1/100
[1m1726/1726[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.8826 - loss: 0.2984 - val_accuracy: 0.9403 - val_loss: 0.1695
Epoch 2/100
[1m1726/1726[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9326 - loss: 0.1603 - val_accuracy: 0.9400 - val_loss: 0.1294
Epoch 3/100
[1m1726/1726[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9427 - loss: 0.1310 - val_accuracy: 0.9407 - val_loss: 0.1243
Epoch 4/100
[1m1726/1726[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9437 - loss: 0.1267 - val_accuracy: 0.9385 - val_loss: 0.1193
Epoch 5/100
[1m1726/1726[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9428 - loss: 0.1213 - val_accuracy: 0.9456 - val_loss: 0.1136
Epoch 6/100
[1m1726/1726[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9442 - loss: 0.1185 - val_accuracy: 0.9423 - val_l

In [None]:
model_lstm.save("/content/drive/MyDrive/Colab Notebooks/lstm_model.h5") # Change model to model_lstm or model_mlp depending on which you want to save



In [4]:
# @title Modified MLP_LSTM.ipynb (Corrected - for training and saving model/scaler)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import os
import pickle # For saving the scaler
from datetime import datetime # <<< --- ADDED THIS IMPORT

# --- Configuration ---
# Ensure this path points to your dataset
DATASET_PATH = "/content/drive/MyDrive/Colab Notebooks/datasets/ML-EdgeIIoT-dataset.csv"

# OUTPUT_PATH for saving model and scaler
OUTPUT_PATH = "/content/drive/MyDrive/Colab Notebooks/results"
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Define model and scaler save paths
MODEL_SAVE_PATH = os.path.join(OUTPUT_PATH, "lstm_model.h5")
SCALER_SAVE_PATH = os.path.join(OUTPUT_PATH, "scaler.pkl")

print(f"Attempting to load dataset from: {DATASET_PATH}")
try:
    df = pd.read_csv(DATASET_PATH, low_memory=False)
    print("✅ Dataset loaded successfully.")
except FileNotFoundError:
    print(f"❌ ERROR: Dataset file not found at {DATASET_PATH}")
    print("   Please ensure the path is correct and your Google Drive is mounted.")
    raise FileNotFoundError(f"Dataset not found at {DATASET_PATH}")


# --- Preprocessing (as per your original MLP_LSTM.ipynb) ---
print(f"Original dataset shape: {df.shape}")
df.replace([float("inf"), float("-inf")], pd.NA, inplace=True)
df.dropna(inplace=True)
print(f"Dataset shape after dropping NA: {df.shape}")

if df.empty:
    print("Dataset is empty after dropping NA values. Exiting.")
    raise ValueError("Dataset empty after preprocessing.")

SELECTED_FEATURES = [
    'arp.hw.size', 'http.content_length', 'http.response', 'http.tls_port',
    'tcp.ack_raw', 'tcp.checksum', 'tcp.connection.fin', 'tcp.connection.rst',
    'tcp.connection.syn', 'tcp.connection.synack', 'tcp.dstport', 'tcp.flags.ack',
    'tcp.len', 'udp.stream', 'udp.time_delta', 'dns.qry.qu', 'dns.qry.type',
    'dns.retransmission', 'dns.retransmit_request', 'dns.retransmit_request_in',
    'mqtt.conflag.cleansess', 'mqtt.hdrflags', 'mqtt.len', 'mqtt.msg_decoded_as',
    'mbtcp.len', 'mbtcp.trans_id', 'mbtcp.unit_id'
]

if "Attack_label" in df.columns:
    y = df["Attack_label"]
    missing_features = [col for col in SELECTED_FEATURES if col not in df.columns]
    if missing_features:
        print(f"❌ ERROR: The following selected features are missing from the dataset: {missing_features}")
        raise ValueError(f"Missing features: {missing_features}")
    X = df[SELECTED_FEATURES]
else:
    print("❌ ERROR: Target variable 'Attack_label' not found. Exiting.")
    raise ValueError("Target variable 'Attack_label' not found.")

if X.empty:
    print("❌ ERROR: No features available after selection. Exiting.")
    raise ValueError("Feature set X is empty after selection.")

print(f"\nSelected features for training ({len(X.columns)}): {X.columns.tolist()}")
FEATURE_NAMES_FOR_MODEL = X.columns.tolist()

if y.empty or len(y.unique()) < 2:
    print("❌ ERROR: Target variable y is empty or has only one class. Stratified splitting not possible. Exiting.")
    raise ValueError("Target variable y is unsuitable for splitting.")

num_classes = len(y.unique())
print(f"Number of unique classes in target variable: {num_classes}")
y = y.astype(int)

X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=(0.15/0.85), random_state=42, stratify=y_train_val)

print(f"\nTraining set shape: {X_train.shape}")
print(f"Validation set shape: {X_val.shape}")
print(f"Test set shape: {X_test.shape}")

# --- Feature Scaling ---
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# --- MODIFICATION: Save the fitted scaler ---
try:
    with open(SCALER_SAVE_PATH, 'wb') as f:
        pickle.dump(scaler, f)
    print(f"\n✅ Scaler saved successfully to: {SCALER_SAVE_PATH}")
except Exception as e:
    print(f"\n❌ Error saving scaler: {e}")

X_train_lstm = X_train_scaled.reshape((X_train_scaled.shape[0], 1, X_train_scaled.shape[1]))
X_val_lstm = X_val_scaled.reshape((X_val_scaled.shape[0], 1, X_val_scaled.shape[1]))
X_test_lstm = X_test_scaled.reshape((X_test_scaled.shape[0], 1, X_test_scaled.shape[1]))

print(f"\nLSTM-ready training set shape: {X_train_lstm.shape}")
print(f"LSTM-ready validation set shape: {X_val_lstm.shape}")
print(f"LSTM-ready test set shape: {X_test_lstm.shape}")

model_lstm = Sequential([
    LSTM(64, input_shape=(X_train_lstm.shape[1], X_train_lstm.shape[2]), return_sequences=True),
    Dropout(0.2),
    LSTM(32),
    Dropout(0.2),
    Dense(num_classes, activation="softmax")
])

model_lstm.compile(optimizer="adam",
                   loss="sparse_categorical_crossentropy",
                   metrics=["accuracy"])

print("\n--- LSTM Model Summary ---")
model_lstm.summary()

early_stopping = EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)

print("\n⏳ Training LSTM model...")
history_lstm = model_lstm.fit(X_train_lstm, y_train,
                            epochs=50,
                            batch_size=64,
                            validation_data=(X_val_lstm, y_val),
                            callbacks=[early_stopping],
                            verbose=1) # Set to 1 to see training progress, 2 for less verbosity per epoch
print("✅ LSTM Model training complete.")

val_loss_lstm, val_accuracy_lstm = model_lstm.evaluate(X_val_lstm, y_val, verbose=0)
print(f"\nLSTM Validation Accuracy: {val_accuracy_lstm:.4f}")

print("\n🧪 Evaluating LSTM on test set...")
test_loss_lstm, test_accuracy_lstm = model_lstm.evaluate(X_test_lstm, y_test, verbose=0)
print(f"LSTM Test Accuracy: {test_accuracy_lstm:.4f}")

y_test_pred_lstm_probs = model_lstm.predict(X_test_lstm)
y_test_pred_lstm = np.argmax(y_test_pred_lstm_probs, axis=1)

print("\n--- LSTM Test Classification Report ---")
target_names_lstm = ["Normal (0)", "Attack (1)"]
print(classification_report(y_test, y_test_pred_lstm, target_names=target_names_lstm))

try:
    model_lstm.save(MODEL_SAVE_PATH)
    print(f"\n✅ LSTM Model saved successfully to: {MODEL_SAVE_PATH}")
except Exception as e:
    print(f"\n❌ Error saving LSTM model: {e}")

results_summary_lstm = f"LSTM Model Results (from {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}):\n" # Corrected line
results_summary_lstm += f"Dataset: {DATASET_PATH}\n"
results_summary_lstm += f"Features used ({len(FEATURE_NAMES_FOR_MODEL)}): {FEATURE_NAMES_FOR_MODEL}\n"
results_summary_lstm += f"Validation Accuracy: {val_accuracy_lstm:.4f}\n"
results_summary_lstm += f"Test Accuracy: {test_accuracy_lstm:.4f}\n\n"
results_summary_lstm += f"Test Set Classification Report:\n{classification_report(y_test, y_test_pred_lstm, target_names=target_names_lstm)}\n"
results_summary_lstm += f"\nModel saved to: {MODEL_SAVE_PATH}\n"
results_summary_lstm += f"Scaler saved to: {SCALER_SAVE_PATH}\n"

output_file_path_lstm = os.path.join(OUTPUT_PATH, "lstm_training_summary_results.txt")
with open(output_file_path_lstm, "w") as f:
    f.write(results_summary_lstm)

print(f"\n📄 LSTM Training Summary and Results saved to {output_file_path_lstm}")
print("\n--- MLP_LSTM.ipynb (Corrected Version) --- Script finished successfully. ---")

Attempting to load dataset from: /content/drive/MyDrive/Colab Notebooks/datasets/ML-EdgeIIoT-dataset.csv
✅ Dataset loaded successfully.
Original dataset shape: (157800, 63)
Dataset shape after dropping NA: (157800, 63)

Selected features for training (27): ['arp.hw.size', 'http.content_length', 'http.response', 'http.tls_port', 'tcp.ack_raw', 'tcp.checksum', 'tcp.connection.fin', 'tcp.connection.rst', 'tcp.connection.syn', 'tcp.connection.synack', 'tcp.dstport', 'tcp.flags.ack', 'tcp.len', 'udp.stream', 'udp.time_delta', 'dns.qry.qu', 'dns.qry.type', 'dns.retransmission', 'dns.retransmit_request', 'dns.retransmit_request_in', 'mqtt.conflag.cleansess', 'mqtt.hdrflags', 'mqtt.len', 'mqtt.msg_decoded_as', 'mbtcp.len', 'mbtcp.trans_id', 'mbtcp.unit_id']
Number of unique classes in target variable: 2

Training set shape: (110460, 27)
Validation set shape: (23670, 27)
Test set shape: (23670, 27)

✅ Scaler saved successfully to: /content/drive/MyDrive/Colab Notebooks/results/scaler.pkl

LSTM-

  super().__init__(**kwargs)



⏳ Training LSTM model...
Epoch 1/50
[1m1726/1726[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - accuracy: 0.8853 - loss: 0.3292 - val_accuracy: 0.8971 - val_loss: 0.2272
Epoch 2/50
[1m1726/1726[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.9009 - loss: 0.2293 - val_accuracy: 0.9027 - val_loss: 0.2174
Epoch 3/50
[1m1726/1726[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.9029 - loss: 0.2178 - val_accuracy: 0.9033 - val_loss: 0.2125
Epoch 4/50
[1m1726/1726[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.9036 - loss: 0.2104 - val_accuracy: 0.9039 - val_loss: 0.2023
Epoch 5/50
[1m1726/1726[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.9082 - loss: 0.1961 - val_accuracy: 0.9256 - val_loss: 0.1597
Epoch 6/50
[1m1726/1726[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.9350 - loss: 0.1552 - val_accuracy: 0.9428 - val_los




--- LSTM Test Classification Report ---
              precision    recall  f1-score   support

  Normal (0)       0.93      0.89      0.91      3645
  Attack (1)       0.98      0.99      0.98     20025

    accuracy                           0.97     23670
   macro avg       0.96      0.94      0.95     23670
weighted avg       0.97      0.97      0.97     23670


✅ LSTM Model saved successfully to: /content/drive/MyDrive/Colab Notebooks/results/lstm_model.h5

📄 LSTM Training Summary and Results saved to /content/drive/MyDrive/Colab Notebooks/results/lstm_training_summary_results.txt

--- MLP_LSTM.ipynb (Corrected Version) --- Script finished successfully. ---
