In [1]:
import joblib

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
import os
import joblib
import pickle # For .pkl saving and serializing objects to bytes for H5
import h5py # For HDF5 file handling

# Input dataset and the directory that receives every artifact written below.
DATASET_PATH = "/content/drive/MyDrive/Colab Notebooks/datasets/ML-EdgeIIoT-dataset.csv"
OUTPUT_PATH = "/content/drive/MyDrive/Colab Notebooks/results"
os.makedirs(OUTPUT_PATH, exist_ok=True)  # tolerate an already-existing directory

# joblib artifacts: classifier and scaler are stored as two separate files.
MODEL_SAVE_PATH_JOBLIB, SCALER_SAVE_PATH_JOBLIB = (
    os.path.join(OUTPUT_PATH, artifact_name)
    for artifact_name in ("svm_model.joblib", "svm_scaler.joblib")
)

# HDF5 artifact: a single file holding both the classifier and the scaler.
MODEL_SAVE_PATH_H5 = os.path.join(OUTPUT_PATH, "svm_model_and_scaler.h5")

# pickle (.pkl) artifacts: classifier and scaler, again as two files.
MODEL_SAVE_PATH_PKL, SCALER_SAVE_PATH_PKL = (
    os.path.join(OUTPUT_PATH, artifact_name)
    for artifact_name in ("svm_model.pkl", "svm_scaler.pkl")
)


# Load the raw CSV into memory.
print(f"Loading dataset: {DATASET_PATH}")
df = pd.read_csv(DATASET_PATH, low_memory=False)
print("Dataset loaded successfully.")

# Preprocessing
# Infinite values are mapped to NaN and every row holding a missing value is
# then discarded. np.nan (not pd.NA) is used as the replacement on purpose:
# putting pd.NA into a float64 column can upcast that column to `object`
# dtype, and the later numeric-only feature selection would then silently
# drop the column even after the offending rows are removed.
df = df.replace([np.inf, -np.inf], np.nan).dropna()
print(f"Dataset shape after dropping NA: {df.shape}")

if df.empty:
    print("Dataset is empty after dropping NA values. Exiting.")
    # `exit()` is the interactive-shell helper: inside a notebook kernel it
    # asks the kernel to shut down rather than reliably halting the cell, and
    # it is absent when Python runs without the `site` module. Raising
    # SystemExit stops execution here in both notebooks and plain scripts.
    raise SystemExit

# Feature Selection
# Columns that must never reach the model: label/target columns, identifiers,
# free-text payloads and protocol fields excluded from training. The names are
# collected in a set literal (so repeated entries collapse) and exposed as an
# alphabetically sorted list.
drop_cols = sorted({
    # frame / address / payload fields
    "frame.time", "ip.src_host", "ip.dst_host", "arp.src.proto_ipv4",
    "arp.dst.proto_ipv4", "http.file_data", "http.request.full_uri",
    "icmp.transmit_timestamp", "tcp.options", "tcp.payload",
    "mqtt.conack.flags", "mqtt.msg", "mqtt.protoname", "mqtt.topic",
    "mqtt.uuid", "mqtt.conflags",
    # target / label columns
    "Attack_label", "Attack_type", "Label",
    # assorted HTTP / DNS / TCP / UDP fields
    "icmp.unused", "http.request.method", "http.referer", "http.request.version",
    "dns.qry.name", "dns.resp.name", "tcp.flags", "udp.port", "tcp.port",
    "mqtt.conack.flags_tree",
    "tcp.options.mss", "tcp.window_size", "tcp.hdr_len", "tcp.seq", "tcp.ack",
    # IP / ARP / ICMP / UDP fields
    "ip.src", "ip.dst", "arp.opcode", "arp.hw.type", "arp.src.hw_mac",
    "arp.dst.hw_mac", "icmp.type", "icmp.code", "icmp.checksum",
    "icmp.ident", "icmp.seq_le", "udp.srcport", "udp.dstport", "udp.checksum",
    # DNS header fields
    "dns.id", "dns.flags.response", "dns.flags.opcode", "dns.flags.authoritative",
    "dns.flags.truncated", "dns.flags.recursion_desired", "dns.flags.recursion_available",
    "dns.flags.z", "dns.flags.authenticated", "dns.flags.checking_disabled", "dns.flags.rcode",
    "dns.count.queries", "dns.count.answers", "dns.count.auth_rr", "dns.count.add_rr",
    # MQTT fields
    "mqtt.clientid", "mqtt.qos", "mqtt.retain", "mqtt.dupflag", "mqtt.sessionpresent",
    "mqtt.proto_len", "mqtt.topic_len", "mqtt.ver", "mqtt.willmsg", "mqtt.willtopic",
    "mqtt.dup", "mqtt.msgtype", "mqtt.kalive", "mqtt.msgid", "mqtt.password",
    "mqtt.username", "mqtt.client_id_len",
    "mqtt.topic_val", "mqtt.msg_len",
    "mqtt.payload", "mqtt.ciphersuite", "mqtt.pk_id", "mqtt.reason_code", "mqtt.session_expiry_interval",
    "mqtt.will_flag", "mqtt.will_qos", "mqtt.will_retain", "mqtt.will_message_len", "mqtt.will_message",
    "mqtt.will_topic_len", "mqtt.will_topic", "mqtt.var_header.length", "mqtt.var_header.qos",
    "mqtt.var_header.retain", "mqtt.var_header.dup", "mqtt.var_header.message_identifier",
    "mqtt.var_header.topic_name_length", "mqtt.var_header.topic_name", "mqtt.var_header.packet_identifier",
    # MQTT variable-header properties
    "mqtt.var_header.properties.message_expiry_interval", "mqtt.var_header.properties.content_type",
    "mqtt.var_header.properties.correlation_data", "mqtt.var_header.properties.payload_format_indicator",
    "mqtt.var_header.properties.request_response_information", "mqtt.var_header.properties.response_topic",
    "mqtt.var_header.properties.session_expiry_interval", "mqtt.var_header.properties.subscription_identifier",
    "mqtt.var_header.properties.topic_alias", "mqtt.var_header.properties.user_property",
    "mqtt.var_header.properties.will_delay_interval", "mqtt.var_header.properties.will_payload_format_indicator",
    "mqtt.var_header.properties.will_content_type", "mqtt.var_header.properties.will_response_topic",
    "mqtt.var_header.properties.will_correlation_data", "mqtt.var_header.properties.will_user_property",
    "mqtt.var_header.properties.will_subscription_identifier", "mqtt.var_header.properties.will_topic_alias",
    "mqtt.var_header.properties.will_retained_message", "mqtt.var_header.properties.will_message_expiry_interval",
    "mqtt.var_header.properties.will_content_type_len", "mqtt.var_header.properties.will_content_type_val",
    "mqtt.var_header.properties.will_response_topic_len", "mqtt.var_header.properties.will_response_topic_val",
    "mqtt.var_header.properties.will_correlation_data_len", "mqtt.var_header.properties.will_correlation_data_val",
    "mqtt.var_header.properties.will_user_property_len", "mqtt.var_header.properties.will_user_property_val",
    "mqtt.var_header.properties.will_subscription_identifier_len", "mqtt.var_header.properties.will_subscription_identifier_val",
    "mqtt.var_header.properties.will_topic_alias_len", "mqtt.var_header.properties.will_topic_alias_val",
    "mqtt.var_header.properties.will_retained_message_len", "mqtt.var_header.properties.will_retained_message_val",
    "mqtt.var_header.properties.will_message_expiry_interval_len", "mqtt.var_header.properties.will_message_expiry_interval_val",
})


# Separate the target from the candidate features.
# The run cannot continue without the target column, so stop immediately.
# SystemExit is raised instead of calling `exit()`: the interactive-shell
# helper does not reliably halt a notebook cell, and the statements below
# would then fail on the undefined names `y` and `X`.
if "Attack_label" not in df.columns:
    print("Target variable Attack_label not found.")
    raise SystemExit

y = df["Attack_label"]

# Label columns are always removed; entries of drop_cols are removed only when
# present in the frame. The set removes repeats and the list is kept sorted.
cols_to_drop_for_X = sorted(
    {"Attack_label", "Attack_type", "Label"}
    | {col for col in drop_cols if col in df.columns}
)
X_candidate_features = df.drop(columns=cols_to_drop_for_X, errors="ignore")
# Only numeric columns are kept as model inputs.
X = X_candidate_features.select_dtypes(include=np.number)

if X.empty:
    print("No numeric features available after selection. Exiting.")
    raise SystemExit

print(f"Selected features for training: {X.columns.tolist()}")
print(f"Number of selected features: {len(X.columns)}")


# Data Splitting
# A stratified split needs at least two classes in the target. Stop here with
# SystemExit (not the interactive-only `exit()` helper, which does not reliably
# halt a notebook cell) so train_test_split is never reached with a bad target.
if y.empty or len(y.unique()) < 2:
    print("Target variable y is empty or has only one class. Stratified splitting not possible.")
    raise SystemExit

# First hold out 15% of all rows as the test set ...
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42, stratify=y
)
# ... then take 0.15/0.85 of the remainder as validation, i.e. another 15% of
# the full dataset, leaving roughly 70% for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=(0.15/0.85), random_state=42, stratify=y_train_val
)

# Feature Scaling
# The scaler is fitted on the training split only; the same fitted scaler is
# then applied to the training, validation and test splits.
scaler = StandardScaler().fit(X_train)
X_train, X_val, X_test = (
    scaler.transform(split_features) for split_features in (X_train, X_val, X_test)
)

# --- Save Scaler in all formats ---
# joblib copy of the fitted scaler
joblib.dump(scaler, SCALER_SAVE_PATH_JOBLIB)
print(f"Scaler saved to {SCALER_SAVE_PATH_JOBLIB} (joblib format)")
# pickle copy of the fitted scaler
with open(SCALER_SAVE_PATH_PKL, 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)
print(f"Scaler saved to {SCALER_SAVE_PATH_PKL} (pickle .pkl format)")


# Row counts of the three scaled splits
print(f"Training set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")
print(f"Test set size: {len(X_test)}")

# Build an RBF-kernel support vector classifier and fit it on the scaled
# training split.
# NOTE(review): probability=True is kept so the saved estimator is configured
# exactly as before; nothing in this section calls predict_proba — confirm a
# downstream consumer needs it.
print("Training SVM model...")
svm_model = SVC(C=1, kernel="rbf", gamma="scale", probability=True, random_state=42)
svm_model = svm_model.fit(X_train, y_train)  # fit() returns the fitted estimator itself
print("SVM Model training complete.")

# --- Save Model in all formats ---
# Joblib
joblib.dump(svm_model, MODEL_SAVE_PATH_JOBLIB)
print(f"Trained SVM model saved to {MODEL_SAVE_PATH_JOBLIB} (joblib format)")
# Pickle
with open(MODEL_SAVE_PATH_PKL, 'wb') as f:
    pickle.dump(svm_model, f)
print(f"Trained SVM model saved to {MODEL_SAVE_PATH_PKL} (pickle .pkl format)")

# HDF5 (containing pickled model, scaler and feature names)
# Each object is pickled to bytes and stored as an opaque blob. Every blob is
# wrapped in np.void, including the feature-name attribute: raw pickle bytes
# contain NUL characters, and handing plain `bytes` to h5py routes them through
# its string handling, which can reject or mangle such data. Readers recover
# the original bytes with `.tobytes()` before calling pickle.loads.
print(f"Saving SVM model and scaler to HDF5 format: {MODEL_SAVE_PATH_H5}...")
try:
    model_bytes = pickle.dumps(svm_model)
    scaler_bytes = pickle.dumps(scaler)
    feature_names_bytes = pickle.dumps(X.columns.tolist())

    with h5py.File(MODEL_SAVE_PATH_H5, 'w') as h5f:
        h5f.create_dataset('svm_model', data=np.void(model_bytes))
        h5f.create_dataset('scaler', data=np.void(scaler_bytes))
        h5f.attrs['feature_names'] = np.void(feature_names_bytes)

    print(f"SVM model and scaler successfully saved to {MODEL_SAVE_PATH_H5}")
except Exception as e:
    # Best-effort export: the joblib and pickle copies above are already on
    # disk, so an HDF5 failure is reported but does not abort the run.
    print(f"Error saving to HDF5: {e}")


# Validation split: predictions, overall accuracy and per-class report.
print("Evaluating SVM on validation set...")
y_val_pred_svm = svm_model.predict(X_val)
val_accuracy_svm = accuracy_score(y_true=y_val, y_pred=y_val_pred_svm)
print(f"SVM Validation Accuracy: {val_accuracy_svm:.4f}")
print(
    "SVM Validation Classification Report:",
    classification_report(y_val, y_val_pred_svm, zero_division=0),
    sep="\n",
)

# Test split: the same three steps on the held-out data.
print("Evaluating SVM on test set...")
y_test_pred_svm = svm_model.predict(X_test)
test_accuracy_svm = accuracy_score(y_true=y_test, y_pred=y_test_pred_svm)
print(f"SVM Test Accuracy: {test_accuracy_svm:.4f}")
print(
    "SVM Test Classification Report:",
    classification_report(y_test, y_test_pred_svm, zero_division=0),
    sep="\n",
)

# Save the results
# The summary lists where each artifact was written, both accuracies and both
# classification reports. It is assembled from its pieces in order and joined
# once.
summary_parts = [
    "SVM Model Results:\n",
    "Models saved to:\n",
    f"  Joblib (model): {MODEL_SAVE_PATH_JOBLIB}\n",
    f"  Joblib (scaler): {SCALER_SAVE_PATH_JOBLIB}\n",
    f"  Pickle (model): {MODEL_SAVE_PATH_PKL}\n",
    f"  Pickle (scaler): {SCALER_SAVE_PATH_PKL}\n",
    f"  HDF5 (model & scaler pickled bytes): {MODEL_SAVE_PATH_H5}\n\n",
    f"Validation Accuracy: {val_accuracy_svm:.4f}\n",
    f"Test Accuracy: {test_accuracy_svm:.4f}\n\n",
    f"Validation Classification Report:\n{classification_report(y_val, y_val_pred_svm, zero_division=0)}\n\n",
    f"Test Classification Report:\n{classification_report(y_test, y_test_pred_svm, zero_division=0)}\n",
]
results_summary_svm = "".join(summary_parts)

# Write the summary next to the saved artifacts.
output_file_path_svm = os.path.join(OUTPUT_PATH, "svm_results.txt")
with open(output_file_path_svm, "w") as results_file:
    results_file.write(results_summary_svm)

print(f"SVM Results saved to {output_file_path_svm}")
print("SVM script finished successfully.")

Loading dataset: /content/drive/MyDrive/Colab Notebooks/datasets/ML-EdgeIIoT-dataset.csv
Dataset loaded successfully.
Dataset shape after dropping NA: (157800, 63)
Selected features for training: ['arp.hw.size', 'http.content_length', 'http.response', 'http.tls_port', 'tcp.ack_raw', 'tcp.checksum', 'tcp.connection.fin', 'tcp.connection.rst', 'tcp.connection.syn', 'tcp.connection.synack', 'tcp.dstport', 'tcp.flags.ack', 'tcp.len', 'udp.stream', 'udp.time_delta', 'dns.qry.qu', 'dns.qry.type', 'dns.retransmission', 'dns.retransmit_request', 'dns.retransmit_request_in', 'mqtt.conflag.cleansess', 'mqtt.hdrflags', 'mqtt.len', 'mqtt.msg_decoded_as', 'mbtcp.len', 'mbtcp.trans_id', 'mbtcp.unit_id']
Number of selected features: 27
Scaler saved to /content/drive/MyDrive/Colab Notebooks/results/svm_scaler.joblib (joblib format)
Scaler saved to /content/drive/MyDrive/Colab Notebooks/results/svm_scaler.pkl (pickle .pkl format)
Training set size: 110460
Validation set size: 23670
Test set size: 23670