In [1]:
# Mount Google Drive at /content/drive; later cells read and write paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import argparse
import gc
import json
import os
import pickle
import sys
import time
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# For system resource monitoring
import psutil
import seaborn as sns

# Scikit-learn metrics
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_curve, auc,
    precision_recall_curve, average_precision_score
)
# Data splitting and target encoding used by ModelBenchmarker.load_test_data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# For TensorFlow/Keras models
import tensorflow as tf
# Ensure load_model is correctly imported from tf.keras.models
from tensorflow.keras.models import load_model as tf_load_model

# --- ACTION REQUIRED: Ensure Google Drive is mounted if you want to use it ---
# If you haven't run the mount command in a separate cell, do it now:
# from google.colab import drive
# drive.mount('/content/drive')

# --- ACTION REQUIRED: Define the MAIN Google Drive path for ALL your IDS-AI projects ---
# All specific project BASE_DIRs will be created inside this path.
# Example: "/content/drive/MyDrive/My_Overall_IDS_AI_Projects_Folder"
YOUR_MAIN_GOOGLE_DRIVE_PROJECT_FOLDER = "/content/drive/MyDrive/IDS_AI_Suite" # <<<< CHANGE THIS TO YOUR DESIRED GOOGLE DRIVE FOLDER

# --- Define project base directory for THIS BENCHMARKING SCRIPT ---
# This will be a subfolder within your YOUR_MAIN_GOOGLE_DRIVE_PROJECT_FOLDER
BENCHMARKING_PROJECT_FOLDER_NAME = "model_benchmarking_outputs"
BASE_DIR = os.path.join(YOUR_MAIN_GOOGLE_DRIVE_PROJECT_FOLDER, BENCHMARKING_PROJECT_FOLDER_NAME)

# Two independent checks: does the configured path target Drive, and is Drive actually mounted?
_path_targets_drive = "/content/drive/" in YOUR_MAIN_GOOGLE_DRIVE_PROJECT_FOLDER
_drive_is_mounted = os.path.exists("/content/drive/MyDrive")

if not _path_targets_drive:
    print(f"⚠️ WARNING: YOUR_MAIN_GOOGLE_DRIVE_PROJECT_FOLDER ('{YOUR_MAIN_GOOGLE_DRIVE_PROJECT_FOLDER}') "
          "does not appear to be a Google Drive path. Please ensure Drive is mounted and the path is correct.")
    print(f"   Results might be saved to a temporary Colab path if not set correctly: /content/{BENCHMARKING_PROJECT_FOLDER_NAME}")

# Fall back to temporary storage whenever Drive is not mounted, including when the
# configured path points at Drive. Otherwise os.makedirs below would create the
# folders on the VM's local disk under /content/drive, where results are not
# persisted and leftover files can get in the way of a later drive.mount().
if not _drive_is_mounted:
    if _path_targets_drive:
        print(f"⚠️ WARNING: YOUR_MAIN_GOOGLE_DRIVE_PROJECT_FOLDER ('{YOUR_MAIN_GOOGLE_DRIVE_PROJECT_FOLDER}') "
              "points at Google Drive, but Drive is not mounted.")
    print("   Google Drive ('/content/drive/MyDrive') not detected. Using temporary storage.")
    BASE_DIR = f"/content/{BENCHMARKING_PROJECT_FOLDER_NAME}"

print(f"Attempting to set project base for benchmarking at: {BASE_DIR}")
os.makedirs(BASE_DIR, exist_ok=True) # Ensure the main project-specific folder is created

# Define subdirectories
DATA_DIR = os.path.join(BASE_DIR, "data_for_benchmarking")
MODEL_DIR = os.path.join(BASE_DIR, "models_to_benchmark")
RESULTS_DIR = os.path.join(BASE_DIR, "benchmark_outputs")
PLOTS_DIR = os.path.join(RESULTS_DIR, "plots")

for directory in [DATA_DIR, MODEL_DIR, RESULTS_DIR, PLOTS_DIR]:
    os.makedirs(directory, exist_ok=True)

print(f"\nProject base directory for Benchmarking: {BASE_DIR}")
print(f"Data directory (for test data): {DATA_DIR}")
print(f"Models directory (for models to benchmark): {MODEL_DIR}")
print(f"Benchmark results directory: {RESULTS_DIR}")
print(f"Plots directory: {PLOTS_DIR}")

# Alias tf.keras.models.load_model
keras_load_model = tf_load_model

print("\n✅ Section 1 (Benchmarking - Setup and Configuration) is ready.")

Attempting to set project base for benchmarking at: /content/drive/MyDrive/IDS_AI_Suite/model_benchmarking_outputs

Project base directory for Benchmarking: /content/drive/MyDrive/IDS_AI_Suite/model_benchmarking_outputs
Data directory (for test data): /content/drive/MyDrive/IDS_AI_Suite/model_benchmarking_outputs/data_for_benchmarking
Models directory (for models to benchmark): /content/drive/MyDrive/IDS_AI_Suite/model_benchmarking_outputs/models_to_benchmark
Benchmark results directory: /content/drive/MyDrive/IDS_AI_Suite/model_benchmarking_outputs/benchmark_outputs
Plots directory: /content/drive/MyDrive/IDS_AI_Suite/model_benchmarking_outputs/benchmark_outputs/plots

✅ Section 1 (Benchmarking - Setup and Configuration) is ready.


In [15]:
# Requires the Section 1 setup cell to have been run first (imports and directory constants).
# This cell uses sklearn.preprocessing.LabelEncoder and sklearn.model_selection.train_test_split;
# both must be imported before load_test_data() is called.

class ModelBenchmarker:
    """
    Class for benchmarking and comparing different machine learning models
    for intrusion detection.

    This part of the class covers model loading and test-data loading /
    preprocessing. Loaded models are kept in ``self.models`` and the processed
    test set in ``self.test_data``.
    """

    def __init__(self):
        """Create an empty benchmarker with no models or test data loaded."""
        # model_name -> {'model': <object>, 'path': <str>, 'type': <str>}
        self.models = {}
        # Placeholder for benchmark results; no method in this part writes to it.
        self.results = {}
        # Set by load_test_data() to a dict with keys
        # 'X', 'y', 'label_encoder' and 'final_X_columns'; None until then.
        self.test_data = None
        # Feature column names as requested (before non-numeric columns are dropped).
        self.X_columns = None
        # Name of the target column.
        self.y_column = None

    def load_model(self, model_path, model_name=None):
        """
        Load a model from disk and register it under ``model_name``.

        ``.h5`` files are loaded with Keras; every other file is unpickled.
        For ``.pkl``/``.joblib`` files, a dict containing a ``'model'`` key is
        unwrapped to that entry.

        Args:
            model_path: Path of the model file.
            model_name: Registry key; defaults to the file name up to its first '.'.

        Returns:
            True if the model was registered in ``self.models``, False otherwise
            (the reason is printed).
        """
        if not os.path.exists(model_path):
            print(f"❌ Error: Model file not found at {model_path}")
            return False
        if model_name is None:
            model_name = os.path.basename(model_path).split('.')[0]

        try:
            if model_path.endswith('.h5'):
                model_instance = keras_load_model(model_path)
                model_type = 'keras'
            else:
                # SECURITY: pickle.load can execute arbitrary code; only load trusted files.
                # NOTE(review): files written with joblib.dump may not be readable through
                # plain pickle.load -- confirm how the '.joblib' models were saved.
                with open(model_path, 'rb') as f:
                    loaded_object = pickle.load(f)

                has_pickle_ext = model_path.endswith(('.pkl', '.joblib'))
                if has_pickle_ext and isinstance(loaded_object, dict) and 'model' in loaded_object:
                    model_instance = loaded_object['model']
                    known_type, unknown_type = 'sklearn_dict', 'unknown_pickle_dict'
                elif has_pickle_ext:
                    model_instance = loaded_object
                    known_type, unknown_type = 'sklearn', 'unknown_pickle_object'
                else:
                    model_instance = loaded_object
                    known_type, unknown_type = 'sklearn_generic', 'unknown_generic'
                # Anything exposing predict() is treated as an sklearn-style estimator.
                model_type = known_type if hasattr(model_instance, 'predict') else unknown_type

            if model_instance is None:
                print(f"❌ Failed to load model instance from {model_path}.")
                return False
            self.models[model_name] = {'model': model_instance, 'path': model_path, 'type': model_type}
            return True
        except Exception as e:
            print(f"❌ Error loading model '{model_name}' from {model_path}: {e}")
            return False

    @staticmethod
    def _apply_scaler(features_df, scaler_path):
        """Return ``features_df`` transformed by the pickled scaler at ``scaler_path``, or unchanged if scaling is not possible."""
        if not scaler_path:
            print("ℹ️ No scaler_path provided. Using unscaled numeric data.")
            return features_df
        if not os.path.exists(scaler_path):
            print(f"⚠️ Scaler not found at {scaler_path}. Using unscaled.")
            return features_df
        try:
            # SECURITY: pickle.load can execute arbitrary code; only load trusted files.
            with open(scaler_path, 'rb') as f:
                scaler = pickle.load(f)
            # Accept any object with transform(); checking for `mean_` would reject
            # fitted scalers that do not expose that attribute. An unfitted scaler
            # raises from transform() and is reported by the except branch below.
            if not callable(getattr(scaler, 'transform', None)):
                print(f"⚠️ Object loaded from {scaler_path} has no transform() method. Using unscaled.")
                return features_df
            scaled_values = scaler.transform(features_df)
            scaled_df = pd.DataFrame(scaled_values, columns=features_df.columns, index=features_df.index)
            print(f"✅ Test data features scaled using: {scaler_path}")
            return scaled_df
        except Exception as e:
            print(f"❌ Error applying scaler from {scaler_path}: {e}. Using unscaled.")
            return features_df

    def load_test_data(self, data_path, X_columns=None, y_column=None,
                       test_only=True, # Defaulting to True as it's for benchmarking
                       test_size_if_split=0.2,
                       scaler_path=None,
                       reshape_for_lstm_load_time=False): # Renamed to avoid confusion
        """
        Load and preprocess test data for benchmarking.

        Rows with NaN/inf in the feature or target columns are dropped,
        non-numeric feature columns are discarded, the numeric features are
        optionally scaled, and the target is label-encoded with a freshly
        fitted ``LabelEncoder``.

        Args:
            data_path: ``.csv``, ``.json`` or ``.jsonl`` file to read.
            X_columns: Feature column names; defaults to every column except the target.
            y_column: Target column name; defaults to the last column.
            test_only: If True, use the whole file as test data. If False, split
                with ``train_test_split`` (random_state=42) and keep the test portion.
            test_size_if_split: Test fraction used when ``test_only`` is False.
            scaler_path: Optional path of a pickled scaler exposing ``transform``.
            reshape_for_lstm_load_time: If True, store X as (samples, 1, features).

        Returns:
            True on success (``self.test_data`` is populated), False otherwise
            (the reason is printed).
        """
        print(f"\nAttempting to load test data from: {data_path}")
        try:
            if data_path.endswith('.csv'):
                data_df = pd.read_csv(data_path, low_memory=False)
            elif data_path.endswith(('.json', '.jsonl')):
                data_df = pd.read_json(data_path, lines=data_path.endswith('.jsonl'))
            else:
                print(f"❌ Unsupported test data file format: {data_path}")
                return False
            print(f"Raw test data loaded. Shape: {data_df.shape}")

            # Treat +/-inf like missing values so the NaN drop below removes them too.
            data_df = data_df.replace([np.inf, -np.inf], np.nan)

            if y_column is None:
                y_column = data_df.columns[-1]
            if X_columns is None:
                X_columns = [col for col in data_df.columns if col != y_column]

            # Validate the target up front instead of failing later with a KeyError.
            if y_column not in data_df.columns:
                print(f"❌ Target column '{y_column}' not found in test data.")
                self.test_data = None
                return False

            cols_to_check_dropna = [col for col in X_columns if col in data_df.columns] + [y_column]
            data_df = data_df.dropna(subset=cols_to_check_dropna)
            print(f"Test data shape after NaN drop (on relevant columns): {data_df.shape}")
            if data_df.empty:
                print("❌ Test data is empty after cleaning NaNs.")
                return False

            self.y_column = y_column
            self.X_columns = X_columns # Store original intended feature columns

            current_data_for_processing = data_df
            if not test_only:
                # Stratify only when there is more than one class and enough rows.
                can_stratify = (data_df[self.y_column].nunique() > 1
                                and len(data_df) > 1 / test_size_if_split)
                _, current_data_for_processing = train_test_split(
                    data_df, test_size=test_size_if_split, random_state=42,
                    stratify=data_df[self.y_column] if can_stratify else None)
                print(f"Data split, using test portion of size: {current_data_for_processing.shape[0]}")

            missing_X_cols = [col for col in self.X_columns if col not in current_data_for_processing.columns]
            if missing_X_cols:
                print(f"❌ Specified feature columns not found in data to be processed: {missing_X_cols}")
                return False

            y_test_raw = current_data_for_processing[self.y_column]
            X_test_raw_features = current_data_for_processing[self.X_columns]

            # select_dtypes returns a new frame, so no explicit copies are needed.
            X_test_numeric = X_test_raw_features.select_dtypes(include=np.number)
            non_numeric_cols = X_test_raw_features.select_dtypes(exclude=np.number).columns.tolist()
            if non_numeric_cols:
                print(f"⚠️ Warning: Non-numeric columns dropped from features: {non_numeric_cols}")
            if X_test_numeric.empty:
                print("❌ No numeric features available in X_test.")
                return False

            # self.X_columns keeps the requested names; the numeric subset that is
            # actually used is stored in test_data['final_X_columns'].
            processed_X_columns = X_test_numeric.columns.tolist()
            X_test_processed = self._apply_scaler(X_test_numeric, scaler_path)

            label_encoder = LabelEncoder()
            y_test_processed = label_encoder.fit_transform(y_test_raw)
            print(f"Target '{self.y_column}' label encoded. Classes: {list(label_encoder.classes_)}")

            X_test_final_np = X_test_processed.values
            if reshape_for_lstm_load_time:
                X_test_final_np = X_test_final_np.reshape((X_test_final_np.shape[0], 1, X_test_final_np.shape[1]))

            self.test_data = {'X': X_test_final_np, 'y': y_test_processed,
                              'label_encoder': label_encoder,
                              'final_X_columns': processed_X_columns}
            print(f"✅ Test data loaded & processed: {X_test_final_np.shape[0]} samples. Final X_test shape: {X_test_final_np.shape}")
            return True
        except Exception as e:
            print(f"❌ Error in load_test_data for {data_path}: {e}")
            import traceback
            traceback.print_exc()
            self.test_data = None
            return False

    def preprocess_data(self, scaler=None, label_encoder=None): # This assumes self.test_data has raw X, y
        """
        Re-apply a scaler / label encoder to raw data stored in ``self.test_data``.

        Only acts when ``self.test_data`` contains the keys ``'X_raw_unscaled'``
        and ``'y_raw_unencoded'``. ``load_test_data`` in this class does not
        store those keys, so after a normal load this returns the already
        processed ``(X, y)`` pair, or ``(None, None)`` when nothing is loaded.

        Args:
            scaler: Optional object with ``transform`` applied to the raw X.
            label_encoder: Optional object with ``transform`` applied to the raw y.

        Returns:
            Tuple ``(X_test, y_test)``.
        """
        has_raw = (self.test_data is not None
                   and 'X_raw_unscaled' in self.test_data
                   and 'y_raw_unencoded' in self.test_data)
        if not has_raw:
            print("preprocess_data: Raw data for X or y not found in self.test_data. " \
                  "Ensure load_test_data stores X_raw_unscaled and y_raw_unencoded if this method is used.")
            if not self.test_data:
                return None, None
            return self.test_data.get('X'), self.test_data.get('y')

        X_test = self.test_data['X_raw_unscaled']
        y_test = self.test_data['y_raw_unencoded']

        if scaler is not None:
            X_test = scaler.transform(X_test)
        if label_encoder is not None:
            y_test = label_encoder.transform(y_test)
        return X_test, y_test


# --- Test Block for ModelBenchmarker (Part 1: Init & Loading) ---
# Smoke test for model and test-data loading. Runs only when this cell is
# executed as the main module inside Google Colab; the Drive paths below are
# hard-coded.
if 'google.colab' in sys.modules and __name__ == "__main__":
    print("\n--- Testing ModelBenchmarker (Part 1: Init & Loading) ---")

    benchmarker = ModelBenchmarker()

    test_h5_model_path = "/content/drive/MyDrive/Colab Notebooks/results/lstm_model.h5"
    models_folder = MODEL_DIR if 'MODEL_DIR' in globals() else "."
    dummy_sklearn_model_path = os.path.join(models_folder, "dummy_sklearn_model.pkl")

    # On first run, fit and pickle a tiny LogisticRegression so a pickle-based model exists.
    dummy_model_missing = not os.path.exists(dummy_sklearn_model_path)
    if dummy_model_missing:
        try:
            from sklearn.linear_model import LogisticRegression
            dummy_X_sk = np.array([[0,0],[1,1],[0,1],[1,0]])
            dummy_y_sk = np.array([0,1,0,1])
            dummy_model = LogisticRegression()
            dummy_model.fit(dummy_X_sk, dummy_y_sk)
            with open(dummy_sklearn_model_path, 'wb') as f:
                pickle.dump(dummy_model, f)
            print(f"Dummy sklearn model created and saved to: {dummy_sklearn_model_path}")
        except Exception as e:
            print(f"Could not create dummy sklearn model for testing: {e}")

    # Register whichever model files are present; warn about the ones that are not.
    if not os.path.exists(test_h5_model_path):
        print(f"⚠️ Test Keras model not found: {test_h5_model_path}")
    else:
        benchmarker.load_model(test_h5_model_path, model_name="LSTM_IDS_Model")

    if not os.path.exists(dummy_sklearn_model_path):
        print(f"⚠️ Dummy sklearn model not found: {dummy_sklearn_model_path}")
    else:
        benchmarker.load_model(dummy_sklearn_model_path, model_name="Dummy_Sklearn_Model")

    loaded_model_names = list(benchmarker.models.keys())
    print(f"Models loaded: {loaded_model_names}")

    # Feature and target columns selected from the Edge-IIoT CSV for this test.
    test_data_csv_path = "/content/drive/MyDrive/Colab Notebooks/datasets/ML-EdgeIIoT-dataset.csv"
    edge_iiot_features = [
        'arp.hw.size', 'http.content_length', 'http.response', 'http.tls_port',
        'tcp.ack_raw', 'tcp.checksum', 'tcp.connection.fin', 'tcp.connection.rst',
        'tcp.connection.syn', 'tcp.connection.synack', 'tcp.dstport', 'tcp.flags.ack',
        'tcp.len', 'udp.stream', 'udp.time_delta', 'dns.qry.qu', 'dns.qry.type',
        'dns.retransmission', 'dns.retransmit_request', 'dns.retransmit_request_in',
        'mqtt.conflag.cleansess', 'mqtt.hdrflags', 'mqtt.len', 'mqtt.msg_decoded_as',
        'mbtcp.len', 'mbtcp.trans_id', 'mbtcp.unit_id']
    edge_iiot_target = 'Attack_label'
    test_scaler_path = "/content/drive/MyDrive/Colab Notebooks/results/scaler.pkl"

    if not os.path.exists(test_data_csv_path):
        print(f"⚠️ Test data file not found at {test_data_csv_path}.")
    else:
        # Pass the scaler only when the file is actually there.
        scaler_path_or_none = test_scaler_path if os.path.exists(test_scaler_path) else None
        benchmarker.load_test_data(
            data_path=test_data_csv_path,
            X_columns=edge_iiot_features,
            y_column=edge_iiot_target,
            scaler_path=scaler_path_or_none,
            test_only=True, # Use the whole CSV as test data
            reshape_for_lstm_load_time=False # Reshaping will be done in benchmark_model
        )
        if benchmarker.test_data:
            loaded = benchmarker.test_data
            print(f"Test data X shape after load_test_data: {loaded['X'].shape}")
            print(f"Test data y shape after load_test_data: {loaded['y'].shape}")
            print(f"Final X_columns used by benchmarker: {benchmarker.X_columns}")
    print("\n--- End of ModelBenchmarker (Part 1) Test ---")

print("\n✅ Section 2 (Benchmarking - `ModelBenchmarker` Class - Part 1: Init & Loading) is ready.")




--- Testing ModelBenchmarker (Part 1: Init & Loading) ---
Models loaded: ['LSTM_IDS_Model', 'Dummy_Sklearn_Model']

Attempting to load test data from: /content/drive/MyDrive/Colab Notebooks/datasets/ML-EdgeIIoT-dataset.csv
Raw test data loaded. Shape: (157800, 63)
Test data shape after NaN drop (on relevant columns): (157800, 63)
✅ Test data features scaled using: /content/drive/MyDrive/Colab Notebooks/results/scaler.pkl
Target 'Attack_label' label encoded. Classes: [np.int64(0), np.int64(1)]
✅ Test data loaded & processed: 157800 samples. Final X_test shape: (157800, 27)
Test data X shape after load_test_data: (157800, 27)
Test data y shape after load_test_data: (157800,)
Final X_columns used by benchmarker: ['arp.hw.size', 'http.content_length', 'http.response', 'http.tls_port', 'tcp.ack_raw', 'tcp.checksum', 'tcp.connection.fin', 'tcp.connection.rst', 'tcp.connection.syn', 'tcp.connection.synack', 'tcp.dstport', 'tcp.flags.ack', 'tcp.len', 'udp.stream', 'udp.time_delta', 'dns.qry.q

In [18]:
# Requires the Section 1 setup cell to have been run first (imports, sklearn.metrics, psutil, gc, etc.).
# This cell redefines ModelBenchmarker in full (the Part 1 loading methods plus the new
# benchmarking methods), so it supersedes the Part 1 definition from the previous section.

class ModelBenchmarker:
    """
    Class for benchmarking and comparing different machine learning models
    for intrusion detection.
    (Includes methods from Part 1 and adds new methods from Part 2)

    Typical use: ``load_model`` one or more models, ``load_test_data`` once,
    then ``benchmark_model`` / ``benchmark_all_models``. Results are kept in
    ``self.results`` keyed by model name.
    """

    def __init__(self):
        """Create an empty benchmarker with no models, test data or results."""
        # model_name -> {'model': <object>, 'path': <str>, 'type': <str>}
        self.models = {}
        # model_name -> result dict produced by benchmark_model()
        self.results = {}
        # Set by load_test_data() to a dict with keys
        # 'X', 'y', 'label_encoder' and 'feature_names'; None until then.
        self.test_data = None
        self.X_columns = None # Original X_columns list passed to load_test_data
        self.y_column = None  # Target column name

    def load_model(self, model_path, model_name=None):
        """
        Load a model from disk and register it under ``model_name``.

        ``.h5`` files are loaded with Keras; every other file is unpickled.
        For ``.pkl``/``.joblib`` files, a dict containing a ``'model'`` key is
        unwrapped to that entry.

        Args:
            model_path: Path of the model file.
            model_name: Registry key; defaults to the file name up to its first '.'.

        Returns:
            True if the model was registered in ``self.models``, False otherwise
            (the reason is printed).
        """
        if not os.path.exists(model_path):
            print(f"❌ Error: Model file not found at {model_path}")
            return False
        if model_name is None:
            model_name = os.path.basename(model_path).split('.')[0]

        try:
            if model_path.endswith('.h5'):
                model_instance = keras_load_model(model_path)
                model_type = 'keras'
            else:
                # SECURITY: pickle.load can execute arbitrary code; only load trusted files.
                # NOTE(review): files written with joblib.dump may not be readable through
                # plain pickle.load -- confirm how the '.joblib' models were saved.
                with open(model_path, 'rb') as f:
                    loaded_object = pickle.load(f)

                has_pickle_ext = model_path.endswith(('.pkl', '.joblib'))
                if has_pickle_ext and isinstance(loaded_object, dict) and 'model' in loaded_object:
                    model_instance = loaded_object['model']
                    known_type, unknown_type = 'sklearn_dict', 'unknown_pickle_dict'
                elif has_pickle_ext:
                    model_instance = loaded_object
                    known_type, unknown_type = 'sklearn', 'unknown_pickle_object'
                else:
                    model_instance = loaded_object
                    known_type, unknown_type = 'sklearn_generic', 'unknown_generic'
                # Anything exposing predict() is treated as an sklearn-style estimator.
                model_type = known_type if hasattr(model_instance, 'predict') else unknown_type

            if model_instance is None:
                print(f"❌ Failed to load model instance from {model_path}.")
                return False
            self.models[model_name] = {'model': model_instance, 'path': model_path, 'type': model_type}
            return True
        except Exception as e:
            print(f"❌ Error loading model '{model_name}' from {model_path}: {e}")
            return False

    @staticmethod
    def _apply_scaler(features_df, scaler_path):
        """Return ``features_df`` transformed by the pickled scaler at ``scaler_path``, or unchanged if scaling is not possible."""
        if not (scaler_path and os.path.exists(scaler_path)):
            return features_df
        try:
            # SECURITY: pickle.load can execute arbitrary code; only load trusted files.
            with open(scaler_path, 'rb') as f:
                scaler = pickle.load(f)
            # Accept any object with transform(); checking for `mean_` would reject
            # fitted scalers that do not expose that attribute. An unfitted scaler
            # raises from transform() and is reported by the except branch below.
            if not callable(getattr(scaler, 'transform', None)):
                print(f"⚠️ Object loaded from {scaler_path} has no transform() method. Using unscaled.")
                return features_df
            scaled_values = scaler.transform(features_df)
            return pd.DataFrame(scaled_values, columns=features_df.columns, index=features_df.index)
        except Exception as e:
            print(f"❌ Error applying scaler from {scaler_path}: {e}. Using unscaled.")
            return features_df

    def load_test_data(self, data_path, X_columns=None, y_column=None,
                       test_only=True, test_size_if_split=0.2,
                       scaler_path=None, reshape_for_lstm_load_time=False): # reshape_for_lstm_load_time not used here, benchmark_model handles it
        """
        Load and preprocess the test set used for benchmarking.

        Rows with NaN/inf in the feature or target columns are dropped,
        non-numeric feature columns are discarded, the numeric features are
        optionally scaled, and the target is label-encoded with a freshly
        fitted ``LabelEncoder``. X is always stored as a 2D array.

        Args:
            data_path: ``.csv``, ``.json`` or ``.jsonl`` file to read.
            X_columns: Feature column names; defaults to every column except the target.
            y_column: Target column name; defaults to the last column.
            test_only: If True, use the whole file as test data. If False, split
                with ``train_test_split`` (random_state=42) and keep the test portion.
            test_size_if_split: Test fraction used when ``test_only`` is False.
            scaler_path: Optional path of a pickled scaler exposing ``transform``.
                When omitted, missing, or unusable, the unscaled features are used.
            reshape_for_lstm_load_time: Accepted for signature compatibility but
                ignored; ``benchmark_model`` reshapes per model.

        Returns:
            True on success (``self.test_data`` is populated), False otherwise
            (the reason is printed).
        """
        try:
            if data_path.endswith('.csv'):
                data_df = pd.read_csv(data_path, low_memory=False)
            elif data_path.endswith(('.json', '.jsonl')):
                data_df = pd.read_json(data_path, lines=data_path.endswith('.jsonl'))
            else:
                print(f"❌ Unsupported test data file format: {data_path}")
                return False

            if y_column is None:
                y_column = data_df.columns[-1]
            self.y_column = y_column
            if self.y_column not in data_df.columns:
                print(f"❌ Target column '{self.y_column}' not found.")
                return False

            if X_columns is None:
                X_columns = [col for col in data_df.columns if col != self.y_column]
            self.X_columns = X_columns # Store original selected X feature names
            missing_X_cols = [col for col in self.X_columns if col not in data_df.columns]
            if missing_X_cols:
                print(f"❌ Specified features not in data: {missing_X_cols}")
                return False

            # Treat +/-inf like missing values, then drop rows incomplete in any used column.
            cols_to_check_dropna = list(dict.fromkeys([self.y_column, *self.X_columns]))
            data_df = data_df.replace([np.inf, -np.inf], np.nan).dropna(subset=cols_to_check_dropna)
            if data_df.empty:
                print("❌ Test data empty after NaN drop.")
                return False

            current_data_for_processing = data_df
            if not test_only:
                # Stratify only when the target has more than one class.
                target = current_data_for_processing[self.y_column]
                stratify_col = target if target.nunique() > 1 else None
                _, current_data_for_processing = train_test_split(
                    current_data_for_processing, test_size=test_size_if_split,
                    random_state=42, stratify=stratify_col)

            y_test_raw = current_data_for_processing[self.y_column]
            X_test_raw_features = current_data_for_processing[self.X_columns] # Select only specified X_columns

            # select_dtypes returns a new frame, so no explicit copies are needed.
            X_test_numeric = X_test_raw_features.select_dtypes(include=np.number)
            non_numeric_cols = X_test_raw_features.select_dtypes(exclude=np.number).columns.tolist()
            if non_numeric_cols:
                print(f"⚠️ Warning: Non-numeric columns dropped from X_columns: {non_numeric_cols}")
            if X_test_numeric.empty:
                print("❌ No numeric features available in X_test after selection.")
                return False

            final_numeric_X_columns = X_test_numeric.columns.tolist() # Actual numeric columns being processed

            # Always yields a DataFrame: scaled when a usable scaler exists, otherwise
            # the unscaled numeric features (previously this name could be left unbound).
            X_test_processed = self._apply_scaler(X_test_numeric, scaler_path)

            label_encoder = LabelEncoder()
            y_test_processed = label_encoder.fit_transform(y_test_raw)

            # Store X as numpy array (2D), reshaping will be per-model in benchmark_model
            X_test_final_np = X_test_processed.values

            self.test_data = {'X': X_test_final_np, 'y': y_test_processed,
                              'label_encoder': label_encoder,
                              'feature_names': final_numeric_X_columns} # Store actual numeric feature names used
            return True
        except Exception as e:
            print(f"❌ Error in load_test_data: {e}")
            self.test_data = None
            return False

    # --- New methods for Section 3 ---
    def benchmark_model(self, model_name):
        """
        Run one loaded model on the loaded test data and record its metrics.

        The input is reshaped to (samples, 1, features) for Keras models that
        contain an LSTM layer, and flattened back to 2D for other models if the
        stored data is (samples, 1, features). Only model types ``'keras'`` and
        those starting with ``'sklearn'`` are supported.

        Args:
            model_name: Key of a model previously registered with ``load_model``.

        Returns:
            A dict with accuracy, weighted precision/recall/F1, confusion matrix,
            classification report, timing, RSS memory increase and (for binary
            problems with probabilities) ROC data; also stored in
            ``self.results[model_name]``. Returns None when the model or data is
            missing, the model type is unsupported, or prediction fails.
        """
        if model_name not in self.models:
            print(f"❌ Model '{model_name}' not found in benchmarker.")
            return None
        if self.test_data is None or 'X' not in self.test_data or 'y' not in self.test_data:
            print(f"❌ Test data not properly loaded. Cannot benchmark '{model_name}'.")
            return None

        model_info = self.models[model_name]
        model = model_info['model']
        model_type = model_info['type']

        # reshape() below returns a new view and nothing here writes to the array,
        # so the stored test matrix does not need to be copied.
        X_test_for_model = np.asarray(self.test_data['X'])
        y_test_true = self.test_data['y']

        is_model_lstm_type = (model_type == 'keras' and hasattr(model, 'layers')
                              and any(isinstance(layer, tf.keras.layers.LSTM) for layer in model.layers))

        if is_model_lstm_type and X_test_for_model.ndim == 2:
            # Reshape data if model is LSTM and current data is 2D
            num_samples, num_features = X_test_for_model.shape
            X_test_for_model = X_test_for_model.reshape(num_samples, 1, num_features)
        elif not is_model_lstm_type and X_test_for_model.ndim == 3 and X_test_for_model.shape[1] == 1:
            # If model is not LSTM but data is 3D (samples, 1, features), reshape to 2D
            X_test_for_model = X_test_for_model.reshape(X_test_for_model.shape[0], X_test_for_model.shape[2])

        print(f"\nBenchmarking '{model_name}' (Type: {model_type}) on {X_test_for_model.shape[0]} samples. Input X shape: {X_test_for_model.shape}")

        # Collect garbage and take the memory baseline BEFORE starting the clock so
        # the reported prediction time covers inference only.
        gc.collect()
        process = psutil.Process(os.getpid())
        memory_before = process.memory_info().rss / (1024 * 1024)
        y_pred_classes, y_pred_proba = None, None

        start_time = time.perf_counter()
        try:
            if model_type == 'keras':
                y_pred_raw = model.predict(X_test_for_model, verbose=0)
                if y_pred_raw.ndim > 1 and y_pred_raw.shape[1] > 1:
                    # Multi-unit output: one score per class.
                    y_pred_classes = np.argmax(y_pred_raw, axis=1)
                    y_pred_proba = y_pred_raw
                else:
                    # Single-unit output: threshold at 0.5 and build a two-column probability array.
                    positive_scores = y_pred_raw.ravel()
                    y_pred_classes = (y_pred_raw > 0.5).astype(int).ravel()
                    y_pred_proba = np.column_stack((1 - positive_scores, positive_scores))
            elif model_type.startswith('sklearn'):
                # NOTE(review): predictions are compared against label-encoded targets;
                # this assumes the model predicts in the same encoded label space -- confirm.
                y_pred_classes = model.predict(X_test_for_model)
                if hasattr(model, 'predict_proba'):
                    y_pred_proba = model.predict_proba(X_test_for_model)
            else:
                print(f"Unsupported model type '{model_type}'.")
                return None
        except ValueError as ve: # Catch feature mismatch errors specifically
            print(f"❌ ValueError during prediction for '{model_name}': {ve}")
            print(f"   Model expected input based on its build: {getattr(model, 'input_shape', 'N/A')}")
            print(f"   Data provided shape: {X_test_for_model.shape}")
            return None
        except Exception as e:
            print(f"❌ Error predicting for '{model_name}': {e}")
            return None
        prediction_time = time.perf_counter() - start_time

        gc.collect()
        memory_after = process.memory_info().rss / (1024 * 1024)
        memory_used = memory_after - memory_before

        # Flatten column vectors so the metric functions receive 1D label arrays.
        if y_test_true.ndim > 1 and y_test_true.shape[1] == 1:
            y_test_true = y_test_true.ravel()
        if y_pred_classes.ndim > 1 and y_pred_classes.shape[1] == 1:
            y_pred_classes = y_pred_classes.ravel()

        unique_labels_in_data = self.test_data['label_encoder'].classes_ # Use all classes known to encoder
        label_ids = np.arange(len(unique_labels_in_data))
        accuracy = accuracy_score(y_test_true, y_pred_classes)
        precision = precision_score(y_test_true, y_pred_classes, average='weighted', zero_division=0, labels=label_ids)
        recall = recall_score(y_test_true, y_pred_classes, average='weighted', zero_division=0, labels=label_ids)
        f1 = f1_score(y_test_true, y_pred_classes, average='weighted', zero_division=0, labels=label_ids)
        cm = confusion_matrix(y_test_true, y_pred_classes, labels=label_ids)
        report = classification_report(y_test_true, y_pred_classes, output_dict=True, zero_division=0,
                                       labels=label_ids,
                                       target_names=[str(cls) for cls in unique_labels_in_data])

        # ROC is only computed for binary problems with two-column probabilities.
        roc_auc_s, fpr_d, tpr_d = None, None, None
        if (y_pred_proba is not None and len(unique_labels_in_data) == 2
                and y_pred_proba.ndim == 2 and y_pred_proba.shape[1] == 2):
            fpr_d, tpr_d, _ = roc_curve(y_test_true, y_pred_proba[:, 1])
            roc_auc_s = auc(fpr_d, tpr_d)

        num_test_samples = len(X_test_for_model)
        res = {
            'model_name': model_name,
            'model_type': model_type,
            'accuracy': float(accuracy),
            'precision': float(precision),
            'recall': float(recall),
            'f1_score': float(f1),
            'confusion_matrix': cm.tolist(),
            'classification_report': report,
            'prediction_time_total_s': float(prediction_time),
            'prediction_time_per_sample_ms': float(prediction_time / num_test_samples * 1000) if num_test_samples > 0 else 0,
            'memory_increase_mb': float(memory_used),
            'roc_auc': float(roc_auc_s) if roc_auc_s is not None else None,
            'fpr': fpr_d.tolist() if fpr_d is not None else None,
            'tpr': tpr_d.tolist() if tpr_d is not None else None,
            'timestamp': datetime.now().isoformat(),
            'num_test_samples': num_test_samples,
        }
        self.results[model_name] = res
        print(f"  Benchmarked '{model_name}': Acc={accuracy:.4f}, F1={f1:.4f}, Time/sample={res['prediction_time_per_sample_ms']:.2f}ms")
        return res

    def benchmark_all_models(self):
        """
        Benchmark every loaded model on the loaded test data.

        Returns:
            ``self.results`` (model name -> result dict), or an empty dict when
            no models or no test data are loaded.
        """
        if not self.models:
            print("No models loaded to benchmark.")
            return {}
        if self.test_data is None:
            print("No test data loaded to benchmark on.")
            return {}
        print(f"\n--- Benchmarking All Loaded Models ({len(self.models)}) ---")
        for model_name_key in list(self.models.keys()):
            self.benchmark_model(model_name_key)
        print("--- All Models Benchmarked ---")
        return self.results

# --- Test Block for ModelBenchmarker (Part 2: Benchmarking) ---
# Smoke test for the benchmarking methods. It only runs inside Google Colab
# ('google.colab' must be in sys.modules). It reuses a global `benchmarker`
# when one with models and data is already present; otherwise it builds one,
# loads a Keras model and a throw-away sklearn model, loads the EdgeIIoT CSV,
# benchmarks everything and prints a short summary.
if __name__ == "__main__" and 'google.colab' in sys.modules:
    print("\n--- Testing ModelBenchmarker (Part 2: Benchmarking) ---")

    # Ensure benchmarker instance exists from Section 2, or re-initialize if this cell is run alone
    if 'benchmarker' not in globals() or not isinstance(benchmarker, ModelBenchmarker) or \
       benchmarker.test_data is None or not benchmarker.models: # Check if data/models are loaded
        print("⚠️ ModelBenchmarker instance not found, or models/data not loaded from Section 2. " \
              "Re-initializing and attempting to load for this test.")
        benchmarker = ModelBenchmarker()

        # Hard-coded Drive location of the trained LSTM; a missing file is only warned about.
        test_h5_model_path_s3 = "/content/drive/MyDrive/Colab Notebooks/results/lstm_model.h5"
        if os.path.exists(test_h5_model_path_s3):
            benchmarker.load_model(test_h5_model_path_s3, model_name="LSTM_IDS_Model")
        else: print(f"⚠️ Test Keras model for benchmark test not found: {test_h5_model_path_s3}")

        # The dummy sklearn model lives in MODEL_DIR when that global exists, else in the working directory.
        dummy_sklearn_model_path_s3 = os.path.join(MODEL_DIR if 'MODEL_DIR' in globals() else ".", "dummy_sklearn_model.pkl")
        if os.path.exists(dummy_sklearn_model_path_s3):
            benchmarker.load_model(dummy_sklearn_model_path_s3, model_name="Dummy_Sklearn_Model")
        else:
            try:
                from sklearn.linear_model import LogisticRegression
                # NOTE(review): the dummy model is fitted on 2 random, unseeded features, while the
                # feature list below has 27 columns. Benchmarking it against that data raises the
                # feature-count ValueError visible in this cell's recorded output.
                temp_model_sk = LogisticRegression(); temp_model_sk.fit(np.random.rand(10,2), np.random.randint(0,2,10))
                os.makedirs(os.path.dirname(dummy_sklearn_model_path_s3), exist_ok=True)
                with open(dummy_sklearn_model_path_s3, 'wb') as f: pickle.dump(temp_model_sk, f)
                benchmarker.load_model(dummy_sklearn_model_path_s3, model_name="Dummy_Sklearn_Model")
            except Exception as e_sk_create: print(f"Error creating dummy sklearn for test: {e_sk_create}")

        # Hard-coded Drive locations of the evaluation CSV and the pickled scaler.
        test_data_csv_path_s3 = "/content/drive/MyDrive/Colab Notebooks/datasets/ML-EdgeIIoT-dataset.csv"
        test_scaler_path_s3 = "/content/drive/MyDrive/Colab Notebooks/results/scaler.pkl"
        # The 27 feature columns passed to load_test_data as X_columns.
        edge_iiot_features_s3 = [
            'arp.hw.size', 'http.content_length', 'http.response', 'http.tls_port',
            'tcp.ack_raw', 'tcp.checksum', 'tcp.connection.fin', 'tcp.connection.rst',
            'tcp.connection.syn', 'tcp.connection.synack', 'tcp.dstport', 'tcp.flags.ack',
            'tcp.len', 'udp.stream', 'udp.time_delta', 'dns.qry.qu', 'dns.qry.type',
            'dns.retransmission', 'dns.retransmit_request', 'dns.retransmit_request_in',
            'mqtt.conflag.cleansess', 'mqtt.hdrflags', 'mqtt.len', 'mqtt.msg_decoded_as',
            'mbtcp.len', 'mbtcp.trans_id', 'mbtcp.unit_id']
        edge_iiot_target_s3 = 'Attack_label'
        if os.path.exists(test_data_csv_path_s3):
            print("Re-loading EdgeIIoT test data for benchmarking...")
            # The scaler path is passed only when the file exists; otherwise None is passed.
            # NOTE(review): test_only / reshape_for_lstm_load_time must be accepted by the
            # load_test_data of whichever ModelBenchmarker definition is current — confirm.
            benchmarker.load_test_data(
                test_data_csv_path_s3, X_columns=edge_iiot_features_s3, y_column=edge_iiot_target_s3,
                scaler_path=test_scaler_path_s3 if os.path.exists(test_scaler_path_s3) else None,
                test_only=True, reshape_for_lstm_load_time=False
            )
        else:
            print(f"⚠️ EdgeIIoT Test data CSV ({test_data_csv_path_s3}) not found for benchmarking.")

    # Proceed to benchmark only if models AND data are successfully loaded into the instance
    if benchmarker.models and benchmarker.test_data:
        all_benchmark_results = benchmarker.benchmark_all_models()
        if all_benchmark_results:
            print("\n--- Overall Benchmark Summary (from test block) ---")
            # Only models present in the returned results are listed here.
            for model_name_res, res_data in all_benchmark_results.items():
                print(f"  Model: {model_name_res}, Accuracy: {res_data['accuracy']:.4f}, F1-score: {res_data['f1_score']:.4f}")
    else:
        # Report which of the two prerequisites is missing.
        print("⚠️ Cannot run benchmarks: Models or test data not loaded into the benchmarker instance.")
        if not benchmarker.models: print("   - No models found in benchmarker.")
        if not benchmarker.test_data: print("   - No test data (self.test_data) found in benchmarker.")

    print("\n--- End of ModelBenchmarker (Part 2) Test ---")

# Printed unconditionally once the cell has executed, whether or not the Colab-only test above ran.
print("\n✅ Section 3 (Benchmarking - `ModelBenchmarker` Class - Part 2: Benchmarking) is ready.")




--- Testing ModelBenchmarker (Part 2: Benchmarking) ---
⚠️ ModelBenchmarker instance not found, or models/data not loaded from Section 2. Re-initializing and attempting to load for this test.
Re-loading EdgeIIoT test data for benchmarking...

--- Benchmarking All Loaded Models (2) ---

Benchmarking 'LSTM_IDS_Model' (Type: keras) on 157800 samples. Input X shape: (157800, 1, 27)
  Benchmarked 'LSTM_IDS_Model': Acc=0.9729, F1=0.9727, Time/sample=0.13ms

Benchmarking 'Dummy_Sklearn_Model' (Type: sklearn) on 157800 samples. Input X shape: (157800, 27)
❌ ValueError during prediction for 'Dummy_Sklearn_Model': X has 27 features, but LogisticRegression is expecting 2 features as input.
   Model expected input based on its build: N/A
   Data provided shape: (157800, 27)
--- All Models Benchmarked ---

--- Overall Benchmark Summary (from test block) ---
  Model: LSTM_IDS_Model, Accuracy: 0.9729, F1-score: 0.9727

--- End of ModelBenchmarker (Part 2) Test ---

✅ Section 3 (Benchmarking - `Model

In [19]:
# Imports from Section 1 should still be in effect.
# ModelBenchmarker class (Parts 1 & 2) should be defined from previous sections.
# Ensure matplotlib.pyplot as plt and seaborn as sns are imported.

class ModelBenchmarker:
    """
    Class for benchmarking and comparing different machine learning models
    for intrusion detection.
    (Includes methods from Part 1 & 2, and adds new methods from Part 3)

    Attributes:
        models: Maps model name -> ``{'model': object, 'path': str, 'type': str}``;
            filled by ``load_model``.
        results: Maps model name -> result dict produced by ``benchmark_model``.
        test_data: ``None`` until ``load_test_data`` succeeds, then a dict with the
            keys ``'X'``, ``'y'``, ``'label_encoder'`` and ``'original_X_columns'``.
        X_columns: Feature column names used to build ``test_data['X']``.
        y_column: Name of the target column.
    """

    # Metrics where the smallest value wins when ranking models.
    _LOWER_IS_BETTER = ('prediction_time_per_sample_ms', 'memory_increase_mb')

    # Attributes that a fitted scaler exposes; any one that is present and not
    # None is accepted as proof of fitting.
    _FITTED_SCALER_ATTRS = ('mean_', 'scale_', 'center_', 'min_', 'n_features_in_')

    def __init__(self):
        self.models = {}
        self.results = {}
        self.test_data = None
        self.X_columns = None
        self.y_column = None

    @staticmethod
    def _scaler_is_fitted(scaler):
        """Return True when ``scaler`` can transform and carries fitted state."""
        if not hasattr(scaler, 'transform'):
            return False
        return any(getattr(scaler, attr, None) is not None
                   for attr in ModelBenchmarker._FITTED_SCALER_ATTRS)

    @staticmethod
    def _ensure_parent_dir(file_path):
        """Create the directory that will contain ``file_path`` if it is missing."""
        parent_dir = os.path.dirname(file_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    def load_model(self, model_path, model_name=None):
        """Load a model from disk and register it under ``model_name``.

        ``.h5`` files are loaded as Keras models; ``.pkl``/``.joblib`` files are
        unpickled (a dict with a ``'model'`` key is unwrapped); any other
        extension is unpickled as-is.

        Args:
            model_path: Path of the model file.
            model_name: Registry key; defaults to the file name up to its first dot.

        Returns:
            True when the model was registered, False otherwise (an error is printed).
        """
        if not os.path.exists(model_path):
            print(f"❌ Error: Model file not found at {model_path}")
            return False
        if model_name is None:
            model_name = os.path.basename(model_path).split('.')[0]
        try:
            # NOTE(security): pickle.load executes code embedded in the file;
            # only load model files from trusted locations.
            if model_path.endswith('.h5'):
                # Go through the module-level `tf` import instead of a separately
                # aliased load_model helper that another cell would have to define.
                model_instance = tf.keras.models.load_model(model_path)
                model_type = 'keras'
            elif model_path.endswith(('.pkl', '.joblib')):
                with open(model_path, 'rb') as f:
                    loaded_object = pickle.load(f)
                if isinstance(loaded_object, dict) and 'model' in loaded_object:
                    model_instance = loaded_object['model']
                else:
                    model_instance = loaded_object
                model_type = 'sklearn' if hasattr(model_instance, 'predict') else 'unknown_pickle'
            else:
                with open(model_path, 'rb') as f:
                    model_instance = pickle.load(f)
                model_type = 'sklearn_generic' if hasattr(model_instance, 'predict') else 'unknown_generic'
            if model_instance is None:
                print(f"❌ Failed to load model instance from {model_path}.")
                return False
            self.models[model_name] = {'model': model_instance, 'path': model_path, 'type': model_type}
            return True
        except Exception as e:
            print(f"❌ Error loading model '{model_name}' from {model_path}: {e}")
            return False

    def load_test_data(self, data_path, X_columns=None, y_column=None, preprocess_for_lstm=False, scaler_path=None):
        """Read, clean, optionally scale and label-encode the evaluation data.

        Rows containing NaN or +/-inf in any column are dropped. Only numeric
        feature columns are kept. On success ``self.test_data`` is populated.

        Args:
            data_path: ``.csv``, ``.json`` or ``.jsonl`` file.
            X_columns: Feature columns; defaults to every column except the target.
            y_column: Target column; defaults to the last column of the file.
            preprocess_for_lstm: When True, X is reshaped to ``(n_samples, 1, n_features)``.
            scaler_path: Optional pickled, already-fitted scaler applied to X.

        Returns:
            True on success; False on any failure (an error is printed, and
            ``self.test_data`` is reset to None if an exception was raised).
        """
        try:
            if data_path.endswith('.csv'):
                data_df = pd.read_csv(data_path, low_memory=False)
            elif data_path.endswith(('.json', '.jsonl')):
                data_df = pd.read_json(data_path, lines=data_path.endswith('.jsonl'))
            else:
                print(f"❌ Unsupported test data format: {data_path}")
                return False

            data_df = data_df.replace([np.inf, -np.inf], np.nan).dropna()
            if data_df.empty:
                print("❌ Test data empty after NaN drop.")
                return False

            if y_column is None:
                y_column = data_df.columns[-1]
            self.y_column = y_column
            if self.y_column not in data_df.columns:
                print(f"❌ Target '{self.y_column}' not in test data.")
                return False
            y_test_raw = data_df[self.y_column]

            if X_columns is None:
                X_columns = [col for col in data_df.columns if col != self.y_column]
            self.X_columns = X_columns
            missing_X_cols = [col for col in self.X_columns if col not in data_df.columns]
            if missing_X_cols:
                print(f"❌ Features not in test data: {missing_X_cols}")
                return False

            X_test_numeric = data_df[self.X_columns].select_dtypes(include=np.number)
            if X_test_numeric.empty:
                print("❌ No numeric features in X_test.")
                return False
            X_test_processed = X_test_numeric.copy()
            # Non-numeric columns were dropped: keep the recorded names in sync.
            if X_test_numeric.shape[1] < len(self.X_columns):
                self.X_columns = X_test_numeric.columns.tolist()

            if scaler_path and os.path.exists(scaler_path):
                try:
                    # NOTE(security): unpickling runs code from the file; trusted paths only.
                    with open(scaler_path, 'rb') as f:
                        scaler = pickle.load(f)
                    # Accept any fitted scaler, not only ones exposing a non-None
                    # `mean_` (e.g. MinMaxScaler, StandardScaler(with_mean=False)).
                    if self._scaler_is_fitted(scaler):
                        scaled_values = scaler.transform(X_test_numeric)
                        X_test_processed = pd.DataFrame(
                            scaled_values, columns=X_test_numeric.columns, index=X_test_numeric.index)
                    else:
                        print(f"⚠️ Scaler from {scaler_path} not fitted. Using unscaled.")
                except Exception as e:
                    print(f"❌ Error applying scaler from {scaler_path}: {e}. Using unscaled.")
            elif scaler_path:
                print(f"⚠️ Scaler not found at {scaler_path}. Using unscaled.")

            # Imported here so the method does not rely on another cell having
            # put LabelEncoder into the notebook namespace.
            from sklearn.preprocessing import LabelEncoder
            label_encoder = LabelEncoder()
            y_test_processed = label_encoder.fit_transform(y_test_raw)

            if preprocess_for_lstm:
                if isinstance(X_test_processed, pd.DataFrame):
                    feature_values = X_test_processed.values
                else:
                    feature_values = X_test_processed
                # One timestep per sample: (n_samples, 1, n_features).
                X_test_processed = feature_values.reshape(
                    (feature_values.shape[0], 1, feature_values.shape[1]))

            self.test_data = {'X': X_test_processed, 'y': y_test_processed,
                              'label_encoder': label_encoder, 'original_X_columns': self.X_columns}
            return True
        except Exception as e:
            print(f"❌ Error loading/processing test data from {data_path}: {e}")
            self.test_data = None
            return False

    def benchmark_model(self, model_name):
        """Run one registered model on the loaded test data and record its metrics.

        The reported prediction time covers a single ``model.predict`` call only:
        garbage collection, the memory probe and the extra ``predict_proba``
        pass (used solely for the ROC curve) are excluded so that latencies are
        comparable between Keras and sklearn models.

        Args:
            model_name: Key of a model previously registered with ``load_model``.

        Returns:
            The result dict (also stored in ``self.results[model_name]``), or
            None when the model/test data is missing, the model type is not
            supported, or prediction fails.
        """
        if model_name not in self.models:
            print(f"❌ Model '{model_name}' not found.")
            return None
        if (self.test_data is None or not isinstance(self.test_data, dict)
                or 'X' not in self.test_data or 'y' not in self.test_data):
            print(f"❌ Test data not properly loaded for '{model_name}'.")
            return None

        model_info = self.models[model_name]
        model = model_info['model']
        model_type = model_info['type']
        is_keras = model_type == 'keras'
        # Reject unsupported types up front, before any measurement is set up.
        if not is_keras and not model_type.startswith('sklearn'):
            print(f"❌ Unsupported model type '{model_type}' for '{model_name}'. Skipping.")
            return None

        X_test_input = self.test_data['X']
        y_test_input = np.asarray(self.test_data['y'])
        if isinstance(X_test_input, pd.DataFrame):
            X_test_for_model = X_test_input.values
        else:
            X_test_for_model = np.array(X_test_input).copy()

        # LSTM models need (n, 1, features); everything else needs 2-D input.
        is_model_lstm_type = (is_keras and hasattr(model, 'layers')
                              and any(isinstance(layer, tf.keras.layers.LSTM) for layer in model.layers))
        if is_model_lstm_type and len(X_test_for_model.shape) == 2:
            X_test_for_model = X_test_for_model.reshape(
                X_test_for_model.shape[0], 1, X_test_for_model.shape[1])
        elif not is_model_lstm_type and len(X_test_for_model.shape) == 3:
            X_test_for_model = X_test_for_model.reshape(X_test_for_model.shape[0], -1)

        gc.collect()
        process = psutil.Process(os.getpid())
        memory_before = process.memory_info().rss / (1024 * 1024)

        y_pred_classes, y_pred_proba = None, None
        try:
            # perf_counter is monotonic and high-resolution; start it only now so
            # that the setup above is not billed to the model.
            start_time = time.perf_counter()
            if is_keras:
                y_pred_raw = model.predict(X_test_for_model, verbose=0)
                prediction_time = time.perf_counter() - start_time
                if len(y_pred_raw.shape) > 1 and y_pred_raw.shape[1] > 1:
                    # Multi-class output: one probability column per class.
                    y_pred_classes = np.argmax(y_pred_raw, axis=1)
                    y_pred_proba = y_pred_raw
                else:
                    # Single sigmoid output: threshold at 0.5 and build [P(0), P(1)].
                    positive_scores = y_pred_raw.flatten()
                    y_pred_classes = (y_pred_raw > 0.5).astype(int).flatten()
                    y_pred_proba = np.column_stack((1 - positive_scores, positive_scores))
            else:
                y_pred_classes = model.predict(X_test_for_model)
                prediction_time = time.perf_counter() - start_time
                if hasattr(model, 'predict_proba'):
                    y_pred_proba = model.predict_proba(X_test_for_model)
        except Exception as e:
            print(f"❌ Error predicting for '{model_name}': {e}")
            return None

        gc.collect()
        memory_after = process.memory_info().rss / (1024 * 1024)
        memory_used = memory_after - memory_before

        y_pred_classes = np.asarray(y_pred_classes)
        if y_test_input.ndim > 1 and y_test_input.shape[1] == 1:
            y_test_input = y_test_input.ravel()
        if y_pred_classes.ndim > 1 and y_pred_classes.shape[1] == 1:
            y_pred_classes = y_pred_classes.ravel()
        if len(y_pred_classes) != len(y_test_input):
            print(f"❌ Prediction count ({len(y_pred_classes)}) does not match "
                  f"test label count ({len(y_test_input)}) for '{model_name}'.")
            return None

        # Score over every label seen in either the truth or the predictions.
        unique_labels = np.union1d(np.unique(y_test_input), np.unique(y_pred_classes))
        labels_arg = unique_labels if len(unique_labels) > 0 else None
        weighted_kwargs = {'average': 'weighted', 'zero_division': 0, 'labels': labels_arg}

        accuracy = accuracy_score(y_test_input, y_pred_classes)
        precision = precision_score(y_test_input, y_pred_classes, **weighted_kwargs)
        recall = recall_score(y_test_input, y_pred_classes, **weighted_kwargs)
        f1 = f1_score(y_test_input, y_pred_classes, **weighted_kwargs)
        cm = confusion_matrix(y_test_input, y_pred_classes, labels=labels_arg)
        report = classification_report(y_test_input, y_pred_classes, output_dict=True,
                                       zero_division=0, labels=labels_arg)

        # ROC is only computed for binary problems with two probability columns.
        roc_auc_s, fpr_d, tpr_d = None, None, None
        if (y_pred_proba is not None and len(np.unique(y_test_input)) == 2
                and y_pred_proba.ndim == 2 and y_pred_proba.shape[1] == 2):
            fpr_d, tpr_d, _ = roc_curve(y_test_input, y_pred_proba[:, 1])
            roc_auc_s = auc(fpr_d, tpr_d)

        num_samples = len(X_test_for_model)
        res = {
            'model_name': model_name,
            'model_type': model_type,
            'accuracy': float(accuracy),
            'precision': float(precision),
            'recall': float(recall),
            'f1_score': float(f1),
            'confusion_matrix': cm.tolist(),
            'classification_report': report,
            'prediction_time_total_s': float(prediction_time),
            'prediction_time_per_sample_ms': float(prediction_time / num_samples * 1000) if num_samples > 0 else 0,
            'memory_increase_mb': float(memory_used),
            'roc_auc': float(roc_auc_s) if roc_auc_s is not None else None,
            'fpr': fpr_d.tolist() if fpr_d is not None else None,
            'tpr': tpr_d.tolist() if tpr_d is not None else None,
            'timestamp': datetime.now().isoformat(),
            'num_test_samples': num_samples,
        }
        self.results[model_name] = res
        return res

    def benchmark_all_models(self):
        """Benchmark every registered model; return ``self.results`` (or ``{}`` if nothing is loaded)."""
        if not self.models:
            print("No models loaded.")
            return {}
        if self.test_data is None:
            print("No test data loaded.")
            return {}
        # Iterate over a snapshot so the registry may change during a run.
        for model_name in list(self.models):
            self.benchmark_model(model_name)
        return self.results

    def compare_models(self):
        """Rank the benchmarked models per metric.

        Returns:
            None when there are no results; otherwise a dict with
            ``'metrics_data'`` (metric -> {model: value}, None values omitted),
            ``'best_models_summary'`` (metric -> {'model', 'value'}) and
            ``'timestamp'``. Latency and memory are minimised, all other
            metrics maximised.
        """
        if not self.results:
            print("No benchmark results to compare.")
            return None
        metrics_to_compare = ['accuracy', 'precision', 'recall', 'f1_score',
                              'prediction_time_per_sample_ms', 'memory_increase_mb', 'roc_auc']
        comparison_data = {metric: {} for metric in metrics_to_compare}
        best_models_summary = {metric: {'model': None, 'value': None} for metric in metrics_to_compare}

        for model_name, result in self.results.items():
            for metric in metrics_to_compare:
                value = result.get(metric)
                if value is not None:
                    comparison_data[metric][model_name] = value

        for metric in metrics_to_compare:
            per_model = comparison_data[metric]
            if not per_model:
                continue
            pick = min if metric in self._LOWER_IS_BETTER else max
            best_model_name = pick(per_model, key=per_model.get)
            best_models_summary[metric]['model'] = best_model_name
            best_models_summary[metric]['value'] = per_model[best_model_name]

        return {'metrics_data': comparison_data, 'best_models_summary': best_models_summary,
                'timestamp': datetime.now().isoformat()}

    def plot_comparison(self, metrics_to_plot=None, save_path=None, title_suffix=""):
        """Draw a grouped bar chart of the chosen metrics for all benchmarked models.

        Args:
            metrics_to_plot: Result keys to plot; defaults to accuracy and F1.
            save_path: When given, the figure is saved there (the parent
                directory is created if needed) and closed; otherwise it is shown.
            title_suffix: Text appended to the figure title.

        Returns:
            ``save_path`` when the figure was saved, else None.
        """
        if not self.results:
            print("No results to plot.")
            return None
        if metrics_to_plot is None:
            metrics_to_plot = ['accuracy', 'f1_score']

        df_data = []
        valid_metrics_plotted = []
        for model_name, res in self.results.items():
            row = {'Model': model_name}
            for metric in metrics_to_plot:
                value = res.get(metric)
                if value is None:
                    continue
                row[metric.replace('_', ' ').title()] = value
                if metric not in valid_metrics_plotted:
                    valid_metrics_plotted.append(metric)
            if len(row) > 1:
                df_data.append(row)
        if not df_data:
            print(f"No data to plot for metrics: {metrics_to_plot}")
            return None

        plot_df = pd.DataFrame(df_data).set_index('Model')
        # Keep only metrics that produced data, in the requested order.
        valid_metric_titles = [m.replace('_', ' ').title() for m in valid_metrics_plotted]
        plot_df = plot_df[[col for col in valid_metric_titles if col in plot_df.columns]]
        plot_df = plot_df.dropna(axis=1, how='all')
        if plot_df.empty:
            print(f"Not enough valid data for comparison plot for metrics: {valid_metrics_plotted}")
            return None

        # Widen the figure slightly for every additional model.
        ax = plot_df.plot(kind='bar', figsize=(8 + len(plot_df.index) * 0.5, 6), rot=30, width=0.8)
        ax.set_title(f"Model Comparison: {', '.join(plot_df.columns)} {title_suffix}")
        ax.set_ylabel("Score / Value")
        fig = ax.get_figure()
        fig.tight_layout()
        if save_path:
            self._ensure_parent_dir(save_path)
            fig.savefig(save_path)
            print(f"Plot saved to {save_path}")
            plt.close(fig)
            return save_path
        plt.show()
        return None

    def plot_confusion_matrices(self, save_dir=None):
        """Draw one confusion-matrix heatmap per benchmarked model.

        Args:
            save_dir: When given, each heatmap is written to
                ``<save_dir>/cm_<model_name>.png`` (the directory is created if
                needed, and path separators in the model name are replaced by
                ``_``); otherwise the figures are shown.

        Returns:
            The list of written paths, or None when nothing was saved.
        """
        if not self.results:
            print("No results for CMs.")
            return None
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)

        saved_paths = []
        for model_name, res in self.results.items():
            cm = np.array(res.get('confusion_matrix', []))
            if cm.size == 0:
                continue
            fig, ax = plt.subplots(figsize=(5, 4))
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
            ax.set_title(f'CM: {model_name}')
            ax.set_ylabel('True')
            ax.set_xlabel('Predicted')
            fig.tight_layout()
            if save_dir:
                # A separator inside the name would otherwise point into a sub-directory.
                safe_name = model_name.replace('/', '_').replace('\\', '_')
                path = os.path.join(save_dir, f"cm_{safe_name}.png")
                fig.savefig(path)
                saved_paths.append(path)
                plt.close(fig)
            else:
                plt.show()
        return saved_paths if save_dir and saved_paths else None

    def plot_roc_curves(self, save_path=None):
        """Overlay the ROC curves of every model that has ROC data in its result.

        Args:
            save_path: When given, the figure is saved there (the parent
                directory is created if needed) and closed; otherwise it is shown.

        Returns:
            ``save_path`` when the figure was saved, else None.
        """
        if not self.results:
            print("No results for ROC.")
            return None

        fig, ax = plt.subplots(figsize=(8, 6))
        any_roc_plotted = False
        for model_name, res in self.results.items():
            fpr, tpr, roc_auc = res.get('fpr'), res.get('tpr'), res.get('roc_auc')
            if fpr is not None and tpr is not None and roc_auc is not None:
                ax.plot(fpr, tpr, label=f'{model_name} (AUC = {roc_auc:.3f})')
                any_roc_plotted = True
        if not any_roc_plotted:
            print("No ROC curve data available in results.")
            plt.close(fig)
            return None

        ax.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title('ROC Curves')
        ax.legend(loc='lower right')
        fig.tight_layout()
        if save_path:
            self._ensure_parent_dir(save_path)
            fig.savefig(save_path)
            print(f"ROC plot saved: {save_path}")
            plt.close(fig)
            return save_path
        plt.show()
        return None

    def plot_precision_recall_curves(self, save_dir=None):
        """Placeholder: results do not store the scores a PR curve needs. Always returns None."""
        print("plot_precision_recall_curves: Needs y_scores. Placeholder.")
        return None

    def plot_latency_comparison(self, save_path=None):
        """Bar chart of per-sample prediction latency; see ``plot_comparison``."""
        return self.plot_comparison(metrics_to_plot=['prediction_time_per_sample_ms'],
                                    save_path=save_path, title_suffix="- Latency (ms/sample)")

    def plot_memory_comparison(self, save_path=None):
        """Bar chart of the process memory increase during prediction; see ``plot_comparison``."""
        return self.plot_comparison(metrics_to_plot=['memory_increase_mb'],
                                    save_path=save_path, title_suffix="- Memory Increase (MB)")

# --- Test Block for ModelBenchmarker (Part 3: Plotting) ---
# Smoke test for the comparison/plotting methods. It only runs inside Google
# Colab. It makes sure a benchmarker with at least one model, test data and
# benchmark results exists, then writes every plot type to a plots directory.
if __name__ == "__main__" and 'google.colab' in sys.modules:
    print("\n--- Testing ModelBenchmarker (Part 3: Plotting) ---")

    # NOTE(review): the class statement above re-binds ModelBenchmarker, so an instance created
    # by an earlier cell is not an instance of this new class and the isinstance check fails.
    # A fresh run of this cell therefore always takes the re-initialisation branch and discards
    # any previously loaded models, data and results (the recorded output shows this).
    if 'benchmarker' not in globals() or not isinstance(benchmarker, ModelBenchmarker):
        print("⚠️ ModelBenchmarker instance not found from Section 2. Re-initializing for this test.")
        benchmarker = ModelBenchmarker()
        # The dummy sklearn model lives in MODEL_DIR when that global exists, else in the working directory.
        dummy_sklearn_model_path_s4 = os.path.join(MODEL_DIR if 'MODEL_DIR' in globals() else ".", "dummy_sklearn_model.pkl")
        if "Dummy_Sklearn_Model" not in benchmarker.models:
            if not os.path.exists(dummy_sklearn_model_path_s4):
                from sklearn.linear_model import LogisticRegression
                # Throw-away classifier fitted on 2 random, unseeded features.
                temp_model_sk = LogisticRegression(); temp_model_sk.fit(np.random.rand(20,2), np.random.randint(0,2,20))
                # NOTE(review): the target directory is not created here, so this open() fails
                # if MODEL_DIR does not exist yet — confirm it is created by an earlier cell.
                with open(dummy_sklearn_model_path_s4, 'wb') as f: pickle.dump(temp_model_sk, f)
            benchmarker.load_model(dummy_sklearn_model_path_s4, "Dummy_Sklearn_Model")

        if benchmarker.test_data is None:
            print("Plotting Test: No test data in benchmarker from Section 2. Creating minimal dummy data.")
            # 50 random samples with 2 features, matching the dummy model's input width.
            dummy_X_plot_df = pd.DataFrame(np.random.rand(50, 2), columns=['dummy_feat1', 'dummy_feat2'])
            dummy_y_plot_raw = pd.Series(np.random.choice(['Normal', 'Attack'], 50))
            le_plot = LabelEncoder()
            # Same dict layout that load_test_data stores in self.test_data.
            benchmarker.test_data = {
                'X': dummy_X_plot_df.values,
                'y': le_plot.fit_transform(dummy_y_plot_raw.values),
                'label_encoder': le_plot, 'original_X_columns': ['dummy_feat1', 'dummy_feat2']
            }
            benchmarker.X_columns = ['dummy_feat1', 'dummy_feat2']; benchmarker.y_column = 'label'
            print(f"Dummy test data created for plotting: X shape {benchmarker.test_data['X'].shape}, y shape {benchmarker.test_data['y'].shape}")

    if benchmarker.models and benchmarker.test_data:
        if not benchmarker.results:
            print("Plotting Test: No benchmark results found. Attempting to run benchmarks now...")
            # Determine which models can run on current self.test_data['X']
            # NOTE(review): for data loaded with preprocess_for_lstm=True, X has shape
            # (n, 1, features), so shape[1] is the timestep axis (1), not the feature count.
            features_in_test_data = benchmarker.test_data['X'].shape[1]

            # The LSTM is only benchmarked when the data has the 27 features it was presumably trained on.
            if "LSTM_IDS_Model" in benchmarker.models and features_in_test_data == 27:
                print("Benchmarking LSTM_IDS_Model...")
                benchmarker.benchmark_model("LSTM_IDS_Model")
            elif "LSTM_IDS_Model" in benchmarker.models :
                 print(f"Skipping LSTM benchmark for plotting test: test data has {features_in_test_data} features, LSTM might expect 27.")


            if "Dummy_Sklearn_Model" in benchmarker.models:
                if features_in_test_data == 2: # If current test data is 2-feature dummy
                    print("Benchmarking Dummy_Sklearn_Model on current 2-feature test data...")
                    benchmarker.benchmark_model("Dummy_Sklearn_Model")
                else: # Create specific 2-feature test data for the dummy model
                     print("Current test data has >2 features. Creating specific 2-feature test for Dummy_Sklearn_Model.")
                     # Temporarily swap in 2-feature data, benchmark, then restore the real test data.
                     # NOTE(review): the restore is not in a try/finally; an exception raised by
                     # benchmark_model would leave the dummy data installed.
                     original_test_data_backup = benchmarker.test_data # Backup original
                     X_dummy_test_for_plot = np.random.rand(30,2)
                     y_dummy_test_for_plot = np.random.randint(0,2,30)
                     benchmarker.test_data = {'X': X_dummy_test_for_plot, 'y': y_dummy_test_for_plot,
                                              'label_encoder': LabelEncoder().fit(y_dummy_test_for_plot)}
                     benchmarker.benchmark_model("Dummy_Sklearn_Model")
                     benchmarker.test_data = original_test_data_backup # Restore

        if benchmarker.results:
            print("\n-- Generating Comparison Plots (using available results) --")
            # Use PLOTS_DIR when it is defined and exists; otherwise fall back to a local folder.
            plots_save_dir = PLOTS_DIR if 'PLOTS_DIR' in globals() and os.path.exists(PLOTS_DIR) else "benchmark_plots_output_s4"
            os.makedirs(plots_save_dir, exist_ok=True)

            # One file per plot type; plot_confusion_matrices writes one file per model.
            benchmarker.plot_comparison(save_path=os.path.join(plots_save_dir, "test_perf_compare.png"))
            benchmarker.plot_latency_comparison(save_path=os.path.join(plots_save_dir, "test_latency_comparison.png"))
            benchmarker.plot_memory_comparison(save_path=os.path.join(plots_save_dir, "test_memory_comparison.png"))
            benchmarker.plot_confusion_matrices(save_dir=plots_save_dir)
            benchmarker.plot_roc_curves(save_path=os.path.join(plots_save_dir, "test_roc_curves.png"))
        else:
            print("⚠️ Plotting Test: Still no benchmark results after trying to run benchmarks. Plots will be skipped.")
    else:
        print("⚠️ Plotting Test: Models or test data not available in benchmarker. Plots cannot be generated.")

    print("\n--- End of ModelBenchmarker (Part 3) Test ---")

# Printed unconditionally once the cell has executed, whether or not the Colab-only test above ran.
print("\n✅ Section 4 (Benchmarking - `ModelBenchmarker` Class - Part 3: Comparison & Plotting) is ready.")


--- Testing ModelBenchmarker (Part 3: Plotting) ---
⚠️ ModelBenchmarker instance not found from Section 2. Re-initializing for this test.
Plotting Test: No test data in benchmarker from Section 2. Creating minimal dummy data.
Dummy test data created for plotting: X shape (50, 2), y shape (50,)
Plotting Test: No benchmark results found. Attempting to run benchmarks now...
Benchmarking Dummy_Sklearn_Model on current 2-feature test data...

-- Generating Comparison Plots (using available results) --
Plot saved to /content/drive/MyDrive/IDS_AI_Suite/model_benchmarking_outputs/benchmark_outputs/plots/test_perf_compare.png
Plot saved to /content/drive/MyDrive/IDS_AI_Suite/model_benchmarking_outputs/benchmark_outputs/plots/test_latency_comparison.png
Plot saved to /content/drive/MyDrive/IDS_AI_Suite/model_benchmarking_outputs/benchmark_outputs/plots/test_memory_comparison.png
ROC plot saved: /content/drive/MyDrive/IDS_AI_Suite/model_benchmarking_outputs/benchmark_outputs/plots/test_roc_curve

In [20]:
# Imports from Section 1 should still be in effect.
# ModelBenchmarker class (Parts 1, 2 & 3) should be defined from previous sections.
# Ensure os, json, datetime are available (should be from Section 1 imports).

class ModelBenchmarker:
    """
    Class for benchmarking and comparing different machine learning models
    for intrusion detection.
    (Includes methods from Parts 1, 2 & 3, and adds new methods from Part 4)

    Typical flow: ``load_model`` (one call per model) -> ``load_test_data`` ->
    ``benchmark_all_models`` -> ``generate_comparison_report``.

    Attributes:
        models: ``{model_name: {'model': obj, 'path': str, 'type': str}}``.
        results: ``{model_name: result_dict}`` as produced by ``benchmark_model``.
        test_data: ``None`` or a dict with keys ``'X'``, ``'y'``,
            ``'label_encoder'`` and ``'feature_names'``.
        X_columns: feature column names actually used (numeric columns only).
        y_column: name of the target column.
    """

    # Characters that must not appear in a file name derived from a model name.
    _UNSAFE_FILENAME_CHARS = '\\/:*?"<>|'

    def __init__(self):
        self.models = {}
        self.results = {}
        self.test_data = None
        self.X_columns = None
        self.y_column = None
        # print("ModelBenchmarker Initialized.") # Keep less verbose

    @staticmethod
    def _ensure_parent_dir(file_path):
        """Create the directory that will contain ``file_path`` if it is missing."""
        parent = os.path.dirname(file_path)
        if parent:
            os.makedirs(parent, exist_ok=True)

    def load_model(self, model_path, model_name=None):
        """
        Load a model from disk and register it under ``model_name``.

        ``.h5`` / ``.keras`` files are loaded with Keras; everything else is
        unpickled. A pickled dict containing a ``'model'`` key is unwrapped.

        Args:
            model_path: path to the serialized model.
            model_name: registry key; defaults to the file name up to its first dot.

        Returns:
            True if the model was registered, False otherwise (an error is printed).
        """
        if not os.path.exists(model_path):
            print(f"❌ Error: Model file not found at {model_path}")
            return False
        if model_name is None:
            model_name = os.path.basename(model_path).split('.')[0]
        try:
            if model_path.endswith(('.h5', '.keras')):
                # Go through the `tf` module imported in Section 1 instead of an
                # alias that the import cell does not define.
                model_instance = tf.keras.models.load_model(model_path)
                model_type = 'keras'
            elif model_path.endswith(('.pkl', '.joblib')):
                # SECURITY: unpickling executes arbitrary code - only load trusted files.
                with open(model_path, 'rb') as f:
                    loaded_object = pickle.load(f)
                if isinstance(loaded_object, dict) and 'model' in loaded_object:
                    model_instance = loaded_object['model']
                else:
                    model_instance = loaded_object
                model_type = 'sklearn' if hasattr(model_instance, 'predict') else 'unknown_pickle'
            else:
                # Unknown extension: best-effort attempt to treat the file as a pickle.
                with open(model_path, 'rb') as f:
                    model_instance = pickle.load(f)
                model_type = 'sklearn_generic' if hasattr(model_instance, 'predict') else 'unknown_generic'
            if model_instance is None:
                print(f"❌ Failed to load model instance from {model_path}.")
                return False
            self.models[model_name] = {'model': model_instance, 'path': model_path, 'type': model_type}
            return True
        except Exception as e:
            print(f"❌ Error loading model '{model_name}' from {model_path}: {e}")
            return False

    def load_test_data(self, data_path, X_columns=None, y_column=None,
                       test_only=True, test_size_if_split=0.2,
                       scaler_path=None, reshape_for_lstm_load_time=False):
        """
        Load, clean, optionally split/scale, and label-encode the evaluation data.

        Args:
            data_path: ``.csv``, ``.json`` or ``.jsonl`` file.
            X_columns: feature columns; defaults to every column except the target.
                Non-numeric feature columns are dropped.
            y_column: target column; defaults to the last column of the file.
            test_only: if False, only a ``test_size_if_split`` hold-out
                (``random_state=42``, stratified when possible) is kept.
            test_size_if_split: hold-out fraction used when ``test_only`` is False.
            scaler_path: optional pickled transformer; applied when it exposes
                ``transform``. On any failure the unscaled features are used.
            reshape_for_lstm_load_time: reshape X to ``(samples, 1, features)``.

        Returns:
            True on success (``self.test_data`` is populated), False otherwise.
        """
        try:
            # Imported here because the notebook's import cell only pulls in sklearn.metrics.
            from sklearn.model_selection import train_test_split
            from sklearn.preprocessing import LabelEncoder

            if data_path.endswith('.csv'):
                data_df = pd.read_csv(data_path, low_memory=False)
            elif data_path.endswith(('.json', '.jsonl')):
                data_df = pd.read_json(data_path, lines=data_path.endswith('.jsonl'))
            else:
                print(f"❌ Unsupported test data file format: {data_path}")
                return False

            if y_column is None:
                y_column = data_df.columns[-1]
            self.y_column = y_column
            if self.y_column not in data_df.columns:
                print(f"❌ Target column '{self.y_column}' not found.")
                return False

            if X_columns is None:
                X_columns = [col for col in data_df.columns if col != self.y_column]
            self.X_columns = X_columns
            missing_X_cols = [col for col in self.X_columns if col not in data_df.columns]
            if missing_X_cols:
                print(f"❌ Features not found: {missing_X_cols}")
                return False

            # Treat +/-inf as missing, then drop rows missing any used column.
            cols_to_check_dropna = list(set([self.y_column, *self.X_columns]))
            data_df = data_df.replace([np.inf, -np.inf], np.nan).dropna(subset=cols_to_check_dropna)
            if data_df.empty:
                print("❌ Test data empty after NaN drop.")
                return False

            if not test_only:
                # Stratification needs at least two classes.
                stratify_col = data_df[self.y_column] if data_df[self.y_column].nunique() > 1 else None
                _, data_df = train_test_split(
                    data_df, test_size=test_size_if_split, random_state=42, stratify=stratify_col
                )

            y_test_raw = data_df[self.y_column]
            X_test_numeric = data_df[self.X_columns].select_dtypes(include=np.number)
            if X_test_numeric.empty:
                print("❌ No numeric features in X_test.")
                return False
            self.X_columns = X_test_numeric.columns.tolist()
            X_test_processed = X_test_numeric.copy()

            if scaler_path and os.path.exists(scaler_path):
                try:
                    # SECURITY: unpickling executes arbitrary code - only load trusted files.
                    with open(scaler_path, 'rb') as f:
                        scaler = pickle.load(f)
                    # Accept any transformer, not only ones exposing a non-None `mean_`
                    # (which excluded MinMaxScaler and StandardScaler(with_mean=False)).
                    # An unfitted scaler raises inside transform() and is handled below.
                    if hasattr(scaler, 'transform'):
                        scaled_values = scaler.transform(X_test_processed)
                        X_test_processed = pd.DataFrame(
                            scaled_values, columns=X_test_processed.columns, index=X_test_processed.index
                        )
                    else:
                        print(f"⚠️ Object loaded from {scaler_path} has no transform(). Using unscaled.")
                except Exception as e:
                    print(f"❌ Error applying scaler from {scaler_path}: {e}. Using unscaled.")
            elif scaler_path:
                print(f"⚠️ Scaler not found at {scaler_path}. Using unscaled.")

            # NOTE(review): the encoder is fitted on the evaluation labels, so the
            # integer ids follow the sorted labels of THIS file - confirm that this
            # matches the encoding the models were trained with.
            label_encoder = LabelEncoder()
            y_test_processed = label_encoder.fit_transform(y_test_raw)

            X_test_final_np = X_test_processed.values
            if reshape_for_lstm_load_time:
                X_test_final_np = X_test_final_np.reshape((X_test_final_np.shape[0], 1, X_test_final_np.shape[1]))

            self.test_data = {
                'X': X_test_final_np,
                'y': y_test_processed,
                'label_encoder': label_encoder,
                'feature_names': self.X_columns,
            }
            return True
        except Exception as e:
            print(f"❌ Error in load_test_data: {e}")
            self.test_data = None
            return False

    def benchmark_model(self, model_name):
        """
        Run one registered model on the loaded test data and record its metrics.

        Measures wall-clock prediction time and the change in process RSS, then
        computes accuracy, weighted precision/recall/F1, the confusion matrix,
        the classification report and (binary problems with probabilities only)
        the ROC curve.

        Returns:
            The result dict (also stored in ``self.results[model_name]``), or
            None if the model/data is missing, the model type is unsupported,
            or prediction fails.
        """
        if model_name not in self.models:
            print(f"❌ Model '{model_name}' not found.")
            return None
        if self.test_data is None:
            print(f"❌ Test data not loaded for '{model_name}'.")
            return None

        model_info = self.models[model_name]
        model = model_info['model']
        model_type = model_info['type']
        if model_type != 'keras' and not model_type.startswith('sklearn'):
            print(f"❌ Unsupported model type '{model_type}' for '{model_name}'.")
            return None

        y_test_true = np.asarray(self.test_data['y'])
        X_test_for_model = np.array(self.test_data['X']).copy()

        # LSTM models expect (samples, timesteps, features); everything else gets 2-D input.
        is_model_lstm_type = (
            model_type == 'keras'
            and hasattr(model, 'layers')
            and any(isinstance(layer, tf.keras.layers.LSTM) for layer in model.layers)
        )
        if is_model_lstm_type and X_test_for_model.ndim == 2:
            X_test_for_model = X_test_for_model.reshape(X_test_for_model.shape[0], 1, X_test_for_model.shape[1])
        elif not is_model_lstm_type and X_test_for_model.ndim == 3 and X_test_for_model.shape[1] == 1:
            X_test_for_model = X_test_for_model.reshape(X_test_for_model.shape[0], -1)

        gc.collect()
        process = psutil.Process(os.getpid())
        memory_before = process.memory_info().rss / (1024 * 1024)

        y_pred_proba = None
        # Start the clock only after the GC / psutil setup so it is not billed to the model.
        start_time = time.perf_counter()
        try:
            if model_type == 'keras':
                y_pred_raw = model.predict(X_test_for_model, verbose=0)
                if y_pred_raw.ndim > 1 and y_pred_raw.shape[1] > 1:
                    # One output unit per class.
                    y_pred_classes = np.argmax(y_pred_raw, axis=1)
                    y_pred_proba = y_pred_raw
                else:
                    # Single output unit: treated as P(class 1), thresholded at 0.5.
                    positive_proba = y_pred_raw.ravel()
                    y_pred_classes = (positive_proba > 0.5).astype(int)
                    y_pred_proba = np.column_stack((1 - positive_proba, positive_proba))
            else:
                y_pred_classes = model.predict(X_test_for_model)
                if hasattr(model, 'predict_proba'):
                    y_pred_proba = model.predict_proba(X_test_for_model)
        except ValueError as ve:
            print(f"❌ ValueError predicting for '{model_name}': {ve}")
            return None
        except Exception as e:
            print(f"❌ Error predicting for '{model_name}': {e}")
            return None
        prediction_time = time.perf_counter() - start_time

        gc.collect()
        memory_after = process.memory_info().rss / (1024 * 1024)
        memory_used = memory_after - memory_before

        y_pred_classes = np.asarray(y_pred_classes)
        if y_test_true.ndim > 1 and y_test_true.shape[1] == 1:
            y_test_true = y_test_true.ravel()
        if y_pred_classes.ndim > 1 and y_pred_classes.shape[1] == 1:
            y_pred_classes = y_pred_classes.ravel()

        class_names = self.test_data['label_encoder'].classes_
        label_ids = np.arange(len(class_names))
        weighted = dict(average='weighted', zero_division=0, labels=label_ids)

        accuracy = accuracy_score(y_test_true, y_pred_classes)
        precision = precision_score(y_test_true, y_pred_classes, **weighted)
        recall = recall_score(y_test_true, y_pred_classes, **weighted)
        f1 = f1_score(y_test_true, y_pred_classes, **weighted)
        cm = confusion_matrix(y_test_true, y_pred_classes, labels=label_ids)
        report = classification_report(
            y_test_true, y_pred_classes, output_dict=True, zero_division=0,
            labels=label_ids, target_names=[str(cls) for cls in class_names]
        )

        roc_auc_s, fpr_d, tpr_d = None, None, None
        if (y_pred_proba is not None and len(class_names) == 2
                and y_pred_proba.ndim == 2 and y_pred_proba.shape[1] == 2):
            fpr_d, tpr_d, _ = roc_curve(y_test_true, y_pred_proba[:, 1])
            roc_auc_s = auc(fpr_d, tpr_d)

        num_samples = len(X_test_for_model)
        res = {
            'model_name': model_name,
            'model_type': model_type,
            'accuracy': float(accuracy),
            'precision': float(precision),
            'recall': float(recall),
            'f1_score': float(f1),
            'confusion_matrix': cm.tolist(),
            'classification_report': report,
            'prediction_time_total_s': float(prediction_time),
            'prediction_time_per_sample_ms': float(prediction_time / num_samples * 1000) if num_samples > 0 else 0.0,
            'memory_increase_mb': float(memory_used),
            'roc_auc': float(roc_auc_s) if roc_auc_s is not None else None,
            'fpr': fpr_d.tolist() if fpr_d is not None else None,
            'tpr': tpr_d.tolist() if tpr_d is not None else None,
            'timestamp': datetime.now().isoformat(),
            'num_test_samples': num_samples,
        }
        self.results[model_name] = res
        return res

    def benchmark_all_models(self):
        """Benchmark every registered model; returns ``self.results`` (``{}`` if nothing is loaded)."""
        if not self.models:
            print("No models loaded.")
            return {}
        if self.test_data is None:
            print("No test data loaded.")
            return {}
        for model_name_key in list(self.models.keys()):
            self.benchmark_model(model_name_key)
        return self.results

    def compare_models(self):
        """
        Collect the headline metrics per model and pick the best model per metric.

        Lower is better for latency and memory; higher is better for the rest.
        Metrics that are missing or None for a model are skipped for that model.

        Returns:
            None if there are no results, otherwise a dict with keys
            ``'metrics_data'`` (``{metric: {model: value}}``),
            ``'best_models_summary'`` (``{metric: {'model', 'value'}}``)
            and ``'timestamp'``.
        """
        if not self.results:
            print("No benchmark results to compare.")
            return None

        metrics_to_compare = ['accuracy', 'precision', 'recall', 'f1_score',
                              'prediction_time_per_sample_ms', 'memory_increase_mb', 'roc_auc']
        lower_is_better = {'prediction_time_per_sample_ms', 'memory_increase_mb'}

        comparison_data = {
            metric: {
                model_name: result[metric]
                for model_name, result in self.results.items()
                if result.get(metric) is not None
            }
            for metric in metrics_to_compare
        }

        best_models_summary = {}
        for metric in metrics_to_compare:
            per_model = comparison_data[metric]
            if not per_model:
                best_models_summary[metric] = {'model': None, 'value': None}
                continue
            chooser = min if metric in lower_is_better else max
            best_model_name = chooser(per_model, key=per_model.get)
            best_models_summary[metric] = {'model': best_model_name, 'value': per_model[best_model_name]}

        return {
            'metrics_data': comparison_data,
            'best_models_summary': best_models_summary,
            'timestamp': datetime.now().isoformat(),
        }

    def plot_comparison(self, metrics_to_plot=None, save_path=None, title_suffix=""):
        """
        Draw a grouped bar chart of the given metrics across all benchmarked models.

        Args:
            metrics_to_plot: result keys to plot; defaults to accuracy and F1.
            save_path: if given, the figure is written there and closed;
                otherwise it is shown.
            title_suffix: text appended to the chart title.

        Returns:
            ``save_path`` when a file was written, otherwise None.
        """
        if not self.results:
            print("No results to plot.")
            return None
        if metrics_to_plot is None:
            metrics_to_plot = ['accuracy', 'f1_score']

        rows = []
        valid_metrics_plotted = []
        for model_name, res in self.results.items():
            row = {'Model': model_name}
            for metric in metrics_to_plot:
                value = res.get(metric)
                if value is None:
                    continue
                row[metric.replace('_', ' ').title()] = value
                if metric not in valid_metrics_plotted:
                    valid_metrics_plotted.append(metric)
            if len(row) > 1:
                rows.append(row)
        if not rows:
            print(f"No data to plot for metrics: {metrics_to_plot}")
            return None

        plot_df = pd.DataFrame(rows).set_index('Model')
        valid_metric_titles = [
            m.replace('_', ' ').title() for m in valid_metrics_plotted
            if m.replace('_', ' ').title() in plot_df.columns
        ]
        if not valid_metric_titles:
            print(f"No valid metric titles for plot: {metrics_to_plot}")
            return None
        plot_df = plot_df[valid_metric_titles].dropna(axis=1, how='all')
        if plot_df.empty:
            print(f"Not enough valid data for comparison plot for metrics: {valid_metrics_plotted}")
            return None

        fig, ax = plt.subplots(figsize=(8 + len(plot_df.index) * 0.5, 6))
        plot_df.plot(kind='bar', ax=ax, rot=30, width=0.8)
        ax.set_title(f"Model Comparison: {', '.join(plot_df.columns)} {title_suffix}")
        ax.set_ylabel("Score / Value")
        fig.tight_layout()
        if save_path:
            try:
                self._ensure_parent_dir(save_path)
                fig.savefig(save_path)
            finally:
                plt.close(fig)  # never leak the figure, even if saving fails
            print(f"Plot saved to {save_path}")
            return save_path
        plt.show()
        return None

    def plot_confusion_matrices(self, save_dir=None):
        """
        Draw one confusion-matrix heatmap per benchmarked model.

        Args:
            save_dir: if given, each heatmap is saved as ``cm_<model>.png`` in it
                (created if missing); otherwise the figures are shown.

        Returns:
            List of saved file paths, or None if nothing was saved.
        """
        if not self.results:
            print("No results for CMs.")
            return None
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)

        saved_paths = []
        for model_name, res in self.results.items():
            cm = np.array(res.get('confusion_matrix', []))
            if cm.size == 0:
                continue
            fig, ax = plt.subplots(figsize=(6, 5))
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
            ax.set_title(f'CM: {model_name}')
            ax.set_ylabel('True')
            ax.set_xlabel('Predicted')
            fig.tight_layout()
            if save_dir:
                # Model names may contain path separators or other characters that
                # are invalid in file names; replace just those.
                safe_name = ''.join(
                    '_' if ch in self._UNSAFE_FILENAME_CHARS else ch for ch in str(model_name)
                )
                path = os.path.join(save_dir, f"cm_{safe_name}.png")
                try:
                    fig.savefig(path)
                finally:
                    plt.close(fig)
                saved_paths.append(path)
            else:
                plt.show()
        return saved_paths if save_dir and saved_paths else None

    def plot_roc_curves(self, save_path=None):
        """
        Overlay the ROC curves of all models that have ROC data.

        ROC data is only recorded by ``benchmark_model`` for binary problems
        with class probabilities.

        Returns:
            ``save_path`` when a file was written, otherwise None.
        """
        if not self.results:
            print("No results for ROC.")
            return None

        fig, ax = plt.subplots(figsize=(8, 6))
        any_roc_plotted = False
        for model_name, res in self.results.items():
            fpr_list, tpr_list, roc_auc_val = res.get('fpr'), res.get('tpr'), res.get('roc_auc')
            if isinstance(fpr_list, list) and isinstance(tpr_list, list) and isinstance(roc_auc_val, float):
                ax.plot(fpr_list, tpr_list, label=f'{model_name} (AUC = {roc_auc_val:.3f})')
                any_roc_plotted = True
        if not any_roc_plotted:
            print("No valid ROC curve data in results.")
            plt.close(fig)
            return None

        ax.plot([0, 1], [0, 1], 'k--')  # chance line
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title('ROC Curves')
        ax.legend(loc='lower right')
        fig.tight_layout()
        if save_path:
            try:
                self._ensure_parent_dir(save_path)
                fig.savefig(save_path)
            finally:
                plt.close(fig)
            print(f"ROC plot saved: {save_path}")
            return save_path
        plt.show()
        return None

    def plot_precision_recall_curves(self, save_dir=None): # Placeholder from original script
        """Placeholder: prints why PR curves cannot be drawn yet and returns None."""
        print("plot_precision_recall_curves: Functionality to plot actual curves requires storing y_scores " \
              "from each model's prediction probabilities. Currently, only average precision might be available via classification_report if calculated.")
        return None

    def plot_latency_comparison(self, save_path=None):
        """Bar chart of per-sample prediction latency; see ``plot_comparison``."""
        return self.plot_comparison(metrics_to_plot=['prediction_time_per_sample_ms'], save_path=save_path, title_suffix="- Latency (ms/sample)")

    def plot_memory_comparison(self, save_path=None):
        """Bar chart of RSS increase during prediction; see ``plot_comparison``."""
        return self.plot_comparison(metrics_to_plot=['memory_increase_mb'], save_path=save_path, title_suffix="- Memory Increase (MB)")

    # --- New methods for Section 5 ---
    def generate_comparison_report(self, output_dir=None):
        """
        Generate a comprehensive comparison report with plots and metrics.
        Saves JSON results and calls plotting functions.

        Args:
            output_dir: report directory. When None, a timestamped
                ``model_benchmark_report_<ts>`` folder is created under the
                global ``RESULTS_DIR`` if it is defined and exists, else under ".".

        Returns:
            None if there are no results, otherwise a dict with keys
            ``'report_directory'``, ``'all_results_json'``,
            ``'comparison_summary_json'``, ``'plot_paths_dict'`` and
            ``'html_report_path'``.
        """
        if not self.results:
            print("No benchmark results available to generate a report.")
            return None

        if output_dir is None:
            report_time_str = datetime.now().strftime('%Y%m%d_%H%M%S')
            base_results_dir = RESULTS_DIR if 'RESULTS_DIR' in globals() and os.path.exists(RESULTS_DIR) else "."
            output_dir = os.path.join(base_results_dir, f"model_benchmark_report_{report_time_str}")

        os.makedirs(output_dir, exist_ok=True)
        current_report_plots_dir = os.path.join(output_dir, "plots")
        os.makedirs(current_report_plots_dir, exist_ok=True)

        print(f"\nGenerating comparison report in: {output_dir}")

        comparison_summary = self.compare_models()

        plot_paths = {}
        plot_paths['performance'] = self.plot_comparison(
            metrics_to_plot=['accuracy', 'precision', 'recall', 'f1_score'],
            save_path=os.path.join(current_report_plots_dir, "performance_metrics_comparison.png")
        )
        plot_paths['latency'] = self.plot_latency_comparison(
            save_path=os.path.join(current_report_plots_dir, "latency_comparison.png")
        )
        plot_paths['memory'] = self.plot_memory_comparison(
            save_path=os.path.join(current_report_plots_dir, "memory_usage_comparison.png")
        )
        plot_paths['confusion_matrices_list'] = self.plot_confusion_matrices(save_dir=current_report_plots_dir)
        plot_paths['roc_curves_combined'] = self.plot_roc_curves(
            save_path=os.path.join(current_report_plots_dir, "roc_curves_comparison.png")
        )

        # NpEncoder is the module-level JSON encoder defined in this notebook.
        all_results_path = os.path.join(output_dir, "all_benchmark_results.json")
        try:
            with open(all_results_path, 'w') as f:
                json.dump(self.results, f, indent=2, cls=NpEncoder)
            print(f"Detailed benchmark results saved to: {all_results_path}")
        except Exception as e:
            print(f"❌ Error saving detailed benchmark results: {e}")

        comparison_summary_path = None
        if comparison_summary:
            comparison_summary_path = os.path.join(output_dir, "comparison_summary_metrics.json")
            try:
                with open(comparison_summary_path, 'w') as f:
                    json.dump(comparison_summary, f, indent=2, cls=NpEncoder)
                print(f"Comparison summary metrics saved to: {comparison_summary_path}")
            except Exception as e:
                print(f"❌ Error saving comparison summary: {e}")

        html_report_path = self.generate_html_report(
            comparison_summary_data=comparison_summary,
            all_results_data=self.results,
            plot_paths_dict=plot_paths,
            base_report_dir=output_dir
        )

        final_report_info = {
            'report_directory': output_dir, 'all_results_json': all_results_path,
            'comparison_summary_json': comparison_summary_path,
            'plot_paths_dict': plot_paths, 'html_report_path': html_report_path
        }
        print(f"\n✅ Comprehensive benchmark report generated successfully in {output_dir}")
        return final_report_info

    def generate_html_report(self, comparison_summary_data, all_results_data, plot_paths_dict, base_report_dir):
        """
        Write ``benchmark_report.html`` into ``base_report_dir``.

        The page contains the metrics table (best value per metric highlighted),
        the plots referenced by ``plot_paths_dict`` (only files that exist), and
        one collapsible classification report per model. All dynamic text is
        HTML-escaped; image sources are paths relative to ``base_report_dir``.

        Args:
            comparison_summary_data: output of ``compare_models`` (may be None).
            all_results_data: ``{model_name: result_dict}``.
            plot_paths_dict: plot key -> file path (or list of paths for
                ``'confusion_matrices_list'``); may be None or empty.
            base_report_dir: existing directory to write the report into.

        Returns:
            Path of the written HTML file, or None if there was nothing to
            report or the file could not be written.
        """
        # Local import: `html` is not part of the notebook's import cell.
        from html import escape

        if not all_results_data:
            print("Cannot generate HTML: No all_results_data provided.")
            return None

        plot_paths_dict = plot_paths_dict or {}
        output_html_path = os.path.join(base_report_dir, "benchmark_report.html")

        def esc(value):
            """HTML-escape any value (quotes included, so it is attribute-safe)."""
            return escape(str(value), quote=True)

        def img_src(plot_path):
            """Image URL relative to the report directory, using '/' separators."""
            try:
                rel_path = os.path.relpath(plot_path, base_report_dir)
            except ValueError:
                # e.g. different drives on Windows: fall back to the default layout.
                rel_path = os.path.join("plots", os.path.basename(plot_path))
            return esc(rel_path.replace(os.sep, "/"))

        css = (
            "body { font-family: Arial, sans-serif; margin: 20px; line-height: 1.6; color: #333; } "
            "h1, h2, h3 { color: #2c3e50; border-bottom: 1px solid #bdc3c7; padding-bottom: 5px; } "
            "h1 { text-align: center; } "
            "table { border-collapse: collapse; width: 95%; margin: 20px auto; box-shadow: 0 0 5px #ccc; } "
            "th, td { border: 1px solid #ddd; padding: 8px; text-align: left; } "
            "th { background-color: #3498db; color: white; } "
            "tr:nth-child(even) { background-color: #f8f9f9; } "
            ".plot-container { text-align: center; margin:15px 0; padding:10px; border:1px solid #eee; border-radius:4px;} "
            ".plot { max-width:80%; height:auto; border:1px solid #ccc;} "
            ".section { margin-bottom:30px; padding:15px; background-color:#f4f6f6; border-radius:5px;} "
            ".highlight { background-color: #fff3cd; font-weight: bold; } "
            "details > summary { cursor: pointer; font-weight: bold; color: #2980b9; margin-bottom: 5px;}"
        )
        generated_on = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        parts = [
            '<!DOCTYPE html><html lang="en"><head><meta charset="UTF-8">'
            '<title>IDS-AI Model Benchmarking Report</title>\n',
            "<style> " + css + "</style></head><body>\n",
            f"<h1>IDS-AI Model Benchmarking Report</h1><p>Generated on: {generated_on}</p>",
        ]

        # --- Metrics table ---
        if comparison_summary_data and 'metrics_data' in comparison_summary_data:
            metrics_data = comparison_summary_data['metrics_data']
            best_summary = comparison_summary_data.get('best_models_summary', {})
            two_decimal_metrics = ('prediction_time_per_sample_ms', 'memory_increase_mb')
            metric_headers = sorted(metrics_data.keys())

            parts.append("<div class='section'><h2>Overall Performance Metrics</h2><table><tr><th>Model</th>")
            for mh in metric_headers:
                parts.append(f"<th>{esc(mh.replace('_', ' ').title())}</th>")
            parts.append("</tr>")
            for mn in sorted(all_results_data.keys()):
                parts.append(f"<tr><td>{esc(mn)}</td>")
                for mh in metric_headers:
                    val = metrics_data[mh].get(mn, 'N/A')
                    if isinstance(val, float):
                        fmt_val = f"{val:.2f}" if mh in two_decimal_metrics else f"{val:.4f}"
                    else:
                        fmt_val = esc(val)
                    cell_class = 'highlight' if best_summary.get(mh, {}).get('model') == mn else ''
                    parts.append(f"<td class='{cell_class}'>{fmt_val}</td>")
                parts.append("</tr>")
            parts.append("</table></div>")

        # --- Plots (sections appear in this fixed order) ---
        plot_sections = [
            ('performance', "Performance Metrics Comparison"),
            ('latency', "Prediction Latency Comparison"),
            ('memory', "Memory Usage Increase Comparison"),
            ('roc_curves_combined', "ROC Curves (Combined)"),
            ('confusion_matrices_list', "Confusion Matrices"),
        ]
        for plot_key, title in plot_sections:
            plot_item = plot_paths_dict.get(plot_key)
            if not plot_item:
                continue
            parts.append(f"<div class='section'><h2>{title}</h2>")
            if plot_key == 'confusion_matrices_list' and isinstance(plot_item, list):
                for cm_abs_path in plot_item:
                    if not (cm_abs_path and os.path.exists(cm_abs_path)):
                        continue
                    # Recover a display name from "cm_<model>.png"; strip only the
                    # leading prefix so names containing "cm_" stay intact.
                    stem = os.path.splitext(os.path.basename(cm_abs_path))[0]
                    if stem.startswith('cm_'):
                        stem = stem[len('cm_'):]
                    model_name_from_cm = esc(stem.replace('_', ' ').title())
                    parts.append(
                        f"<h3>{model_name_from_cm}</h3><div class='plot-container'>"
                        f"<img class='plot' src='{img_src(cm_abs_path)}' alt='CM {model_name_from_cm}'></div>"
                    )
            elif isinstance(plot_item, str) and os.path.exists(plot_item):
                parts.append(
                    f"<div class='plot-container'><img class='plot' src='{img_src(plot_item)}' alt='{esc(title)}'></div>"
                )
            parts.append("</div>")

        # --- Per-model classification reports ---
        parts.append("<div class='section'><h2>Detailed Classification Reports</h2>")
        for model_name, res_data in all_results_data.items():
            parts.append(
                f"<details><summary>{esc(model_name)}</summary><table><tr><th>Class/Metric</th>"
                "<th>Precision</th><th>Recall</th><th>F1-Score</th><th>Support</th></tr>"
            )
            report = res_data.get('classification_report') if isinstance(res_data, dict) else None
            if isinstance(report, dict):
                for lbl, m_dict in report.items():
                    # Scalar entries (e.g. overall accuracy) are not per-class rows.
                    if not isinstance(m_dict, dict):
                        continue
                    parts.append(
                        f"<tr><td>{esc(lbl)}</td>"
                        f"<td>{m_dict.get('precision', 0.0):.4f}</td>"
                        f"<td>{m_dict.get('recall', 0.0):.4f}</td>"
                        f"<td>{m_dict.get('f1-score', 0.0):.4f}</td>"
                        f"<td>{esc(m_dict.get('support', ''))}</td></tr>"
                    )
            parts.append("</table></details>")
        parts.append("</div></body></html>")

        try:
            with open(output_html_path, 'w', encoding='utf-8') as f:
                f.write("".join(parts))
            print(f"✅ HTML report saved to {output_html_path}")
            return output_html_path
        except Exception as e:
            print(f"❌ Error saving HTML report: {e}")
            return None

# Helper class for JSON encoding
if 'NpEncoder' not in globals(): # Define only if not already defined (e.g. if running cells out of order)
    class NpEncoder(json.JSONEncoder):
        """JSON encoder that converts NumPy scalars and arrays to built-in Python types."""

        def default(self, obj):
            """Return a JSON-serializable equivalent of ``obj`` or defer to the base class."""
            # np.bool_ is not an np.integer subclass, so it needs its own branch;
            # without it json.dump raises TypeError on NumPy booleans.
            if isinstance(obj, np.bool_): return bool(obj)
            if isinstance(obj, np.integer): return int(obj)
            if isinstance(obj, np.floating): return float(obj)
            if isinstance(obj, np.ndarray): return obj.tolist()
            return super().default(obj)

# --- Test Block for ModelBenchmarker (Part 4: Report Generation) ---
if __name__ == "__main__" and 'google.colab' in sys.modules:
    # Smoke test for report generation: builds a throwaway model and dataset when
    # no usable benchmarker exists, then writes a full report.
    print("\n--- Testing ModelBenchmarker (Part 4: Report Generation) ---")

    # NOTE: this cell redefines ModelBenchmarker, so an instance created by an
    # earlier cell is not an instance of the new class and is rebuilt here.
    if 'benchmarker' not in globals() or not isinstance(benchmarker, ModelBenchmarker):
        print("⚠️ ModelBenchmarker instance not found from previous sections. Re-initializing for this test.")
        # Local imports: the Section 1 import cell only pulls in sklearn.metrics.
        from sklearn.linear_model import LogisticRegression
        from sklearn.preprocessing import LabelEncoder

        # Seeded generator plus alternating labels: the run is reproducible and
        # both classes are always present (a single-class draw would make
        # LogisticRegression.fit fail and would skip the ROC curve).
        rng_s5 = np.random.default_rng(42)

        benchmarker = ModelBenchmarker()
        dummy_model_path_s5 = os.path.join(MODEL_DIR if 'MODEL_DIR' in globals() else ".", "dummy_sklearn_model.pkl")
        if not os.path.exists(dummy_model_path_s5): # Ensure dummy model pkl exists for test
            temp_model_sk = LogisticRegression()
            temp_model_sk.fit(rng_s5.random((10, 2)), np.arange(10) % 2)
            os.makedirs(os.path.dirname(dummy_model_path_s5) or ".", exist_ok=True)
            with open(dummy_model_path_s5, 'wb') as f:
                pickle.dump(temp_model_sk, f)
            print(f"Created dummy model for report test: {dummy_model_path_s5}")
        benchmarker.load_model(dummy_model_path_s5, "DummyModelForReport")

        # Create minimal dummy results if none exist, for the loaded dummy model
        if not benchmarker.results and "DummyModelForReport" in benchmarker.models:
            print("Creating dummy benchmark results for 'DummyModelForReport' for report generation test...")
            # test_data must match the 2-feature input the dummy model was fitted on.
            dummy_X_report = rng_s5.random((20, 2))
            dummy_y_report_raw = np.array(['Normal', 'Attack'] * 10)
            le_rep = LabelEncoder()
            benchmarker.test_data = {
                'X': dummy_X_report,
                'y': le_rep.fit_transform(dummy_y_report_raw),
                'label_encoder': le_rep,
                'feature_names': ['dummy_feat1', 'dummy_feat2']
            }
            benchmarker.benchmark_model("DummyModelForReport") # Benchmark this specific model

    if benchmarker.results:
        print("\n-- Generating Full Comparison Report --")
        # Use RESULTS_DIR when it is defined and exists, otherwise a local fallback folder.
        report_output_base_dir = RESULTS_DIR if 'RESULTS_DIR' in globals() and os.path.exists(RESULTS_DIR) else "benchmark_report_output_s5_default"
        os.makedirs(report_output_base_dir, exist_ok=True)

        # Pass the directory explicitly so the fallback folder created above is
        # actually used instead of being left empty while the report lands in ".".
        report_output_dir = os.path.join(
            report_output_base_dir,
            f"model_benchmark_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        )
        report_info = benchmarker.generate_comparison_report(output_dir=report_output_dir)

        if report_info and report_info.get('html_report_path'):
            print(f"Full report generated in directory: {report_info['report_directory']}")
            print(f"HTML report should be viewable at: {report_info['html_report_path']}")
            # To display the report inline in Colab (optional):
            #   from IPython.display import HTML, display
            #   display(HTML(filename=report_info['html_report_path']))
        else:
            print("⚠️ Report generation failed or produced no HTML output path.")
    else:
        print("⚠️ No benchmark results available in benchmarker. Cannot generate report.")

    print("\n--- End of ModelBenchmarker (Part 4) Test ---")

print("\n✅ Section 5 (Benchmarking - `ModelBenchmarker` Class - Part 4: Report Generation) is ready.")


--- Testing ModelBenchmarker (Part 4: Report Generation) ---
⚠️ ModelBenchmarker instance not found from previous sections. Re-initializing for this test.
Creating dummy benchmark results for 'DummyModelForReport' for report generation test...

-- Generating Full Comparison Report --

Generating comparison report in: /content/drive/MyDrive/IDS_AI_Suite/model_benchmarking_outputs/benchmark_outputs/model_benchmark_report_20250529_010407
Plot saved to /content/drive/MyDrive/IDS_AI_Suite/model_benchmarking_outputs/benchmark_outputs/model_benchmark_report_20250529_010407/plots/performance_metrics_comparison.png
Plot saved to /content/drive/MyDrive/IDS_AI_Suite/model_benchmarking_outputs/benchmark_outputs/model_benchmark_report_20250529_010407/plots/latency_comparison.png
Plot saved to /content/drive/MyDrive/IDS_AI_Suite/model_benchmarking_outputs/benchmark_outputs/model_benchmark_report_20250529_010407/plots/memory_usage_comparison.png
ROC plot saved: /content/drive/MyDrive/IDS_AI_Suite/mo

In [21]:
# Imports from Section 1 should still be in effect.
# ModelBenchmarker class (all parts) should be defined from previous sections.
# Global directories (BASE_DIR, RESULTS_DIR, etc.) should also be defined.

def main():
    """
    Run the end-to-end model comparison and benchmarking workflow.

    Steps:
      1. Build the simulated argument set (stands in for argparse in Colab).
      2. Load the test data into a ``ModelBenchmarker``.
      3. Load every requested model; if none could be loaded, fall back to a
         small dummy scikit-learn model so the script structure can still be
         exercised.
      4. Benchmark all loaded models.
      5. Generate the comparison report.

    Relies on ``ModelBenchmarker`` and ``RESULTS_DIR`` (and, optionally,
    ``MODEL_DIR``) having been defined by earlier notebook sections.

    Returns:
        None. Progress and failures are reported via ``print``; the function
        returns early when a required input is missing.
    """
    # For Colab, we simulate command-line arguments.
    # In a standalone .py script, argparse would parse them.
    class SimulatedBenchmarkingArgs:
        def __init__(self):
            # --- Paths to your trained models ---
            # You'll need to provide paths to the models you want to compare.
            # Example: Your LSTM model and the dummy sklearn model
            self.models = [
                "/content/drive/MyDrive/Colab Notebooks/results/lstm_model.h5", # Path to your LSTM model
                # Add paths to other models, e.g., the dummy sklearn model we created:
                # os.path.join(MODEL_DIR, "dummy_sklearn_model.pkl") # MODEL_DIR from Section 1
            ]
            # For testing, ensure dummy_sklearn_model.pkl exists in MODEL_DIR or provide its full path

            # --- Path to your test dataset ---
            self.data = "/content/drive/MyDrive/Colab Notebooks/datasets/ML-EdgeIIoT-dataset.csv"

            # --- Feature columns and target column for your dataset ---
            self.X_columns = [ # Based on your MLP_LSTM.ipynb
                'arp.hw.size', 'http.content_length', 'http.response', 'http.tls_port',
                'tcp.ack_raw', 'tcp.checksum', 'tcp.connection.fin', 'tcp.connection.rst',
                'tcp.connection.syn', 'tcp.connection.synack', 'tcp.dstport', 'tcp.flags.ack',
                'tcp.len', 'udp.stream', 'udp.time_delta', 'dns.qry.qu', 'dns.qry.type',
                'dns.retransmission', 'dns.retransmit_request', 'dns.retransmit_request_in',
                'mqtt.conflag.cleansess', 'mqtt.hdrflags', 'mqtt.len', 'mqtt.msg_decoded_as',
                'mbtcp.len', 'mbtcp.trans_id', 'mbtcp.unit_id'
            ]
            self.y_column = 'Attack_label' # Target column for ML-EdgeIIoT-dataset.csv

            # --- Path to your saved scaler (from training the models) ---
            self.scaler_path = "/content/drive/MyDrive/Colab Notebooks/results/scaler.pkl"

            # --- Output directory for results ---
            # If None, ModelBenchmarker.generate_comparison_report will create a timestamped one
            self.output_dir = None
            # Example: self.output_dir = os.path.join(RESULTS_DIR, "my_benchmark_run_1")

            # --- Flags ---
            # The original script had --compare-fl and related args.
            # This simplified main function focuses on the ModelBenchmarker's primary role.
            # If FL vs CL comparison is needed here, it would require integrating
            # the compare_federated_vs_centralized function and its specific args.
            # For now, we focus on benchmarking a list of pre-trained models.

    args = SimulatedBenchmarkingArgs()

    print("--- Model Benchmarking System ---")
    print(f"Models to benchmark: {args.models}")
    print(f"Test dataset: {args.data}")
    # Do not print a made-up path: when output_dir is None the report generator
    # picks its own timestamped directory name.
    report_dir_description = args.output_dir or f"<auto: timestamped directory under {RESULTS_DIR}>"
    print(f"Output directory for report: {report_dir_description}")

    benchmarker = ModelBenchmarker()

    # 1. Load Test Data
    if not args.data or not os.path.exists(args.data):
        print(f"❌ Test data file not found at '{args.data}'. Cannot proceed with benchmarking.")
        return

    # A configured-but-missing scaler is worth a warning: the data is then
    # loaded without it, which may not match how the models were trained.
    scaler_path = args.scaler_path or None
    if scaler_path and not os.path.exists(scaler_path):
        print(f"⚠️ Scaler file not found at '{scaler_path}'. Test data will be loaded without a saved scaler.")
        scaler_path = None

    data_loaded = benchmarker.load_test_data(
        data_path=args.data,
        X_columns=args.X_columns,
        y_column=args.y_column,
        scaler_path=scaler_path,
        test_only=True # Assume the provided data is a dedicated test set
    )
    if not data_loaded or benchmarker.test_data is None:
        print("❌ Failed to load or process test data. Aborting benchmark.")
        return

    # 2. Load Models
    models_loaded_count = _load_requested_models(benchmarker, args.models)

    if models_loaded_count == 0:
        # Not an abort yet: a dummy model is tried first so the rest of the
        # pipeline can still be exercised structurally.
        print("❌ No models were successfully loaded.")
        print("Attempting to load/create a dummy model for structural testing...")
        if not _load_dummy_fallback_model(benchmarker):
            return

    # 3. Benchmark All Loaded Models
    if benchmarker.models and benchmarker.test_data:
        print("\n--- Starting Benchmarking Process ---")
        benchmarker.benchmark_all_models()
    else:
        print("❌ Cannot start benchmarking: No models loaded or no test data available.")
        return

    # 4. Generate Comparison Report
    if benchmarker.results:
        print("\n--- Generating Final Comparison Report ---")
        benchmarker.generate_comparison_report(output_dir=args.output_dir)
    else:
        print("ℹ️ No benchmark results were generated. Report cannot be created.")

    print("\n🏁 Model Benchmarking Main Execution Finished.")


def _load_requested_models(benchmarker, model_paths):
    """Load each existing model file into ``benchmarker`` and return how many were registered.

    Args:
        benchmarker: Object exposing ``load_model(path)`` and a ``models`` container.
        model_paths: Iterable of model file paths (may be empty or None).

    Returns:
        int: Number of paths for which a model ended up in ``benchmarker.models``.
    """
    loaded_count = 0
    for model_path in model_paths or []:
        if not (model_path and os.path.exists(model_path)):
            print(f"⚠️ Model path specified but not found: {model_path}")
            continue

        registered_before = len(benchmarker.models)
        benchmarker.load_model(model_path) # Model name will be derived from filename

        # Success is detected either by the expected derived name or by the
        # registry having grown, so a different name-derivation rule inside
        # load_model does not make a loaded model look like a failure.
        expected_name = os.path.basename(model_path).split('.')[0]
        if expected_name in benchmarker.models or len(benchmarker.models) > registered_before:
            loaded_count += 1
    return loaded_count


def _load_dummy_fallback_model(benchmarker):
    """Create (if absent) and load a 2-feature dummy sklearn model; return True on success.

    The dummy is registered as "DummySklearnForMain". It is trained on two
    features, so benchmarking it against test data with a different feature
    count is expected to fail for that model.
    """
    dummy_model_path = os.path.join(MODEL_DIR if 'MODEL_DIR' in globals() else ".", "dummy_main_sklearn_model.pkl")

    if not os.path.exists(dummy_model_path):
        try:
            from sklearn.linear_model import LogisticRegression
            dummy_model = LogisticRegression()
            # Alternating 0/1 labels guarantee both classes are present; purely
            # random labels can all be identical, which makes fit() raise.
            dummy_model.fit(np.random.rand(10, 2), np.arange(10) % 2)
            with open(dummy_model_path, 'wb') as f:
                pickle.dump(dummy_model, f)
        except Exception as e_create_dummy:
            print(f"Error creating dummy model: {e_create_dummy}")

    if os.path.exists(dummy_model_path):
        benchmarker.load_model(dummy_model_path, "DummySklearnForMain")
        if "DummySklearnForMain" in benchmarker.models:
            print(f"Loaded DummySklearnForMain. Test data features might not match its training (2 features).")
            # For the dummy model, if the main test_data has many features, benchmarking will fail for it.
            # This script assumes all models benchmarked can use the same self.test_data.
            return True

    print("Could not load/create even a dummy model. Benchmark cannot proceed.")
    return False

# Cell entry point: main() is only invoked when this runs as the top-level
# module inside a Google Colab runtime.
if __name__ == "__main__" and 'google.colab' in sys.modules:
    print("\n--- Running main() for Model Comparison and Benchmarking ---")

    # Names that earlier notebook sections must already have placed in globals().
    essential_items = ['ModelBenchmarker', 'BASE_DIR', 'RESULTS_DIR', 'MODEL_DIR', 'PLOTS_DIR', 'DATA_DIR']
    all_defined = True
    for item_name in essential_items:
        if item_name in globals():
            continue
        # Only the first missing prerequisite is reported; checking stops there.
        print(f"🔴 CRITICAL ERROR for main(): Required item '{item_name}' is not defined. "
              "Ensure all previous code sections (1-5) ran successfully.")
        all_defined = False
        break

    if not all_defined:
        print("\n❌ Main function execution aborted due to missing definitions from previous sections.")
    else:
        main()

# Printed unconditionally, whether or not main() was executed above.
print("\n✅ Section 6 (Benchmarking - Main Execution Block) is ready.")


--- Running main() for Model Comparison and Benchmarking ---
--- Model Benchmarking System ---
Models to benchmark: ['/content/drive/MyDrive/Colab Notebooks/results/lstm_model.h5']
Test dataset: /content/drive/MyDrive/Colab Notebooks/datasets/ML-EdgeIIoT-dataset.csv
Output directory for report (if None, timestamped dir created): /content/drive/MyDrive/IDS_AI_Suite/model_benchmarking_outputs/benchmark_outputs/timestamped_report





--- Starting Benchmarking Process ---

--- Generating Final Comparison Report ---

Generating comparison report in: /content/drive/MyDrive/IDS_AI_Suite/model_benchmarking_outputs/benchmark_outputs/model_benchmark_report_20250529_010430
Plot saved to /content/drive/MyDrive/IDS_AI_Suite/model_benchmarking_outputs/benchmark_outputs/model_benchmark_report_20250529_010430/plots/performance_metrics_comparison.png
Plot saved to /content/drive/MyDrive/IDS_AI_Suite/model_benchmarking_outputs/benchmark_outputs/model_benchmark_report_20250529_010430/plots/latency_comparison.png
Plot saved to /content/drive/MyDrive/IDS_AI_Suite/model_benchmarking_outputs/benchmark_outputs/model_benchmark_report_20250529_010430/plots/memory_usage_comparison.png
ROC plot saved: /content/drive/MyDrive/IDS_AI_Suite/model_benchmarking_outputs/benchmark_outputs/model_benchmark_report_20250529_010430/plots/roc_curves_comparison.png
Detailed benchmark results saved to: /content/drive/MyDrive/IDS_AI_Suite/model_benchmarki