In [1]:

"""Dashboard

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1xQzHjBGJvAS2lhIWQbs1lXpHJW4UiDkY
"""

# NOTE: The rt_sensor_simulator.py and rt_attack_simulator.py definitions are omitted here for
# brevity, because the Streamlit app below does not call their generators directly.
# Integrating them as live data sources would be a separate, additional step.

# ... (keep the rt_sensor_simulator.py and rt_attack_simulator.py code from your own file here) ...

# dashboard.py (main Streamlit app)

# 1. Reset any existing ngrok configuration so a stale config file cannot interfere
#    (removes the config directory, then recreates it empty).
!rm -rf /root/.config/ngrok # This is a shell command, typically run in a notebook cell
!mkdir -p /root/.config/ngrok # This is a shell command

# 2. Set up your ngrok authtoken
# Replace YOUR_NGROK_AUTH_TOKEN with your actual token (do not share the notebook with a real token in it).
# NOTE(review): this step needs the `ngrok` binary on PATH. The recorded output of this cell shows
# "ngrok: command not found", so ngrok has to be installed first if the tunnel is actually needed;
# the cell otherwise falls back to the Colab port proxy at the end.
!ngrok config add-authtoken YOUR_NGROK_AUTH_TOKEN


# 4. Create the enhanced Streamlit app file with CSV data pipeline
import os
app_content = """
import os
import time
from collections import defaultdict, deque
from datetime import datetime

import altair as alt
import joblib # For loading scikit-learn models
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import streamlit as st
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import MinMaxScaler # For LSTM, original scaler

# Conditional import for TensorFlow.
# TENSORFLOW_AVAILABLE records whether Keras' load_model could be imported; the LSTM loading
# code further down checks this flag and skips the LSTM model when it is False.
try:
    from tensorflow.keras.models import load_model
    TENSORFLOW_AVAILABLE = True
except ImportError:
    # Only a missing package is tolerated here; the user is told via the sidebar.
    TENSORFLOW_AVAILABLE = False
    st.sidebar.warning("TensorFlow/Keras not found. LSTM model will be unavailable.")

# --- Configuration ---
# Where the trained models, the fitted scaler and the dataset live on the mounted Drive.
BASE_PATH = "/content/drive/MyDrive/Colab Notebooks/" # Adjust if your Drive is mounted differently
RESULTS_PATH = os.path.join(BASE_PATH, "results")
DATA_PATH = os.path.join(BASE_PATH, "datasets/ML-EdgeIIoT-dataset.csv")

# Scikit-learn models: display name -> serialized model file inside RESULTS_PATH.
SKLEARN_MODEL_FILES = {
    display_name: os.path.join(RESULTS_PATH, file_name)
    for display_name, file_name in (
        ("RandomForest", "randomforest_model.joblib"),
        ("DecisionTree", "decisiontree_model.joblib"),
        ("LogisticRegression", "logisticregression_model.joblib"),
        ("XGBoost", "xgboost_model.joblib"),
        ("SVM", "svm_model.joblib"), # Assuming you have an SVM model saved
    )
}
# Scaler shared by all scikit-learn models.
COMMON_SCALER_PATH = os.path.join(RESULTS_PATH, "common_scaler.joblib")

# Trained LSTM model (kept directly under BASE_PATH, not under RESULTS_PATH).
LSTM_MODEL_PATH = os.path.join(BASE_PATH, "lstm_model.h5")

# Features: CRITICAL - this list (and its order) MUST match what the models were trained on.
# These are the 27 features used for the scikit-learn model training. The LSTM model might
# need a different set or preprocessing; this example assumes the same 27 work for every model.
FEATURE_COLUMNS = [
    # ARP
    'arp.hw.size',
    # HTTP
    'http.content_length',
    'http.response',
    'http.tls_port',
    # TCP
    'tcp.ack_raw',
    'tcp.checksum',
    'tcp.connection.fin',
    'tcp.connection.rst',
    'tcp.connection.syn',
    'tcp.connection.synack',
    'tcp.dstport',
    'tcp.flags.ack',
    'tcp.len',
    # UDP
    'udp.stream',
    'udp.time_delta',
    # DNS
    'dns.qry.qu',
    'dns.qry.type',
    'dns.retransmission',
    'dns.retransmit_request',
    'dns.retransmit_request_in',
    # MQTT
    'mqtt.conflag.cleansess',
    'mqtt.hdrflags',
    'mqtt.len',
    'mqtt.msg_decoded_as',
    # Modbus/TCP
    'mbtcp.len',
    'mbtcp.trans_id',
    'mbtcp.unit_id',
]
LSTM_SEQUENCE_LENGTH = 1 # Example: LSTM looks at one timestep at a time

# --- Session State Initialization ---
# Everything the dashboard accumulates lives under st.session_state and is created only
# when the key is not there yet, so existing values are never overwritten.
if 'data' not in st.session_state:
    st.session_state['data'] = dict(
        events=deque(maxlen=500),            # Recent raw samples
        predictions_log=deque(maxlen=500),   # Dicts with {'actual':, 'predicted':, 'proba':}
        attack_stats=defaultdict(int),       # Counts for attack types
        model_metrics=dict(tp=0, fp=0, tn=0, fn=0),
        common_scaler_sklearn=None,          # Scaler for scikit-learn models
        lstm_scaler=MinMaxScaler(),          # Separate scaler for LSTM if needed, or use common one
        data_iterator=None,
        selected_model_name=None,
        loaded_models={},                    # Cache for loaded model objects
    )

# Flag used by the LSTM example to fit its scaler only once.
if 'page_loaded' not in st.session_state:
    st.session_state['page_loaded'] = False


# --- Model and Scaler Loading ---
@st.cache_resource
def load_sklearn_models_and_scaler():
    '''Load every available scikit-learn model plus the shared scaler from disk.

    Each entry of SKLEARN_MODEL_FILES is tried independently; a missing or unreadable
    file is reported in the sidebar and skipped.

    Returns:
        tuple: (models, scaler) where models maps display name -> loaded model and
        scaler is the object loaded from COMMON_SCALER_PATH, or None when that file
        is missing or fails to load.
    '''
    # NOTE: joblib.load unpickles the file, so only trusted model files should be used.
    loaded = {}
    for model_name, model_path in SKLEARN_MODEL_FILES.items():
        if not os.path.exists(model_path):
            st.sidebar.warning(f"Scikit-learn model not found: {model_path}")
            continue
        try:
            loaded[model_name] = joblib.load(model_path)
        except Exception as load_error:
            st.sidebar.error(f"Failed to load {model_name}: {load_error}")

    if not os.path.exists(COMMON_SCALER_PATH):
        st.sidebar.warning(f"Common Scaler not found: {COMMON_SCALER_PATH}")
        return loaded, None

    try:
        fitted_scaler = joblib.load(COMMON_SCALER_PATH)
        st.sidebar.success("✅ Common Scaler (for scikit-learn) loaded.")
    except Exception as load_error:
        st.sidebar.error(f"Failed to load Common Scaler: {load_error}")
        return loaded, None
    return loaded, fitted_scaler

@st.cache_resource
def load_lstm_model_keras():
    '''Load the Keras LSTM model from LSTM_MODEL_PATH.

    Returns:
        The loaded model, or None when TensorFlow is unavailable, the file does not
        exist, or loading raises. The outcome is reported in the sidebar in the
        last two cases and on success.
    '''
    if not TENSORFLOW_AVAILABLE:
        return None

    if not os.path.exists(LSTM_MODEL_PATH):
        st.sidebar.warning(f"LSTM Model not found: {LSTM_MODEL_PATH}")
        return None

    try:
        keras_model = load_model(LSTM_MODEL_PATH)
        st.sidebar.success("✅ LSTM Model loaded.")
    except Exception as load_error:
        st.sidebar.error(f"LSTM Model loading failed: {load_error}")
        return None
    return keras_model

# Copy the loader results into session state ("loaded_models" and the shared scaler)
sklearn_models_loaded, common_scaler_loaded = load_sklearn_models_and_scaler()
if common_scaler_loaded:
    st.session_state.data['common_scaler_sklearn'] = common_scaler_loaded
st.session_state.data['loaded_models'].update(sklearn_models_loaded)

# The LSTM entry is only added when TensorFlow imported and the model actually loaded.
if TENSORFLOW_AVAILABLE:
    lstm_model_loaded = load_lstm_model_keras()
    if lstm_model_loaded:
        st.session_state.data['loaded_models']['LSTM'] = lstm_model_loaded


# --- Data Pipeline ---
class EdgeIIoTDataIterator:
    '''Endless iterator over the Edge-IIoT CSV that yields one sample dict per row.

    The CSV is read once in the constructor. Every requested feature column is coerced
    to numeric (missing columns and unparsable values become 0) and a binary
    is_attack column is derived from Attack_label. Iteration wraps around to the
    first row when the end of the dataset is reached.

    Attributes:
        feature_columns: names of the feature columns copied into every sample.
        df: the full frame as loaded from the CSV (only set when loading succeeded).
        df_processed: frame holding only the feature columns and is_attack; an empty
            frame when the CSV could not be loaded or processed.
        current_idx: position of the next row to yield.
    '''

    def __init__(self, file_path, feature_columns):
        '''Load and preprocess the dataset.

        Args:
            file_path: path of the CSV file.
            feature_columns: sequence of column names to expose as model features.

        Load and processing failures are reported through Streamlit instead of being
        raised; the iterator is then empty and the first next() raises StopIteration.
        '''
        self.feature_columns = feature_columns
        self.current_idx = 0
        try:
            # For simplicity everything is loaded and the needed columns selected afterwards.
            # Optimize (for example with usecols) if memory becomes an issue.
            self.df = pd.read_csv(file_path, low_memory=False)
            st.sidebar.info(f"Raw CSV loaded with {len(self.df)} rows, {len(self.df.columns)} columns.")

            # Preprocessing: make every feature column numeric and free of NaNs.
            for col in self.feature_columns:
                if col not in self.df.columns:
                    st.warning(f"Feature column '{col}' not found in CSV! Filling with 0.")
                    self.df[col] = 0
                else:
                    self.df[col] = pd.to_numeric(self.df[col], errors='coerce').fillna(0)

            if 'Attack_label' in self.df.columns:
                # Compare numerically: a float-typed column holds 0.0, whose string form
                # is '0.0', so a string comparison against '0' would flag every row as an
                # attack. Anything that is not a numeric zero (including unparsable or
                # missing labels) still counts as an attack, as before.
                numeric_labels = pd.to_numeric(self.df['Attack_label'], errors='coerce')
                self.df['is_attack'] = (~numeric_labels.eq(0)).astype(int)
            else:
                st.warning("'Attack_label' not found. Assuming all data is normal or needs manual labeling.")
                self.df['is_attack'] = 0 # Default to normal if no label

            # Only these columns are needed while iterating.
            self.df_processed = self.df[list(self.feature_columns) + ['is_attack']].copy()
            st.sidebar.success(f"Dataset processed. Using {len(self.feature_columns)} features.")

        except FileNotFoundError:
            st.error(f"Dataset CSV not found at {file_path}. Please check the path.")
            self.df_processed = pd.DataFrame() # Empty df
        except Exception as e:
            st.error(f"Error loading or processing dataset: {str(e)}")
            self.df_processed = pd.DataFrame() # Empty df

    def __iter__(self):
        '''Return the iterator itself.'''
        return self

    def __next__(self):
        '''Return the next sample, wrapping around at the end of the dataset.

        Returns:
            dict with a wall-clock 'timestamp' (HH:MM:SS.mmm), the ground-truth
            'is_attack' flag (int) and one float entry per feature column.

        Raises:
            StopIteration: when no data could be loaded.
        '''
        if self.df_processed.empty:
            st.error("No data to iterate. Stopping simulation.")
            raise StopIteration

        if self.current_idx >= len(self.df_processed):
            self.current_idx = 0 # Loop dataset (silently, to avoid flooding the UI)

        row = self.df_processed.iloc[self.current_idx]
        self.current_idx += 1

        sample = {
            'timestamp': datetime.now().strftime('%H:%M:%S.%f')[:-3],
            'is_attack': int(row['is_attack']), # Ground truth
        }
        sample.update({col: float(row[col]) for col in self.feature_columns})
        return sample

# Create the dataset iterator only when a path is configured and session state holds none yet
if DATA_PATH and st.session_state.data.get('data_iterator') is None:
    st.session_state.data['data_iterator'] = EdgeIIoTDataIterator(DATA_PATH, FEATURE_COLUMNS)

# --- Preprocessing and Prediction Functions ---
def preprocess_and_predict(sample_features_dict, model_name, feature_columns):
    '''Scale one sample and classify it with the selected model.

    Args:
        sample_features_dict: mapping of feature name -> numeric value for one sample.
        model_name: key into st.session_state.data['loaded_models']; "LSTM" selects
            the Keras path, any other name the scikit-learn path.
        feature_columns: ordered feature names used to build the model input.

    Returns:
        tuple: (predicted_class, probability, status). On any failure the result is
        (0, 0.0, message), where the message contains "not loaded" or "error" so the
        caller can detect it. Nothing is raised.
    '''
    current_model = st.session_state.data['loaded_models'].get(model_name)
    if current_model is None:
        return 0, 0.0, "Model not loaded" # Predicted class, probability, status

    try:
        # One row, one column per feature: the 2D shape expected by the scalers.
        features_2d = np.array([[sample_features_dict[col] for col in feature_columns]])

        # Preprocessing
        if model_name == "LSTM":
            lstm_scaler = st.session_state.data['lstm_scaler']
            # Fit lazily on the first sample this scaler sees. Whether it is fitted is
            # read from the scaler itself (scale_ exists only after fit), because the
            # page_loaded flag is already True by the time the first sample arrives and
            # transform() on an unfitted scaler raises.
            # This is still not ideal: the LSTM scaler should be pre-fitted on the
            # training data and loaded, like the scikit-learn scaler.
            if not hasattr(lstm_scaler, 'scale_'):
                lstm_scaler.fit(features_2d)

            model_input = lstm_scaler.transform(features_2d)
            model_input = model_input.reshape(1, LSTM_SEQUENCE_LENGTH, len(feature_columns))
            status_message = "LSTM Preprocessed"
        else: # Scikit-learn models
            scaler_sklearn = st.session_state.data.get('common_scaler_sklearn')
            if scaler_sklearn is None:
                return 0, 0.0, "Scaler for scikit-learn models not loaded"
            model_input = scaler_sklearn.transform(features_2d)
            status_message = "Scikit-learn Preprocessed"

        # Prediction
        if model_name == "LSTM":
            prediction_prob = current_model.predict(model_input, verbose=0)[0][0]
            predicted_class = int(prediction_prob > 0.5)
        else: # Scikit-learn models
            predicted_class = current_model.predict(model_input)[0]
            if hasattr(current_model, "predict_proba"):
                # Probability of the positive class (assuming class 1 is attack)
                prediction_prob = current_model.predict_proba(model_input)[0][1]
            else: # Models like SVM without probability=True
                prediction_prob = 1.0 if predicted_class == 1 else 0.0
        return int(predicted_class), float(prediction_prob), status_message
    except Exception as e:
        # Covers preprocessing as well, so one bad sample cannot crash the dashboard loop.
        return 0, 0.0, f"Prediction error: {str(e)}"

# --- Dashboard UI ---
# NOTE(review): some Streamlit versions require st.set_page_config to be the first Streamlit
# call in the script, but st.sidebar.* is already used above this point. Confirm against the
# installed version.
st.set_page_config(layout="wide", page_title="Enhanced IDS Dashboard")
st.title("🛡️ Edge-IIoT Intrusion Detection System Dashboard")

# Sidebar for controls
st.sidebar.header("⚙️ Controls & Settings")
available_model_names = [
    model_name
    for model_name, model_obj in st.session_state.data['loaded_models'].items()
    if model_obj is not None
]
if not available_model_names:
    # Nothing to predict with: report it and halt the script here.
    st.sidebar.error("No models were loaded successfully!")
    st.stop()

# The first available model is the default selection.
active_model_name = st.sidebar.selectbox("🧠 Select Model:", available_model_names, index=0)
st.session_state.data['selected_model_name'] = active_model_name
st.sidebar.info(f"Active Model: **{active_model_name}**")

update_interval_ms = st.sidebar.slider("⏱️ Update Interval (ms)", 100, 2000, 500)
history_length = st.sidebar.slider("📊 History Length (samples)", 50, 500, 100)

# Rebuild both history buffers when the requested length changed; entries that fit are kept.
if st.session_state.data['events'].maxlen != history_length:
    for log_key in ('events', 'predictions_log'):
        st.session_state.data[log_key] = deque(st.session_state.data[log_key], maxlen=history_length)


# Main layout: charts in the wide left column, statistics and alerts on the right.
col1, col2 = st.columns([2, 1])

with col1:
    st.subheader("🚦 Real-time Traffic & Detections")
    event_plot_placeholder = st.empty()

    st.subheader("📈 Prediction Confidence/Probability")
    confidence_plot_placeholder = st.empty()

with col2:
    st.subheader("📊 Detection Statistics (Recent History)")
    stats_placeholder = st.empty()

    st.subheader("⚠️ Recent Alerts")
    alerts_placeholder = st.empty()

    # fi_placeholder only exists when the active model exposes importances or coefficients;
    # LSTM and SVM are excluded up front.
    if active_model_name not in ["LSTM", "SVM"]:
        model_object_for_fi = st.session_state.data['loaded_models'].get(active_model_name)
        if any(hasattr(model_object_for_fi, attr) for attr in ('feature_importances_', 'coef_')):
            st.subheader(f"Feature Importance ({active_model_name})")
            fi_placeholder = st.empty()


# --- Main Processing Loop ---
# Streams samples from the dataset iterator, classifies each one with the active model and
# redraws the placeholders created in the layout section above.
data_iterator = st.session_state.data.get('data_iterator')

if data_iterator is None:
    st.error("Data iterator not initialized. Please check dataset path and configuration.")
    st.stop()

# Mark page as loaded after initial setup
if not st.session_state.page_loaded:
    st.session_state.page_loaded = True

# Widgets must be created once per script run. Creating this checkbox inside the loop would
# register the same widget again on the second iteration and abort with a duplicate-widget error.
show_pred_status = st.sidebar.checkbox("Show Preprocessing Status", False)
# Single slot for status and error text, overwritten in place instead of appended every tick.
pred_status_placeholder = st.sidebar.empty()

# Feature importances / coefficients do not change while streaming, so render them once.
# fi_placeholder only exists when the layout section created it for the active model.
if active_model_name not in ["LSTM", "SVM"] and 'fi_placeholder' in globals():
    model_obj = st.session_state.data['loaded_models'].get(active_model_name)
    importances = None
    if hasattr(model_obj, 'feature_importances_'):
        importances = model_obj.feature_importances_
    elif hasattr(model_obj, 'coef_'):
        importances = model_obj.coef_[0] # For linear models, take the first (and often only) set of coeffs

    if importances is not None:
        with fi_placeholder.container():
            if len(importances) != len(FEATURE_COLUMNS):
                # The model was trained on a different feature set; the chart would mislabel bars.
                st.warning(
                    f"Model exposes {len(importances)} importances but {len(FEATURE_COLUMNS)} "
                    "features are configured; skipping feature importance chart."
                )
            else:
                fi_df = pd.DataFrame({'feature': FEATURE_COLUMNS, 'importance': importances})
                fi_df = fi_df.sort_values('importance', ascending=False).head(10) # Top 10
                fi_chart = alt.Chart(fi_df).mark_bar().encode(
                    x='importance:Q',
                    y=alt.Y('feature:N', sort='-x')
                ).properties(title="Top Feature Importances", height=250)
                st.altair_chart(fi_chart, use_container_width=True)

# (actual, predicted) -> confusion-matrix counter
OUTCOME_KEYS = {(1, 1): 'tp', (0, 1): 'fp', (0, 0): 'tn', (1, 0): 'fn'}

# Main display loop
while True: # Loop indefinitely for a real-time feel
    try:
        sample = next(data_iterator) # Get next data sample
    except StopIteration:
        st.warning("Data source depleted (if not set to loop). Dashboard will pause.")
        break # Or implement a reset/reload mechanism
    except Exception as e:
        st.error(f"Error fetching data: {e}")
        time.sleep(5) # Wait before retrying
        continue

    # Extract features for model
    sample_features = {k: sample[k] for k in FEATURE_COLUMNS}

    predicted_class, prediction_prob, pred_status = preprocess_and_predict(
        sample_features, active_model_name, FEATURE_COLUMNS
    )

    if "error" in pred_status.lower() or "not loaded" in pred_status.lower():
        # A failed prediction carries placeholder values (class 0, probability 0.0). Logging
        # it would count it as a real "normal" verdict, so the sample is skipped instead.
        pred_status_placeholder.error(pred_status)
        time.sleep(update_interval_ms / 1000)
        continue

    if show_pred_status:
        pred_status_placeholder.caption(pred_status)
    else:
        pred_status_placeholder.empty()

    # Log event and prediction
    st.session_state.data['events'].append(sample)
    st.session_state.data['predictions_log'].append({
        'timestamp': sample['timestamp'],
        'actual': sample['is_attack'],
        'predicted': predicted_class,
        'probability': prediction_prob,
        'model': active_model_name
    })

    # Update confusion matrix stats based on the latest prediction
    outcome_key = OUTCOME_KEYS.get((sample['is_attack'], predicted_class))
    if outcome_key is not None:
        st.session_state.data['model_metrics'][outcome_key] += 1

    # --- Update Visualizations ---
    predictions_log_df = pd.DataFrame(list(st.session_state.data['predictions_log']))

    # Event plot: predicted attack probability over time, with markers for actual and
    # predicted attacks.
    if not predictions_log_df.empty:
        with event_plot_placeholder.container():
            chart_data = predictions_log_df.copy()
            chart_data['time_obj'] = pd.to_datetime(chart_data['timestamp'], format='%H:%M:%S.%f')

            base_chart = alt.Chart(chart_data).encode(x='time_obj:T')

            line_prob = base_chart.mark_line().encode(
                y=alt.Y('probability:Q', title='Attack Probability', scale=alt.Scale(domain=[0, 1])),
                tooltip=['timestamp:N', 'probability:Q', 'actual:N', 'predicted:N']
            ).properties(title="Attack Prediction Probability Over Time")

            # Points for actual attacks
            actual_attacks = base_chart.transform_filter(
                alt.datum.actual == 1
            ).mark_circle(size=100, color='red', opacity=0.7).encode(
                y=alt.Y('probability:Q'),
                tooltip=['timestamp:N', 'actual:N', 'predicted:N']
            )
            # Points for predicted attacks
            predicted_attacks = base_chart.transform_filter(
                alt.datum.predicted == 1
            ).mark_point(shape='diamond', size=80, color='orange', opacity=0.7, filled=True).encode(
                y=alt.Y('probability:Q'),
                tooltip=['timestamp:N', 'actual:N', 'predicted:N']
            )
            st.altair_chart(line_prob + actual_attacks + predicted_attacks, use_container_width=True)

    # Confidence plot: one bar per sample, red above the 0.5 decision threshold.
    if not predictions_log_df.empty:
        with confidence_plot_placeholder.container():
            conf_df = predictions_log_df[['timestamp', 'probability']].copy()
            conf_df['time_obj'] = pd.to_datetime(conf_df['timestamp'], format='%H:%M:%S.%f')
            conf_chart = alt.Chart(conf_df).mark_bar().encode(
                x='time_obj:T',
                y=alt.Y('probability:Q', title='Prediction Probability', scale=alt.Scale(domain=(0,1))),
                color=alt.condition(
                    alt.datum.probability > 0.5,
                    alt.value('red'),  # Color for attack
                    alt.value('green') # Color for normal
                )
            ).properties(title="Prediction Confidence Scores", height=200)
            st.altair_chart(conf_chart, use_container_width=True)

    # Stats (Confusion Matrix and Metrics)
    with stats_placeholder.container():
        tp = st.session_state.data['model_metrics']['tp']
        fp = st.session_state.data['model_metrics']['fp']
        tn = st.session_state.data['model_metrics']['tn']
        fn = st.session_state.data['model_metrics']['fn']

        st.write("**Confusion Matrix (Overall for session):**")
        cm_data = [[tn, fp], [fn, tp]]
        cm_df = pd.DataFrame(cm_data, columns=['Predicted Normal', 'Predicted Attack'], index=['Actual Normal', 'Actual Attack'])
        st.table(cm_df)

        # Every ratio falls back to 0 while its denominator is still empty.
        accuracy = (tp + tn) / (tp + tn + fp + fn) if (tp + tn + fp + fn) > 0 else 0
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        st.metric("Accuracy", f"{accuracy:.2%}")
        col_m1, col_m2, col_m3 = st.columns(3)
        col_m1.metric("Precision", f"{precision:.2%}")
        col_m2.metric("Recall (Sensitivity)", f"{recall:.2%}")
        col_m3.metric("F1-Score", f"{f1:.2%}")

    # Recent Alerts
    with alerts_placeholder.container():
        if not predictions_log_df.empty:
            recent_alerts_df = predictions_log_df[predictions_log_df['predicted'] == 1].tail(5)
            st.write("**Recent Detected Attacks:**")
            if not recent_alerts_df.empty:
                st.dataframe(recent_alerts_df[['timestamp', 'probability', 'actual']], use_container_width=True)
            else:
                st.info("No attacks detected in recent history.")

    time.sleep(update_interval_ms / 1000) # Control update speed

"""

# Write the app content to app.py
# UTF-8 is requested explicitly: the app source contains emoji, and relying on the platform's
# default encoding can fail or garble them where that default is not UTF-8.
with open('app.py', 'w', encoding='utf-8') as f:
    f.write(app_content)

# 5. Run Streamlit with proper ngrok configuration (ensure ngrok token is set above)
import threading
import time
# from pyngrok import ngrok # pyngrok can be problematic in some Colab setups, using direct proxy

def run_streamlit():
    """Restart the Streamlit server for app.py on port 8501.

    Any process whose command line matches "streamlit" is killed first to avoid port
    conflicts, then the server is started headless with CORS and XSRF protection
    disabled. The call blocks until the server process exits, so it is meant to be
    run on a background thread.
    """
    # Local import keeps this block self-contained; commands are passed as argument
    # lists so no shell is involved in interpreting them.
    import subprocess

    try:
        # Best effort: a non-zero exit just means nothing was running.
        subprocess.run(['pkill', '-f', 'streamlit'], check=False)
    except FileNotFoundError:
        pass  # pkill is not installed; continue and let Streamlit report a busy port
    time.sleep(2)  # give the old process time to release the port
    subprocess.run(
        [
            'streamlit', 'run', 'app.py',
            '--server.port', '8501',
            '--server.headless', 'true',
            '--server.enableCORS', 'false',
            '--server.enableXsrfProtection', 'false',
        ],
        check=False,
    )

# Run the Streamlit server on a daemon thread so this cell can continue
thread = threading.Thread(target=run_streamlit, daemon=True)
thread.start()
time.sleep(5)  # Wait for Streamlit to initialize

# Expose the app through the Colab port proxy rather than an ngrok tunnel
# (ngrok can be flaky or require more setup)
from google.colab.output import eval_js
dashboard_url = eval_js('google.colab.kernel.proxyPort(8501)')
print(f"\n📊 Access your dashboard at: {dashboard_url}")
print("If the link doesn't work, try the ngrok section below (uncomment and configure).")

# # Alternative: ngrok (if Colab proxy fails or for external access)
# # Ensure your ngrok authtoken is configured earlier in the notebook
# # from pyngrok import ngrok, conf
# # try:
# #     public_url = ngrok.connect(addr='8501', proto='http')
# #     print(f"\n✅ Dashboard is live via ngrok at: {public_url}\n")
# # except Exception as e:
# #     print(f"\n❌ Error creating ngrok tunnel: {e}")
# #     print("Make sure your ngrok authtoken is set correctly if using this method.")

/bin/bash: line 1: ngrok: command not found

📊 Access your dashboard at: https://8501-m-s-2o85h5vwfbbuk-b.us-west3-0.prod.colab.dev
If the link doesn't work, try the ngrok section below (uncomment and configure).
