# Zero-Day Attack Detection IDS - Comprehensive Analysis

This notebook demonstrates a complete self-learning Intrusion Detection System (IDS) for detecting zero-day attacks using deep learning, network traffic analysis, and custom attack simulations.

## Table of Contents
1. [Import Required Libraries](#import)
2. [Load and Explore Public Network Traffic Datasets](#datasets)  
3. [Data Preprocessing and Feature Engineering](#preprocessing)
4. [Network Traffic Analysis and Visualization](#analysis)
5. [Implement Cyber Attack Taxonomy Classification](#taxonomy)
6. [Build Deep Learning Models for Anomaly Detection](#deep-learning)
7. [Create Custom Attack Scenario Simulation](#simulation)
8. [Train Self-Learning IDS Models](#training)
9. [Model Evaluation and Performance Metrics](#evaluation)
10. [Zero-Day Attack Detection Testing](#zero-day-testing)
11. [Real-time Traffic Monitoring Implementation](#real-time)

**Author:** IDS Development Team  
**Date:** 2024  
**Python Version:** 3.8+

## 1. Import Required Libraries and Dependencies

We'll start by importing the libraries the IDS needs. TensorFlow, PyTorch, and Scapy are optional: the cell reports when one of them is missing instead of failing.

In [None]:
# Core libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Set up plotting style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Machine Learning libraries
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.svm import OneClassSVM
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Deep Learning libraries
try:
    import tensorflow as tf
    from tensorflow import keras
    from tensorflow.keras import layers, models
    from tensorflow.keras.utils import plot_model
    print(f"TensorFlow version: {tf.__version__}")
    TF_AVAILABLE = True
except ImportError:
    print("TensorFlow not available - some features will be disabled")
    TF_AVAILABLE = False

try:
    import torch
    import torch.nn as nn
    import torch.optim as optim
    from torch.utils.data import DataLoader, TensorDataset
    print(f"PyTorch version: {torch.__version__}")
    TORCH_AVAILABLE = True
except ImportError:
    print("PyTorch not available - some features will be disabled")
    TORCH_AVAILABLE = False

# Network analysis libraries
try:
    from scapy.all import IP, TCP, UDP, ICMP, Raw, sniff, send
    print("Scapy available for network analysis")
    SCAPY_AVAILABLE = True
except ImportError:
    print("Scapy not available - network simulation will be limited")
    SCAPY_AVAILABLE = False

# Additional utilities
import time
import json
import os
import sys
from datetime import datetime
from collections import defaultdict, deque
import logging
import threading
import random

# Add our project modules to path
sys.path.append('../src')

# Import our custom modules
try:
    from data.dataset_manager import DatasetManager
    from models.ensemble_detector import EnsembleDetector
    from analysis.traffic_analyzer import TrafficAnalyzer
    from simulation.attack_simulator import AttackSimulator, AttackScenario
    from detection.real_time_monitor import RealTimeMonitor
    from utils.helpers import setup_logging, normalize_features, calculate_metrics
    print("✅ All custom modules imported successfully")
except ImportError as e:
    print(f"⚠️ Some custom modules not available: {e}")
    print("Using fallback implementations...")

# `import matplotlib.pyplot as plt` and `from sklearn.x import y` do not bind
# the top-level package names, so import them explicitly before reading
# `__version__`; without this the Matplotlib and Scikit-learn lines raise
# NameError.
import matplotlib
import sklearn

# Report the interpreter and core library versions for reproducibility.
print("📦 All libraries imported successfully!")
print(f"🐍 Python version: {sys.version}")
print(f"📊 NumPy version: {np.__version__}")
print(f"🐼 Pandas version: {pd.__version__}")
print(f"📈 Matplotlib version: {matplotlib.__version__}")
print(f"🌊 Seaborn version: {sns.__version__}")
print(f"🤖 Scikit-learn version: {sklearn.__version__}")

## 2. Load and Explore Public Network Traffic Datasets

We'll download the public datasets managed by `DatasetManager`, then load and explore NSL-KDD, a benchmark that is widely used in intrusion detection research.

In [None]:
# Bring up the dataset manager, fetch the raw files, and load NSL-KDD.
print("🔄 Initializing dataset manager...")
dm = DatasetManager()

# Ask the manager to download the datasets
print("📥 Downloading datasets...")
dm.download_datasets()

# Training and test splits as returned by the manager
print("📊 Loading NSL-KDD dataset...")
train_df, test_df = dm.load_nsl_kdd()

print(
    f"✅ Training data shape: {train_df.shape}",
    f"✅ Test data shape: {test_df.shape}",
    sep="\n",
)

# Size summary
print("\n📈 Dataset Overview:", "=" * 50, sep="\n")
print(
    f"Training samples: {len(train_df):,}",
    f"Test samples: {len(test_df):,}",
    f"Total features: {train_df.shape[1]}",
    sep="\n",
)

# Total number of null cells in each split
print(
    f"Missing values in training data: {train_df.isnull().sum().sum()}",
    f"Missing values in test data: {test_df.isnull().sum().sum()}",
    sep="\n",
)

# Peek at the raw records
print("\n🔍 First 5 rows of training data:", "=" * 50, sep="\n")
display(train_df.head())

# Summary statistics as produced by describe()
print("\n📊 Dataset Statistics:", "=" * 50, sep="\n")
display(train_df.describe())

In [None]:
# Analyze attack type distribution: print the ten most frequent attack types
# and draw a 2x2 overview figure (attack types, normal-vs-attack share,
# attack categories, correlation of the first ten numeric columns).
if 'attack_type' in train_df.columns:
    attack_counts = train_df['attack_type'].value_counts()

    print("🚨 Attack Type Distribution in Training Data:")
    print("=" * 50)
    for attack_type, count in attack_counts.head(10).items():
        percentage = (count / len(train_df)) * 100
        print(f"{attack_type:15} | {count:6,} ({percentage:5.2f}%)")

    # Create visualization
    plt.figure(figsize=(15, 10))

    # Attack type distribution
    plt.subplot(2, 2, 1)
    attack_counts.head(10).plot(kind='bar')
    plt.title('Top 10 Attack Types Distribution')
    plt.xlabel('Attack Type')
    plt.ylabel('Count')
    plt.xticks(rotation=45, ha='right')

    # Binary classification distribution
    plt.subplot(2, 2, 2)
    if 'is_attack' in train_df.columns:
        binary_counts = train_df['is_attack'].value_counts()
        # value_counts() orders rows by frequency, not by class value, so take
        # each wedge's label and colour from the class value itself. A fixed
        # ['Normal', 'Attack'] list mislabels the pie whenever attacks
        # outnumber normal traffic and fails when only one class is present.
        # NOTE(review): assumes is_attack is 0/1 or bool — confirm against
        # DatasetManager.
        labels = ['Attack' if flag else 'Normal' for flag in binary_counts.index]
        colors = ['lightcoral' if flag else 'lightgreen' for flag in binary_counts.index]
        plt.pie(binary_counts.values, labels=labels, colors=colors, autopct='%1.1f%%')
        plt.title('Normal vs Attack Distribution')

    # Attack category distribution
    plt.subplot(2, 2, 3)
    if 'attack_category' in train_df.columns:
        category_counts = train_df['attack_category'].value_counts()
        category_labels = ['Normal', 'DoS', 'Probe', 'R2L', 'U2R']
        category_palette = ['lightgreen', 'red', 'orange', 'purple', 'brown']
        # Key the palette by category rather than by bar position, because the
        # bars come out in frequency order. Both integer codes and
        # case-insensitive names are accepted.
        # NOTE(review): integer codes are assumed to follow Normal=0, DoS=1,
        # Probe=2, R2L=3, U2R=4 (the order of category_labels) — confirm.
        color_by_category = {}
        for code, (name, color) in enumerate(zip(category_labels, category_palette)):
            color_by_category[code] = color
            color_by_category[name.lower()] = color
        bar_colors = [
            color_by_category.get(key.lower() if isinstance(key, str) else key, 'gray')
            for key in category_counts.index
        ]
        category_counts.plot(kind='bar', color=bar_colors)
        plt.title('Attack Category Distribution')
        plt.xlabel('Category')
        plt.ylabel('Count')
        plt.xticks(rotation=0)

    # Feature correlation heatmap (sample of numeric features)
    plt.subplot(2, 2, 4)
    numeric_features = train_df.select_dtypes(include=[np.number]).columns[:10]
    correlation_matrix = train_df[numeric_features].corr()
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, fmt='.2f')
    plt.title('Feature Correlation Heatmap (Sample)')

    plt.tight_layout()
    plt.show()

else:
    print("ℹ️ Attack type column not found. Using generated sample data.")

## 3. Data Preprocessing and Feature Engineering

Now we'll preprocess the data, handle missing values, normalize features, and prepare it for machine learning models.

In [None]:
# Pull the model-ready NSL-KDD arrays out of the DatasetManager
print("🔄 Preprocessing dataset...")
data = dm.get_preprocessed_data('nsl_kdd')

X_train = data['X_train']
X_test = data['X_test']
y_train = data['y_train']
y_test = data['y_test']
feature_names = data['feature_names']
scaler = data['scaler']

print(
    f"✅ Preprocessed training data shape: {X_train.shape}",
    f"✅ Preprocessed test data shape: {X_test.shape}",
    f"✅ Number of features: {len(feature_names)}",
    sep="\n",
)

# Per-class sample counts; label 0 is reported as Normal, any other as Attack
print(f"\n📊 Class Distribution:", "=" * 30, sep="\n")
unique_train, counts_train = np.unique(y_train, return_counts=True)
for class_label, count in zip(unique_train, counts_train):
    if class_label == 0:
        class_name = "Normal"
    else:
        class_name = "Attack"
    percentage = (count / len(y_train)) * 100
    print(f"{class_name:8} | {count:6,} ({percentage:5.2f}%)")

# Visualize preprocessed data characteristics (2x3 grid; panels 1 and 2 here)
plt.figure(figsize=(15, 10))

# Feature distribution after scaling
plt.subplot(2, 3, 1)
feature_sample = X_train[:, :5]  # First 5 features
plt.boxplot(feature_sample)
plt.title('Distribution of First 5 Features (Scaled)')
plt.xlabel('Feature Index')
plt.ylabel('Value')

# Class distribution
plt.subplot(2, 3, 2)
# Count by label value instead of by position in np.unique's output: label 0
# is Normal and everything else is Attack. Positional indexing put the wrong
# count under "Normal" whenever label 0 was absent from the training labels.
normal_count = int(np.sum(np.asarray(y_train) == 0))
class_counts = [normal_count, len(y_train) - normal_count]
class_labels = ['Normal', 'Attack']
colors = ['lightgreen', 'lightcoral']
plt.bar(class_labels, class_counts, color=colors)
plt.title('Class Distribution (Training)')
plt.ylabel('Count')

# Feature variance analysis: histogram of the per-column variance of X_train
plt.subplot(2, 3, 3)
feature_variances = np.var(X_train, axis=0)
plt.hist(feature_variances, bins=20, alpha=0.7, color='skyblue')
plt.title('Feature Variance Distribution')
plt.xlabel('Variance')
plt.ylabel('Number of Features')

# Correlation analysis of top features
plt.subplot(2, 3, 4)
# Select features with highest variance for correlation analysis
# (argsort is ascending, so the last 10 indices are the 10 largest variances)
high_var_indices = np.argsort(feature_variances)[-10:]
high_var_features = X_train[:, high_var_indices]
# corrcoef treats rows as variables, hence the transpose
correlation_matrix = np.corrcoef(high_var_features.T)
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, fmt='.2f')
plt.title('Correlation: Top 10 Variance Features')

# PCA visualization
plt.subplot(2, 3, 5)
if X_train.shape[1] > 2:
    pca = PCA(n_components=2)
    # Only the first 1000 rows are projected (a leading slice, not a random draw)
    X_pca = pca.fit_transform(X_train[:1000])  # Sample for visualization
    y_sample = y_train[:1000]

    # Green = label 0, red = any other label
    colors = ['green' if label == 0 else 'red' for label in y_sample]
    plt.scatter(X_pca[:, 0], X_pca[:, 1], c=colors, alpha=0.6, s=10)
    plt.title(f'PCA Visualization (Explained Var: {sum(pca.explained_variance_ratio_):.2f})')
    plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2f})')
    plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2f})')

# Feature importance using Random Forest
plt.subplot(2, 3, 6)
# Fitted on the full training set; feature_importance is reused by later cells
rf = RandomForestClassifier(n_estimators=50, random_state=42)
rf.fit(X_train, y_train)
feature_importance = rf.feature_importances_

# Get top 10 important features (ascending order: the most important is last)
top_indices = np.argsort(feature_importance)[-10:]
top_importance = feature_importance[top_indices]
# Fall back to a generic label when feature_names is shorter than the feature count
top_features = [feature_names[i] if i < len(feature_names) else f'Feature_{i}' for i in top_indices]

plt.barh(range(len(top_importance)), top_importance)
plt.yticks(range(len(top_importance)), top_features)
plt.title('Top 10 Feature Importance (Random Forest)')
plt.xlabel('Importance')

plt.tight_layout()
plt.show()

print(f"🎯 Top 5 most important features:")
# top_indices and top_importance come from np.argsort and are therefore in
# ascending order of importance. Walk them backwards so that rank 1 is the
# most important feature; the forward slice printed the ranking upside down.
for i, (idx, importance) in enumerate(zip(top_indices[::-1][:5], top_importance[::-1][:5])):
    feature_name = feature_names[idx] if idx < len(feature_names) else f'Feature_{idx}'
    print(f"  {i+1}. {feature_name}: {importance:.4f}")

## 4. Network Traffic Analysis and Visualization

Let's analyze network traffic patterns and visualize the differences between normal and malicious traffic.

In [None]:
# Create the traffic analyzer instance
print("🔍 Initializing Traffic Analyzer...")
traffic_analyzer = TrafficAnalyzer()

# Split the training matrix by label for the comparisons that follow
print("📊 Analyzing traffic patterns...")

# Row positions of label 0 (normal) and label 1 (attack)
normal_indices = np.where(y_train == 0)[0]
attack_indices = np.where(y_train == 1)[0]

normal_traffic = X_train[normal_indices]
attack_traffic = X_train[attack_indices]

print(
    f"Normal traffic samples: {len(normal_traffic):,}",
    f"Attack traffic samples: {len(attack_traffic):,}",
    sep="\n",
)

# Create comprehensive traffic analysis visualizations.
# 3x4 grid. Panels 1-4 and 12 use the real training data; panels 5-11 are
# drawn from synthetic random samples generated here purely for illustration.
plt.figure(figsize=(20, 15))

# Feature comparison between normal and attack traffic
plt.subplot(3, 4, 1)
feature_idx = 0  # First feature
plt.hist(normal_traffic[:, feature_idx], bins=50, alpha=0.7, label='Normal', color='green', density=True)
plt.hist(attack_traffic[:, feature_idx], bins=50, alpha=0.7, label='Attack', color='red', density=True)
plt.title(f'Feature {feature_idx} Distribution')
plt.xlabel('Value')
plt.ylabel('Density')
plt.legend()

# Statistical comparison: per-feature means of the first (up to) 10 features
plt.subplot(3, 4, 2)
normal_means = np.mean(normal_traffic, axis=0)
attack_means = np.mean(attack_traffic, axis=0)
feature_indices = range(min(10, len(normal_means)))
width = 0.35
# Bars are offset by half a width so the two classes sit side by side
plt.bar([i - width/2 for i in feature_indices], normal_means[:10], width, label='Normal', color='green', alpha=0.7)
plt.bar([i + width/2 for i in feature_indices], attack_means[:10], width, label='Attack', color='red', alpha=0.7)
plt.title('Mean Feature Values Comparison')
plt.xlabel('Feature Index')
plt.ylabel('Mean Value')
plt.legend()

# Standard deviation comparison (same layout as the means panel)
plt.subplot(3, 4, 3)
normal_stds = np.std(normal_traffic, axis=0)
attack_stds = np.std(attack_traffic, axis=0)
plt.bar([i - width/2 for i in feature_indices], normal_stds[:10], width, label='Normal', color='green', alpha=0.7)
plt.bar([i + width/2 for i in feature_indices], attack_stds[:10], width, label='Attack', color='red', alpha=0.7)
plt.title('Standard Deviation Comparison')
plt.xlabel('Feature Index')
plt.ylabel('Standard Deviation')
plt.legend()

# t-SNE visualization for traffic patterns
# (the panel stays empty when there are 1000 training rows or fewer)
plt.subplot(3, 4, 4)
if X_train.shape[0] > 1000:  # Only if we have enough samples
    sample_size = 1000
    # Unseeded draw: the sampled rows differ between runs
    sample_indices = np.random.choice(len(X_train), sample_size, replace=False)
    X_sample = X_train[sample_indices]
    y_sample = y_train[sample_indices]

    tsne = TSNE(n_components=2, random_state=42, perplexity=30)
    X_tsne = tsne.fit_transform(X_sample)

    colors = ['green' if label == 0 else 'red' for label in y_sample]
    plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=colors, alpha=0.6, s=10)
    plt.title('t-SNE: Traffic Pattern Visualization')
    plt.xlabel('t-SNE 1')
    plt.ylabel('t-SNE 2')

# Generate synthetic network flow features for demonstration
plt.subplot(3, 4, 5)
# Simulate packet sizes (normal vs attack patterns)
normal_packet_sizes = np.random.lognormal(mean=6.5, sigma=0.8, size=1000)  # Typical web traffic
attack_packet_sizes = np.concatenate([
    np.random.normal(64, 10, 300),    # Small probe packets
    np.random.normal(1500, 100, 200),  # Large DoS packets
    np.random.exponential(100, 500)   # Variable sizes
])

plt.hist(normal_packet_sizes, bins=50, alpha=0.7, label='Normal', color='green', density=True)
plt.hist(attack_packet_sizes, bins=50, alpha=0.7, label='Attack', color='red', density=True)
plt.title('Packet Size Distribution (Simulated)')
plt.xlabel('Packet Size (bytes)')
plt.ylabel('Density')
plt.legend()

# Simulate connection duration patterns
plt.subplot(3, 4, 6)
normal_durations = np.random.gamma(2, 10, 1000)  # Typical session durations
attack_durations = np.concatenate([
    np.random.exponential(0.1, 500),  # Very short connections (scans)
    np.random.uniform(300, 3600, 200),  # Long connections (data exfiltration)
    np.random.normal(1, 0.5, 300)    # Brief connections
])

plt.hist(normal_durations, bins=50, alpha=0.7, label='Normal', color='green', density=True)
plt.hist(attack_durations, bins=50, alpha=0.7, label='Attack', color='red', density=True)
plt.title('Connection Duration (Simulated)')
plt.xlabel('Duration (seconds)')
plt.ylabel('Density')
plt.legend()

# Simulate inter-arrival times
plt.subplot(3, 4, 7)
normal_inter_arrival = np.random.exponential(0.5, 1000)  # Regular traffic
attack_inter_arrival = np.concatenate([
    np.random.exponential(0.001, 500),  # Rapid-fire attacks
    np.random.exponential(2, 500)      # Slow scans
])

plt.hist(normal_inter_arrival, bins=50, alpha=0.7, label='Normal', color='green', density=True)
plt.hist(attack_inter_arrival, bins=50, alpha=0.7, label='Attack', color='red', density=True)
plt.title('Inter-arrival Time (Simulated)')
plt.xlabel('Time (seconds)')
plt.ylabel('Density')
plt.legend()

# Port usage patterns
plt.subplot(3, 4, 8)
# Normal traffic is drawn from a fixed list of ports with the given weights
normal_ports = np.random.choice([80, 443, 22, 21, 25, 993, 995], 1000, p=[0.4, 0.3, 0.1, 0.05, 0.05, 0.05, 0.05])
attack_ports = np.concatenate([
    np.random.randint(1, 1024, 300),    # Privileged port scans
    np.random.randint(1024, 65536, 700)  # Random port scans
])

plt.hist(normal_ports, bins=50, alpha=0.7, label='Normal', color='green', density=True)
plt.hist(attack_ports, bins=50, alpha=0.7, label='Attack', color='red', density=True)
plt.title('Port Usage Patterns (Simulated)')
plt.xlabel('Port Number')
plt.ylabel('Density')
plt.legend()

# Protocol distribution (hard-coded illustrative proportions, not measured)
plt.subplot(3, 4, 9)
protocols = ['TCP', 'UDP', 'ICMP', 'Other']
normal_protocol_dist = [0.7, 0.2, 0.05, 0.05]
attack_protocol_dist = [0.5, 0.3, 0.15, 0.05]

x = np.arange(len(protocols))
plt.bar([i - 0.2 for i in x], normal_protocol_dist, 0.4, label='Normal', color='green', alpha=0.7)
plt.bar([i + 0.2 for i in x], attack_protocol_dist, 0.4, label='Attack', color='red', alpha=0.7)
plt.title('Protocol Distribution (Simulated)')
plt.xlabel('Protocol')
plt.ylabel('Proportion')
plt.xticks(x, protocols)
plt.legend()

# Traffic volume over time (simulated)
plt.subplot(3, 4, 10)
time_points = np.arange(24)  # 24 hours
# Sinusoid with a 24-hour period plus Gaussian noise
normal_traffic_volume = 100 + 50 * np.sin(time_points * np.pi / 12) + np.random.normal(0, 10, 24)
attack_traffic_volume = np.random.poisson(20, 24)  # Sporadic attacks
attack_traffic_volume[10:14] += 200  # Attack burst

plt.plot(time_points, normal_traffic_volume, 'g-', label='Normal', linewidth=2)
plt.plot(time_points, attack_traffic_volume, 'r-', label='Attack', linewidth=2)
plt.title('Traffic Volume Over Time (Simulated)')
plt.xlabel('Hour of Day')
plt.ylabel('Packets/Hour')
plt.legend()

# Anomaly scores distribution
plt.subplot(3, 4, 11)
# Simulate anomaly scores (random beta draws, not output of any model)
normal_anomaly_scores = np.random.beta(2, 10, 1000)  # Low scores for normal traffic
attack_anomaly_scores = np.random.beta(8, 3, 1000)   # High scores for attacks

plt.hist(normal_anomaly_scores, bins=50, alpha=0.7, label='Normal', color='green', density=True)
plt.hist(attack_anomaly_scores, bins=50, alpha=0.7, label='Attack', color='red', density=True)
plt.title('Anomaly Score Distribution (Simulated)')
plt.xlabel('Anomaly Score')
plt.ylabel('Density')
plt.legend()

# Feature importance for attack detection
# (reuses feature_importance from the Random Forest fitted in the preprocessing cell)
plt.subplot(3, 4, 12)
if len(feature_importance) > 0:
    top_10_indices = np.argsort(feature_importance)[-10:]
    top_10_importance = feature_importance[top_10_indices]
    feature_labels = [f'F{i}' for i in top_10_indices]

    plt.barh(range(len(top_10_importance)), top_10_importance, color='skyblue')
    plt.yticks(range(len(top_10_importance)), feature_labels)
    plt.title('Top 10 Features for Attack Detection')
    plt.xlabel('Importance Score')

plt.tight_layout()
plt.show()

print("✅ Traffic analysis completed!")
print(f"🔍 Key insights:")
# Report what the data in this run actually shows. The earlier fixed sentences
# were printed whatever the numbers were, and claimed a t-SNE separation even
# when the t-SNE panel had been skipped.
n_more_variable = int(np.sum(attack_stds > normal_stds))
n_compared = len(normal_stds)
print(f"  - Attack traffic has higher variance than normal traffic in {n_more_variable} of {n_compared} features")
if X_train.shape[0] > 1000:  # same condition that guards the t-SNE panel
    print(f"  - Inspect the t-SNE panel for separation between normal and attack traffic")
if len(feature_importance) > 0:
    # np.argmax picks the single most important feature of the Random Forest
    best_idx = int(np.argmax(feature_importance))
    best_name = feature_names[best_idx] if best_idx < len(feature_names) else f'Feature_{best_idx}'
    print(f"  - Highest-ranked feature for attack detection: {best_name}")

## 5. Implement Cyber Attack Taxonomy Classification

Let's implement a comprehensive attack taxonomy system to classify different types of cyber attacks according to established frameworks.

In [None]:
# Define the attack taxonomy used throughout the notebook.
# NOTE(review): the original comment attributes this to "NIST and industry
# standards"; the five top-level categories are the Normal/DoS/Probe/R2L/U2R
# grouping commonly associated with KDD Cup 99 / NSL-KDD — confirm the source.
#
# Layout:
#   'categories'      int category id -> display name
#   'subcategories'   short category key -> {attack key -> display name}
#   'attack_vectors'  vector key -> display name
#   'severity_levels' level name -> {'score': int, 'color': plot colour}
attack_taxonomy = {
    'categories': {
        0: 'Normal',
        1: 'Denial of Service (DoS)',
        2: 'Probe/Reconnaissance', 
        3: 'Remote to Local (R2L)',
        4: 'User to Root (U2R)'
    },
    'subcategories': {
        # DoS attacks
        'dos': {
            'syn_flood': 'TCP SYN Flood Attack',
            'udp_flood': 'UDP Flood Attack', 
            'icmp_flood': 'ICMP Flood Attack',
            'http_flood': 'HTTP Flood Attack',
            'slowloris': 'Slowloris Attack',
            'teardrop': 'Teardrop Attack'
        },
        # Probe attacks
        'probe': {
            'port_scan': 'Port Scanning',
            'network_scan': 'Network Scanning',
            'vulnerability_scan': 'Vulnerability Scanning',
            'os_fingerprinting': 'OS Fingerprinting',
            'service_enumeration': 'Service Enumeration'
        },
        # R2L attacks
        'r2l': {
            'password_attack': 'Password-based Attack',
            'social_engineering': 'Social Engineering',
            'phishing': 'Phishing Attack',
            'web_attack': 'Web Application Attack',
            'backdoor': 'Backdoor Installation'
        },
        # U2R attacks
        'u2r': {
            'buffer_overflow': 'Buffer Overflow',
            'privilege_escalation': 'Privilege Escalation',
            'rootkit': 'Rootkit Installation',
            'malware': 'Malware Execution'
        }
    },
    'attack_vectors': {
        'network': 'Network-based Attack',
        'application': 'Application-based Attack',
        'physical': 'Physical Access Attack',
        'social': 'Social Engineering Attack',
        'insider': 'Insider Threat'
    },
    'severity_levels': {
        'low': {'score': 1, 'color': 'green'},
        'medium': {'score': 2, 'color': 'yellow'},
        'high': {'score': 3, 'color': 'orange'},
        'critical': {'score': 4, 'color': 'red'}
    }
}

# Print the top-level categories as a quick reference
print("🏷️ Cyber Attack Taxonomy Framework")
print("=" * 50)
for cat_id, cat_name in attack_taxonomy['categories'].items():
    print(f"Category {cat_id}: {cat_name}")

# Create attack classifier based on features
class AttackTaxonomyClassifier:
    """Majority-vote ensemble that assigns taxonomy category ids to samples.

    Three scikit-learn classifiers (random forest, logistic regression, SVC)
    are trained on the same data; predictions are combined by majority vote.
    Category ids are the keys of ``attack_taxonomy['categories']``.
    """

    # Severity label per category id; unknown ids fall back to 'medium'.
    _SEVERITY_BY_CATEGORY = {0: 'low', 1: 'high', 2: 'medium', 3: 'high', 4: 'critical'}

    # Suggested responses per category id.
    _RECOMMENDATIONS = {
        0: ["Monitor for unusual patterns", "Maintain baseline"],
        1: ["Implement rate limiting", "Deploy DDoS protection", "Monitor bandwidth"],
        2: ["Block scanning IPs", "Update firewall rules", "Monitor port activity"],
        3: ["Strengthen authentication", "Monitor failed logins", "Update access controls"],
        4: ["Check system integrity", "Monitor privileged accounts", "Update patches"]
    }

    def __init__(self):
        self.models = {}          # model name -> estimator
        self.is_trained = False   # set once train_multiclass_classifier finishes

    @staticmethod
    def _simulate_multiclass_labels(y_binary, rng):
        """Expand binary labels into demo category ids.

        Rows labelled 1 (attack) get a random category in 1..4; every other
        row stays 0 (Normal). Drawing a random category for all rows first,
        as was done before, left roughly 40% of normal rows labelled as
        attacks.
        """
        y_binary = np.asarray(y_binary).ravel()
        y_multiclass = np.zeros(len(y_binary), dtype=int)
        attack_indices = np.where(y_binary == 1)[0]
        y_multiclass[attack_indices] = rng.choice(
            [1, 2, 3, 4], size=len(attack_indices), p=[0.5, 0.25, 0.15, 0.1]
        )
        return y_multiclass

    def train_multiclass_classifier(self, X_train, y_train, random_state=None):
        """Train multi-class classifier for attack taxonomy.

        Args:
            X_train: training feature matrix.
            y_train: training labels. If exactly two distinct values are
                present, simulated category labels are generated instead
                (demonstration only — the attack categories are random).
            random_state: optional seed for the simulated labels; ``None``
                keeps them unseeded.

        Returns:
            The label array the models were actually fitted on.
        """
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.linear_model import LogisticRegression
        from sklearn.svm import SVC

        # Convert binary labels to multi-class if needed
        if len(np.unique(y_train)) == 2:
            rng = np.random.default_rng(random_state)
            y_train = self._simulate_multiclass_labels(y_train, rng)

        # Train multiple classifiers
        self.models['rf'] = RandomForestClassifier(n_estimators=100, random_state=42)
        self.models['lr'] = LogisticRegression(random_state=42, max_iter=1000)
        # probability=True is required for predict_proba below.
        # NOTE(review): SVC training can be slow on large training sets.
        self.models['svm'] = SVC(probability=True, random_state=42)

        print("🔄 Training attack taxonomy classifiers...")
        for name, model in self.models.items():
            print(f"  Training {name}...")
            model.fit(X_train, y_train)

        self.is_trained = True
        return y_train  # Return the labels used

    def predict_attack_category(self, X):
        """Predict attack category with confidence scores.

        Returns:
            ``(ensemble_pred, predictions, probabilities)``: the majority-vote
            label per sample, plus dicts of each model's ``predict`` and
            ``predict_proba`` output keyed by model name.

        Raises:
            ValueError: if called before training.
        """
        if not self.is_trained:
            raise ValueError("Classifier must be trained first")

        predictions = {}
        probabilities = {}

        for name, model in self.models.items():
            predictions[name] = model.predict(X)
            probabilities[name] = model.predict_proba(X)

        # Ensemble prediction (majority vote). Candidates are sorted so that
        # ties resolve to the smallest label rather than depending on set
        # iteration order.
        ensemble_pred = []
        for votes in zip(*predictions.values()):
            votes = list(votes)
            ensemble_pred.append(max(sorted(set(votes)), key=votes.count))

        return np.array(ensemble_pred), predictions, probabilities

    def get_attack_details(self, prediction, confidence=None):
        """Get detailed information about predicted attack.

        Looks the category name up in the module-level ``attack_taxonomy``;
        unknown ids are reported as 'Unknown'. A missing confidence is
        reported as 0.0.
        """
        category = attack_taxonomy['categories'].get(prediction, 'Unknown')

        details = {
            'category_id': prediction,
            'category_name': category,
            'confidence': confidence if confidence is not None else 0.0,
            'severity': self.estimate_severity(prediction),
            'recommended_actions': self.get_recommendations(prediction)
        }

        return details

    def estimate_severity(self, category_id):
        """Estimate attack severity based on category ('medium' if unknown)."""
        return self._SEVERITY_BY_CATEGORY.get(category_id, 'medium')

    def get_recommendations(self, category_id):
        """Get security recommendations based on attack category.

        Returns a fresh list, so callers may modify it without touching the
        shared table.
        """
        return list(self._RECOMMENDATIONS.get(category_id, ["General security monitoring"]))

# Initialize and train the taxonomy classifier
taxonomy_classifier = AttackTaxonomyClassifier()
y_multiclass = taxonomy_classifier.train_multiclass_classifier(X_train, y_train)

print("✅ Attack taxonomy classifier trained successfully!")

# Test the classifier on (at most) the first 100 test rows
print("\n🧪 Testing taxonomy classification...")
test_predictions, model_predictions, model_probabilities = taxonomy_classifier.predict_attack_category(X_test[:100])

# Analyze predictions
unique_preds, pred_counts = np.unique(test_predictions, return_counts=True)
# Use the real sample size: X_test[:100] yields fewer than 100 rows on a small
# test set, which made both the "n=100" label and the percentages wrong.
n_scored = len(test_predictions)
print(f"\n📊 Prediction distribution on test sample (n={n_scored}):")
for pred, count in zip(unique_preds, pred_counts):
    # .get() mirrors get_attack_details: ids outside the taxonomy are 'Unknown'
    category_name = attack_taxonomy['categories'].get(pred, 'Unknown')
    percentage = (count / n_scored) * 100
    print(f"  {category_name}: {count} ({percentage:.1f}%)")

# Visualize taxonomy classification results (2x3 grid; panels 1-3 here)
plt.figure(figsize=(15, 10))

# Category distribution of the ensemble predictions
plt.subplot(2, 3, 1)
# NOTE(review): direct indexing raises KeyError for a predicted id that is not
# in attack_taxonomy['categories']
category_names = [attack_taxonomy['categories'][pred] for pred in unique_preds]
colors = plt.cm.Set3(np.linspace(0, 1, len(unique_preds)))
plt.pie(pred_counts, labels=category_names, colors=colors, autopct='%1.1f%%')
plt.title('Attack Category Distribution (Predictions)')

# Model agreement visualization
plt.subplot(2, 3, 2)
model_names = list(model_predictions.keys())
agreement_matrix = np.zeros((len(model_names), len(model_names)))

# Entry (i, j) is the fraction of samples on which models i and j predict the
# same label; the matrix is symmetric with ones on the diagonal.
for i, model1 in enumerate(model_names):
    for j, model2 in enumerate(model_names):
        agreement = np.mean(model_predictions[model1] == model_predictions[model2])
        agreement_matrix[i, j] = agreement

sns.heatmap(agreement_matrix, annot=True, xticklabels=model_names, 
            yticklabels=model_names, cmap='Blues', fmt='.3f')
plt.title('Model Agreement Matrix')

# Confidence distribution
plt.subplot(2, 3, 3)
# Confidence = highest class probability the random forest gives each sample
rf_confidences = np.max(model_probabilities['rf'], axis=1)
plt.hist(rf_confidences, bins=20, alpha=0.7, color='skyblue')
plt.title('Prediction Confidence Distribution (RF)')
plt.xlabel('Confidence Score')
plt.ylabel('Frequency')

# Attack severity distribution
plt.subplot(2, 3, 4)
severities = [taxonomy_classifier.estimate_severity(pred) for pred in test_predictions]
severity_counts = pd.Series(severities).value_counts()
# value_counts() orders bars by frequency, so a positional colour list paints
# e.g. 'high' green whenever it is the most common level. Take each bar's
# colour from the taxonomy's own severity table instead.
severity_colors = [
    attack_taxonomy['severity_levels'].get(level, {}).get('color', 'gray')
    for level in severity_counts.index
]
severity_counts.plot(kind='bar', color=severity_colors)
plt.title('Attack Severity Distribution')
plt.xlabel('Severity Level')
plt.ylabel('Count')
plt.xticks(rotation=45)

# Feature importance for taxonomy classification
plt.subplot(2, 3, 5)
rf_model = taxonomy_classifier.models['rf']
importance = rf_model.feature_importances_
# argsort is ascending: the last 10 indices are the 10 most important features
top_indices = np.argsort(importance)[-10:]
top_importance = importance[top_indices]

# Label bars with real feature names where available, using the same fallback
# as the Random Forest importance plot in the preprocessing section.
top_labels = [feature_names[i] if i < len(feature_names) else f'Feature {i}' for i in top_indices]
plt.barh(range(len(top_importance)), top_importance)
plt.yticks(range(len(top_importance)), top_labels)
plt.title('Top 10 Features for Taxonomy Classification')
plt.xlabel('Importance Score')

# Confusion matrix for the taxonomy classifier
plt.subplot(2, 3, 6)
# Score only the rows that were actually predicted, so both inputs to
# confusion_matrix always have the same length.
y_test_sample = np.asarray(y_test)[:len(test_predictions)]
if len(np.unique(y_test)) > 2:
    # Ground truth already carries category ids: compare directly.
    cm_true = y_test_sample
    cm_pred = np.asarray(test_predictions)
    cm_labels = sorted(set(np.unique(cm_true)) | set(np.unique(cm_pred)))
    cm_tick_labels = cm_labels
else:
    # Binary ground truth. Previously this case drew nothing, because the
    # binary handling sat inside the "> 2 classes" branch. Randomly simulated
    # "true" categories would make the matrix meaningless, so collapse the
    # predictions to normal (0) vs attack (any other id) instead.
    cm_true = (y_test_sample != 0).astype(int)
    cm_pred = (np.asarray(test_predictions) != 0).astype(int)
    cm_labels = [0, 1]
    cm_tick_labels = ['Normal', 'Attack']

# Passing labels= fixes the axis order and keeps the matrix square even when a
# class is missing from the sample.
cm = confusion_matrix(cm_true, cm_pred, labels=cm_labels)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=cm_tick_labels, yticklabels=cm_tick_labels)
plt.title('Confusion Matrix (Taxonomy Classification)')
plt.xlabel('Predicted Category')
plt.ylabel('True Category')

plt.tight_layout()
plt.show()

# Show detailed analysis for a few samples
print("\n🔍 Detailed Analysis of Sample Predictions:")
print("=" * 60)
# Cap at the number of available predictions; a fixed range(5) raises
# IndexError when fewer than five test rows were scored.
for i in range(min(5, len(test_predictions))):
    pred = test_predictions[i]
    # NOTE(review): this is the random forest's top class probability, while
    # `pred` is the ensemble vote — the two can refer to different classes.
    confidence = np.max(model_probabilities['rf'][i])
    details = taxonomy_classifier.get_attack_details(pred, confidence)

    print(f"\nSample {i+1}:")
    print(f"  Category: {details['category_name']}")
    print(f"  Confidence: {details['confidence']:.3f}")
    print(f"  Severity: {details['severity']}")
    # Only the first two recommendations are shown
    print(f"  Recommendations: {', '.join(details['recommended_actions'][:2])}")

print("\n✅ Cyber attack taxonomy classification completed!")

## 6. Build Deep Learning Models for Anomaly Detection

Now we'll train an ensemble anomaly detector that combines classical detectors (Isolation Forest, One-Class SVM) with deep learning models (an autoencoder, an LSTM, and a PyTorch autoencoder).

In [None]:
# Build, train, and evaluate the ensemble anomaly detector
print("🧠 Initializing Ensemble Deep Learning Models...")

# Per-model settings, keyed by the model names handed to EnsembleDetector
models_config = dict(
    isolation_forest=dict(contamination=0.1),
    one_class_svm=dict(nu=0.1),
    autoencoder=dict(encoding_dim=32),
    lstm=dict(sequence_length=10),
    pytorch_ae=dict(encoding_dim=32),
)

# The detector is sized to the number of feature columns
ensemble_detector = EnsembleDetector(
    input_dim=X_train.shape[1],
    models_config=models_config,
)

print(f"📊 Training ensemble on {X_train.shape[0]} samples with {X_train.shape[1]} features...")

# Time the training call (this may take a few minutes)
start_time = time.time()
training_results = ensemble_detector.train(X_train, y_train, validation_split=0.2)
training_time = time.time() - start_time

print(
    f"✅ Ensemble training completed in {training_time:.2f} seconds",
    f"🎯 Available models: {list(ensemble_detector.models.keys())}",
    sep="\n",
)

# Dump whatever metadata the detector reports about itself
ensemble_info = ensemble_detector.get_ensemble_info()
print(f"📋 Ensemble Info:")
for key, value in ensemble_info.items():
    print(f"   {key}: {value}")

# Time the prediction pass over the test set
print("\n🔮 Making predictions on test set...")
start_time = time.time()
ensemble_predictions = ensemble_detector.predict(X_test)
prediction_time = time.time() - start_time
print(f"✅ Predictions completed in {prediction_time:.2f} seconds")

# Per-model anomaly scores, used by the plots in the next cell
anomaly_scores = ensemble_detector.get_anomaly_scores(X_test)

# Score the ensemble; the confusion matrix is printed separately below
ensemble_metrics = calculate_metrics(y_test, ensemble_predictions)
print(f"\n📈 Ensemble Performance:", "=" * 40, sep="\n")
for metric, value in ensemble_metrics.items():
    if metric == 'confusion_matrix':
        continue
    print(f"  {metric.replace('_', ' ').title()}: {value:.4f}")

print(f"\n🎯 Confusion Matrix:")
print(ensemble_metrics['confusion_matrix'])

In [None]:
# Visualize deep learning model performance
plt.figure(figsize=(20, 15))

# Model performance comparison
plt.subplot(3, 4, 1)
model_names = list(anomaly_scores.keys())

# Bound unconditionally: the complexity panel and the final summary read these
# even when no per-model scores are available.
model_accuracies = []
ensemble_accuracy = np.mean(ensemble_predictions == y_test)

if model_names:
    for model_name in model_names:
        # .get() so a score key without a matching model yields 0.0 instead of KeyError
        detector_model = ensemble_detector.models.get(model_name)
        accuracy = 0.0
        if hasattr(detector_model, 'predict'):
            try:
                pred = detector_model.predict(X_test)
                if model_name in ['isolation_forest', 'one_class_svm']:
                    # these two report outliers as -1; map that to the attack label 1
                    pred_binary = (pred == -1).astype(int)
                else:
                    # NOTE(review): assumes the other models' predict() already
                    # returns 0/1 labels — confirm for the neural models.
                    pred_binary = pred
                accuracy = np.mean(pred_binary == y_test)
            except Exception as exc:
                # Best effort: keep the chart, but say why this bar is 0.0
                print(f"⚠️ Could not score model '{model_name}': {exc}")
                accuracy = 0.0
        model_accuracies.append(accuracy)

    plt.bar(model_names, model_accuracies, color='skyblue')
    plt.title('Individual Model Accuracy Comparison')
    plt.xlabel('Model')
    plt.ylabel('Accuracy')
    plt.xticks(rotation=45)

    # Ensemble accuracy as a horizontal reference line
    plt.axhline(y=ensemble_accuracy, color='red', linestyle='--',
                label=f'Ensemble: {ensemble_accuracy:.3f}')
    plt.legend()

# Anomaly score distributions: Isolation Forest scores split by true label
plt.subplot(3, 4, 2)
if 'isolation_forest' in anomaly_scores:
    scores = anomaly_scores['isolation_forest']
    normal_scores = scores[y_test == 0]
    attack_scores = scores[y_test == 1]

    # Both histograms share every setting except data, label and colour
    shared_hist_kwargs = dict(bins=30, alpha=0.7, density=True)
    plt.hist(normal_scores, label='Normal', color='green', **shared_hist_kwargs)
    plt.hist(attack_scores, label='Attack', color='red', **shared_hist_kwargs)
    plt.title('Isolation Forest Anomaly Scores')
    plt.xlabel('Anomaly Score')
    plt.ylabel('Density')
    plt.legend()

# ROC Curve comparison: one curve per model plus the averaged ensemble score
plt.subplot(3, 4, 3)
colors = ['blue', 'orange', 'green', 'red', 'purple']
for i, (model_name, scores) in enumerate(anomaly_scores.items()):
    if len(scores) == 0:
        continue
    try:
        fpr, tpr, _ = roc_curve(y_test, scores)
        auc = roc_auc_score(y_test, scores)
        plt.plot(fpr, tpr, color=colors[i % len(colors)],
                 label=f'{model_name} (AUC: {auc:.3f})')
    except Exception as exc:
        # Best effort: skip this curve, but report why instead of hiding it
        print(f"⚠️ Skipping ROC curve for '{model_name}': {exc}")
        continue

# Ensemble ROC
if anomaly_scores:
    try:
        # NOTE(review): a plain mean assumes every model's scores share a
        # comparable scale and direction — confirm against get_anomaly_scores().
        ensemble_scores = np.mean(list(anomaly_scores.values()), axis=0)
        fpr, tpr, _ = roc_curve(y_test, ensemble_scores)
        auc = roc_auc_score(y_test, ensemble_scores)
        plt.plot(fpr, tpr, color='black', linewidth=3, linestyle='--',
                 label=f'Ensemble (AUC: {auc:.3f})')
    except Exception as exc:
        print(f"⚠️ Skipping ensemble ROC curve: {exc}")

# Chance diagonal for reference
plt.plot([0, 1], [0, 1], 'k--', alpha=0.5)
plt.title('ROC Curves Comparison')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()

# Prediction confidence distribution
plt.subplot(3, 4, 4)
# The mean of the per-model anomaly scores stands in for a confidence value
if anomaly_scores:
    ensemble_scores = np.mean(list(anomaly_scores.values()), axis=0)
    normal_confidence = ensemble_scores[y_test == 0]
    attack_confidence = ensemble_scores[y_test == 1]

    confidence_hist_kwargs = dict(bins=30, alpha=0.7, density=True)
    plt.hist(normal_confidence, label='Normal', color='green', **confidence_hist_kwargs)
    plt.hist(attack_confidence, label='Attack', color='red', **confidence_hist_kwargs)
    plt.title('Ensemble Confidence Distribution')
    plt.xlabel('Confidence Score')
    plt.ylabel('Density')
    plt.legend()

# Feature importance heatmap
plt.subplot(3, 4, 5)
# Drawn only when the ensemble holds an 'rf' model that exposes importances;
# otherwise this panel is left empty.
rf_model = ensemble_detector.models.get('rf')
if hasattr(rf_model, 'feature_importances_'):
    # reshape to a single row so the heatmap renders as one strip
    importance_matrix = rf_model.feature_importances_.reshape(1, -1)
    sns.heatmap(importance_matrix, cmap='YlOrRd', cbar=True)
    plt.title('Feature Importance Heatmap')
    plt.xlabel('Features')
    plt.ylabel('Importance')

# Model agreement analysis: how often two models agree once each model's
# scores are binarised at that model's own median
plt.subplot(3, 4, 6)
if len(model_names) > 1:
    agreement_scores = []
    for i, model1 in enumerate(model_names):
        # only pairs (i, j) with i < j, so each unordered pair is compared once
        for model2 in model_names[i + 1:]:
            try:
                pred1 = (anomaly_scores[model1] > np.median(anomaly_scores[model1])).astype(int)
                pred2 = (anomaly_scores[model2] > np.median(anomaly_scores[model2])).astype(int)
                agreement_scores.append(np.mean(pred1 == pred2))
            except Exception as exc:
                # Skip the pair instead of inventing a neutral 0.5 value,
                # which would distort the histogram and its mean
                print(f"⚠️ Skipping agreement for '{model1}' vs '{model2}': {exc}")

    if agreement_scores:
        mean_agreement = np.mean(agreement_scores)
        plt.hist(agreement_scores, bins=10, alpha=0.7, color='lightblue')
        plt.title('Model Agreement Distribution')
        plt.xlabel('Agreement Score')
        plt.ylabel('Frequency')
        plt.axvline(mean_agreement, color='red', linestyle='--',
                    label=f'Mean: {mean_agreement:.3f}')
        plt.legend()

# Training loss curves (if available)
plt.subplot(3, 4, 7)
history = None
if 'autoencoder' in training_results and training_results['autoencoder']:
    history = training_results['autoencoder'].get('history')

# The per-epoch metrics are read from the object's own `.history` attribute
loss_history = getattr(history, 'history', None) if history else None

if loss_history and 'loss' in loss_history:
    plt.plot(loss_history['loss'], label='Training Loss')
    if 'val_loss' in loss_history:
        plt.plot(loss_history['val_loss'], label='Validation Loss')
    plt.title('Autoencoder Training History')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
else:
    # Placeholder for every "nothing to plot" case, including an autoencoder
    # entry that carries no usable history
    plt.text(0.5, 0.5, 'Training curves\nnot available',
             horizontalalignment='center', verticalalignment='center',
             transform=plt.gca().transAxes)
    plt.title('Training Curves')

# Error analysis: sample indices for each confusion-matrix cell
plt.subplot(3, 4, 8)
predicted_attack = (ensemble_predictions == 1)
predicted_normal = (ensemble_predictions == 0)
actual_attack = (y_test == 1)
actual_normal = (y_test == 0)

false_positives = np.where(predicted_attack & actual_normal)[0]
false_negatives = np.where(predicted_normal & actual_attack)[0]
true_positives = np.where(predicted_attack & actual_attack)[0]
true_negatives = np.where(predicted_normal & actual_normal)[0]

# Bar order: TN, FP, FN, TP; labels, counts and colours stay aligned by index
error_types = ['True Negatives', 'False Positives', 'False Negatives', 'True Positives']
error_counts = [
    len(indices)
    for indices in (true_negatives, false_positives, false_negatives, true_positives)
]
colors = ['green', 'orange', 'red', 'blue']

plt.bar(error_types, error_counts, color=colors)
plt.title('Prediction Error Analysis')
plt.xlabel('Prediction Type')
plt.ylabel('Count')
plt.xticks(rotation=45)

# Threshold analysis: precision / recall / F1 of the averaged score as the
# decision threshold sweeps 50 evenly spaced values in [0, 1]
# NOTE(review): the sweep assumes ensemble scores lie in [0, 1] — confirm.
plt.subplot(3, 4, 9)
if anomaly_scores:
    thresholds = np.linspace(0, 1, 50)
    ensemble_scores = np.mean(list(anomaly_scores.values()), axis=0)

    precisions, recalls, f1_scores = [], [], []

    for threshold in thresholds:
        pred_thresh = (ensemble_scores > threshold).astype(int)

        tp = np.sum((pred_thresh == 1) & (y_test == 1))
        fp = np.sum((pred_thresh == 1) & (y_test == 0))
        fn = np.sum((pred_thresh == 0) & (y_test == 1))

        # An empty denominator is reported as 0 rather than dividing by zero
        predicted_positive = tp + fp
        actual_positive = tp + fn
        precision = tp / predicted_positive if predicted_positive > 0 else 0
        recall = tp / actual_positive if actual_positive > 0 else 0
        pr_sum = precision + recall
        f1 = 2 * (precision * recall) / pr_sum if pr_sum > 0 else 0

        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)

    for curve, curve_label, curve_color in (
        (precisions, 'Precision', 'blue'),
        (recalls, 'Recall', 'green'),
        (f1_scores, 'F1-Score', 'red'),
    ):
        plt.plot(thresholds, curve, label=curve_label, color=curve_color)
    plt.title('Threshold Analysis')
    plt.xlabel('Threshold')
    plt.ylabel('Score')
    plt.legend()

# Model complexity comparison
plt.subplot(3, 4, 10)
# Hand-assigned relative complexity ranks (not measured)
model_complexity = {
    'isolation_forest': 1,
    'one_class_svm': 2,
    'autoencoder': 4,
    'lstm': 5,
    'pytorch_ae': 4
}

available_models = [m for m in model_complexity if m in model_names]
complexity_scores = [model_complexity[m] for m in available_models]

# model_accuracies is ordered like model_names, which need not match the order
# of available_models, so look each accuracy up by name instead of slicing.
accuracy_by_model = dict(zip(model_names, model_accuracies)) if model_names else {}
performance_scores = [accuracy_by_model[m] for m in available_models]

plt.scatter(complexity_scores, performance_scores, s=100, alpha=0.7)
for i, model in enumerate(available_models):
    plt.annotate(model, (complexity_scores[i], performance_scores[i]),
                 xytext=(5, 5), textcoords='offset points')

plt.title('Model Complexity vs Performance')
plt.xlabel('Complexity Score')
plt.ylabel('Accuracy')

# Detection latency simulation
# These latencies are drawn from normal distributions, not measured.
plt.subplot(3, 4, 11)
latency_sample_count = 100
detection_latencies = {
    'isolation_forest': np.random.normal(0.01, 0.002, latency_sample_count),
    'one_class_svm': np.random.normal(0.05, 0.01, latency_sample_count),
    'autoencoder': np.random.normal(0.02, 0.005, latency_sample_count),
    'ensemble': np.random.normal(0.08, 0.02, latency_sample_count)
}

for model, latencies in detection_latencies.items():
    # 'ensemble' is always drawn; individual models only when they were scored
    if model != 'ensemble' and model not in model_names:
        continue
    plt.hist(latencies, bins=20, alpha=0.7, label=model)

plt.title('Detection Latency Distribution')
plt.xlabel('Latency (seconds)')
plt.ylabel('Frequency')
plt.legend()

# Model weight visualization
plt.subplot(3, 4, 12)
model_weights = ensemble_detector.model_weights
if model_weights:
    # Named weight_* so the `models` module imported from tensorflow.keras at
    # the top of the notebook is not rebound to a list of strings.
    weight_labels = list(model_weights.keys())
    weight_values = list(model_weights.values())

    plt.pie(weight_values, labels=weight_labels, autopct='%1.1f%%')
    plt.title('Model Weight Distribution in Ensemble')

plt.tight_layout()
plt.show()

# Performance summary
# Recomputed here so the summary does not rely on the plotting branch having
# bound ensemble_accuracy.
ensemble_accuracy = np.mean(ensemble_predictions == y_test)

# Guard against a zero elapsed time (coarse timers, tiny test sets)
throughput = len(X_test) / prediction_time if prediction_time > 0 else float('inf')

print(f"\n📊 Deep Learning Models Performance Summary:")
print("=" * 60)
print(f"🎯 Ensemble Accuracy: {ensemble_accuracy:.4f}")
print(f"⚡ Training Time: {training_time:.2f} seconds")
print(f"🚀 Prediction Time: {prediction_time:.2f} seconds ({throughput:.0f} samples/sec)")
print(f"🧠 Available Models: {len(model_names)}")
print(f"💾 Model Memory Usage: ~{X_train.nbytes / 1024**2:.1f} MB (training data)")

# Rates use the standard definitions: FPR = FP / (FP + TN), FNR = FN / (FN + TP).
# Dividing by the total sample count would understate both.
actual_negative_count = len(false_positives) + len(true_negatives)
actual_positive_count = len(false_negatives) + len(true_positives)
false_positive_rate = len(false_positives) / actual_negative_count if actual_negative_count else 0.0
false_negative_rate = len(false_negatives) / actual_positive_count if actual_positive_count else 0.0

print(f"🚨 False Positive Rate: {false_positive_rate:.4f}")
print(f"🎣 False Negative Rate: {false_negative_rate:.4f}")

print("✅ Deep learning model evaluation completed!")

## 7. Create Custom Attack Scenario Simulation

Now we'll create and simulate custom attack scenarios to test our zero-day detection capabilities.

In [None]:
# Set up the attack simulator used throughout this section
print("💥 Initializing Attack Scenario Simulator...")
attack_simulator = AttackSimulator()

# Ask the simulator for a labelled synthetic dataset covering several attack types
print("🔄 Generating synthetic attack scenarios...")
requested_attack_types = ['normal', 'dos', 'ddos', 'port_scan', 'brute_force', 'zero_day']
synthetic_X, synthetic_y = attack_simulator.generate_training_data(
    num_samples=5000,
    attack_types=requested_attack_types,
)

# Label 0 is counted as normal traffic, label 1 as attack traffic
normal_sample_count = np.sum(synthetic_y == 0)
attack_sample_count = np.sum(synthetic_y == 1)
print(f"✅ Generated {len(synthetic_X)} synthetic samples")
print(f"   Normal samples: {normal_sample_count}")
print(f"   Attack samples: {attack_sample_count}")

# Create specific zero-day attack scenarios
# Each entry is passed to simulate_traffic_features() later in this cell. That
# function reads the keys "evasion_techniques", "behavior_mimicry",
# "connection_rate" and "steganography" from `parameters`; no scenario here sets
# "connection_rate", so its default applies. All scenarios share one target_ip.
zero_day_scenarios = [
    # Carries the "evasion_techniques" list
    AttackScenario(
        name="Polymorphic Malware Simulation",
        attack_type="zero_day",
        description="Simulates self-modifying malware with changing signatures",
        target_ip="192.168.1.100",
        duration_seconds=300,
        intensity="medium",
        parameters={
            "mutation_rate": 0.3,
            "evasion_techniques": ["polymorphic_code", "encryption", "packing"]
        }
    ),
    # Carries the "behavior_mimicry" flag
    AttackScenario(
        name="AI-Powered Attack Simulation",
        attack_type="zero_day",
        description="Simulates AI-driven adaptive attack patterns",
        target_ip="192.168.1.100", 
        duration_seconds=600,
        intensity="high",
        parameters={
            "learning_rate": 0.1,
            "adaptation_threshold": 0.8,
            "behavior_mimicry": True
        }
    ),
    # Only scenario with "steganography" set
    AttackScenario(
        name="Quantum-Resistant Cryptographic Attack",
        attack_type="zero_day",
        description="Simulates future quantum-resistant attack patterns",
        target_ip="192.168.1.100",
        duration_seconds=900,
        intensity="low",
        parameters={
            "quantum_resilience": True,
            "post_quantum_crypto": "lattice_based",
            "steganography": True
        }
    ),
    # Sets none of the keys the traffic simulator branches on
    AttackScenario(
        name="IoT Botnet Orchestration",
        attack_type="zero_day",
        description="Simulates coordinated IoT device compromise",
        target_ip="192.168.1.100",
        duration_seconds=1800,
        intensity="high",
        parameters={
            "device_diversity": ["cameras", "routers", "smart_speakers"],
            "coordination_protocol": "mesh_network",
            "payload_distribution": "peer_to_peer"
        }
    )
]

# List the scenarios, numbered from 1
print(f"\n🎯 Created {len(zero_day_scenarios)} zero-day attack scenarios:")
for i, scenario in enumerate(zero_day_scenarios, 1):
    print(f"  {i}. {scenario.name}")
    print(f"     Duration: {scenario.duration_seconds}s, Intensity: {scenario.intensity}")

# Simulate attack scenarios and generate network traffic patterns
print("\n🔬 Simulating attack traffic patterns...")

def simulate_traffic_features(scenario, num_samples=100):
    """Generate random traffic feature vectors for one attack scenario.

    Args:
        scenario: Object exposing ``name``, ``attack_type``,
            ``duration_seconds`` and a ``parameters`` dict. The optional
            parameter keys read here are ``evasion_techniques``,
            ``behavior_mimicry``, ``connection_rate`` and ``steganography``.
        num_samples: Number of feature vectors (rows) to generate.

    Returns:
        A float array of shape ``(num_samples, 10)`` whose columns are:
        packet size, inter-arrival time, payload entropy, port, protocol,
        connection attempts, data volume, steganography score,
        duration / 1000 and a per-scenario signature in ``[0, 1)``.
    """
    import zlib  # local import keeps this notebook cell self-contained

    n_features = 10
    params = scenario.parameters
    is_zero_day = scenario.attack_type == "zero_day"

    # Substring match per technique: a plain `"polymorphic" in list` tests list
    # membership and would never match an entry such as "polymorphic_code".
    is_polymorphic = any(
        "polymorphic" in str(technique)
        for technique in params.get("evasion_techniques", [])
    )
    # Use the flag's value, so an explicit False does not enable mimicry.
    mimics_behavior = bool(params.get("behavior_mimicry"))

    # Per-scenario constants, hoisted out of the sampling loop
    connection_rate = params.get("connection_rate", 5)
    steganography_score = 1.0 if params.get("steganography") else 0.0
    normalized_duration = scenario.duration_seconds / 1000
    # CRC32 rather than hash(): str hashes are salted per interpreter process,
    # which would make this feature differ from one run to the next.
    scenario_signature = zlib.crc32(scenario.name.encode("utf-8")) % 1000 / 1000.0

    features = []
    for _ in range(num_samples):
        if is_zero_day and is_polymorphic:
            # Highly variable packet sizes and timing, high payload entropy
            packet_size = np.random.lognormal(7, 1.5)
            inter_arrival = np.random.weibull(2) * 0.1
            payload_entropy = np.random.beta(8, 2)
        elif is_zero_day and mimics_behavior:
            # Mimics normal traffic; lower entropy to blend in
            packet_size = np.random.normal(800, 200)
            inter_arrival = np.random.exponential(0.5)
            payload_entropy = np.random.beta(3, 7)
        elif is_zero_day:
            # Generic zero-day pattern
            packet_size = np.random.gamma(3, 200)
            inter_arrival = np.random.pareto(1.5) * 0.01
            payload_entropy = np.random.uniform(0.3, 0.9)
        else:
            # Standard attack patterns
            packet_size = np.random.normal(512, 100)
            inter_arrival = np.random.exponential(0.01)
            payload_entropy = np.random.beta(2, 8)

        # Four fixed ports plus one randomly drawn port as candidates
        port = np.random.choice([80, 443, 22, 8080, np.random.randint(1024, 65535)])
        protocol = np.random.choice([6, 17, 1], p=[0.7, 0.25, 0.05])  # TCP, UDP, ICMP

        # Behavioral features
        connection_attempts = np.random.poisson(connection_rate)
        data_volume = packet_size * np.random.poisson(10)

        features.append([
            packet_size,
            inter_arrival,
            payload_entropy,
            port,
            protocol,
            connection_attempts,
            data_volume,
            steganography_score,
            normalized_duration,
            scenario_signature,
        ])

    # reshape keeps the result 2-D even when num_samples is 0
    return np.array(features, dtype=float).reshape(-1, n_features)

# Simulate 200 traffic samples per scenario, keyed by scenario name
scenario_data = {}
for scenario in zero_day_scenarios:
    print(f"  Simulating: {scenario.name}...")
    traffic_features = simulate_traffic_features(scenario, num_samples=200)
    scenario_data[scenario.name] = dict(
        features=traffic_features,
        scenario=scenario,
        labels=np.ones(len(traffic_features)),  # every simulated row is an attack
    )

print("✅ Attack traffic simulation completed!")

# Combine the normal baseline and every scenario into one labelled dataset
# NOTE(review): stacking assumes the baseline rows have as many features as the
# scenario rows — confirm against attack_simulator.generate_training_data().

# Normal traffic baseline (label 0, identifier 'normal')
normal_baseline = attack_simulator.generate_training_data(1000, ['normal'])[0]
baseline_count = len(normal_baseline)
all_synthetic_features = list(normal_baseline)
all_synthetic_labels = [0] * baseline_count
scenario_identifiers = ['normal'] * baseline_count

# Scenario-specific traffic, appended in scenario_data order
for scenario_name, data in scenario_data.items():
    all_synthetic_features.extend(data['features'])
    all_synthetic_labels.extend(data['labels'])
    scenario_identifiers.extend([scenario_name] * len(data['features']))

synthetic_features_matrix = np.array(all_synthetic_features)
synthetic_labels_array = np.array(all_synthetic_labels)

print("\n📊 Combined synthetic dataset:")
print(f"   Total samples: {len(synthetic_features_matrix)}")
print(f"   Features per sample: {synthetic_features_matrix.shape[1]}")
print(f"   Normal traffic: {np.sum(synthetic_labels_array == 0)}")
print(f"   Attack traffic: {np.sum(synthetic_labels_array == 1)}")

# Visualize synthetic attack patterns
plt.figure(figsize=(18, 12))

# Panel 1: packet-size histograms, one per scenario plus the normal baseline
plt.subplot(3, 4, 1)
# One colour per scenario; the extra last colour is used for normal traffic
colors = plt.cm.tab10(np.linspace(0, 1, len(scenario_data) + 1))
packet_hist_kwargs = dict(bins=30, alpha=0.6, density=True)
for i, (scenario_name, data) in enumerate(scenario_data.items()):
    packet_sizes = data['features'][:, 0]  # column 0 holds the packet size
    plt.hist(packet_sizes, label=scenario_name[:15], color=colors[i], **packet_hist_kwargs)

# Normal traffic for comparison
normal_packet_sizes = normal_baseline[:, 0]
plt.hist(normal_packet_sizes, label='Normal', color=colors[-1], **packet_hist_kwargs)
plt.title('Packet Size Distribution by Scenario')
plt.xlabel('Packet Size')
plt.ylabel('Density')
plt.legend()

# Panel 2: payload-entropy histograms per scenario and for normal traffic
plt.subplot(3, 4, 2)
entropy_hist_kwargs = dict(bins=20, alpha=0.6, density=True)
for i, (scenario_name, data) in enumerate(scenario_data.items()):
    entropy_values = data['features'][:, 2]  # column 2 holds the payload entropy
    plt.hist(entropy_values, label=scenario_name[:15], color=colors[i], **entropy_hist_kwargs)

normal_entropy = normal_baseline[:, 2]
plt.hist(normal_entropy, label='Normal', color=colors[-1], **entropy_hist_kwargs)
plt.title('Payload Entropy Distribution')
plt.xlabel('Entropy Score')
plt.ylabel('Density')
plt.legend()

# Panel 3: simulated hourly attack intensity over one day
plt.subplot(3, 4, 3)
timeline = np.arange(24)  # 24 hours
intensity_levels = {'low': 1, 'medium': 3, 'high': 5}
# Sinusoidal day/night modulation shared by every scenario
daily_cycle = 1 + 0.5 * np.sin(timeline * np.pi / 12)
for i, scenario in enumerate(zero_day_scenarios):
    base_intensity = intensity_levels[scenario.intensity]
    hourly_intensity = base_intensity * daily_cycle

    # Random Poisson spikes scaled by the scenario's base intensity
    random_spikes = np.random.poisson(1, 24) * base_intensity
    total_intensity = hourly_intensity + random_spikes

    plt.plot(timeline, total_intensity, label=scenario.name[:15],
             color=colors[i], linewidth=2)

plt.title('Simulated Attack Intensity Timeline')
plt.xlabel('Hour of Day')
plt.ylabel('Attack Intensity')
plt.legend()

# Panel 4: spread of targeted ports per scenario
plt.subplot(3, 4, 4)
# Flatten every scenario's port column (column 3) into one long list, with a
# parallel list naming the scenario each port came from
port_data = [
    port
    for data in scenario_data.values()
    for port in data['features'][:, 3]
]
scenario_names = [
    scenario_name
    for scenario_name, data in scenario_data.items()
    for _unused in range(len(data['features']))
]

port_df = pd.DataFrame({'port': port_data, 'scenario': scenario_names})
# The std is computed through a lambda on purpose; passing np.std directly may
# be handled differently by pandas — verify before changing.
port_summary = (
    port_df.groupby('scenario')['port']
    .apply(lambda x: np.std(x))
    .sort_values(ascending=False)
)

port_summary.plot(kind='bar', color='lightcoral')
plt.title('Port Usage Variability by Scenario')
plt.xlabel('Attack Scenario')
plt.ylabel('Port Standard Deviation')
plt.xticks(rotation=45)

# PCA visualization of synthetic attacks
plt.subplot(3, 4, 5)
if synthetic_features_matrix.shape[1] > 2:
    pca = PCA(n_components=2)
    synthetic_pca = pca.fit_transform(synthetic_features_matrix)

    identifiers = np.asarray(scenario_identifiers)
    # dict.fromkeys de-duplicates while keeping first-seen order; iterating a
    # set of strings would give a different order (and colours) on each run.
    unique_scenarios = list(dict.fromkeys(scenario_identifiers))

    # Same colour assignment as the histogram panels: scenario i gets
    # colors[i], normal traffic gets the last colour.
    scenario_colors = {name: colors[i] for i, name in enumerate(scenario_data)}
    scenario_colors['normal'] = colors[-1]

    for i, scenario in enumerate(unique_scenarios):
        mask = identifiers == scenario  # boolean row selector for this group
        color = scenario_colors.get(scenario, colors[i % len(colors)])
        label = scenario[:15] if scenario != 'normal' else 'Normal'
        plt.scatter(synthetic_pca[mask, 0], synthetic_pca[mask, 1],
                    c=[color], alpha=0.6, s=20, label=label)

    plt.title('PCA: Synthetic Attack Scenarios')
    plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2f})')
    plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2f})')
    plt.legend()

# Panel 6: heuristic complexity score per scenario
plt.subplot(3, 4, 6)
intensity_weights = {'low': 1, 'medium': 2, 'high': 3}
# complexity = parameter count x duration in hours x intensity weight
complexity_scores = [
    len(s.parameters) * (s.duration_seconds / 3600) * intensity_weights[s.intensity]
    for s in zero_day_scenarios
]
scenario_labels = [s.name[:15] for s in zero_day_scenarios]

bar_positions = range(len(complexity_scores))
plt.barh(bar_positions, complexity_scores, color='skyblue')
plt.yticks(bar_positions, scenario_labels)
plt.title('Attack Scenario Complexity')
plt.xlabel('Complexity Score')

# Panel 7: evasion technique effectiveness
# The scores are random Beta(7, 3) draws (skewed toward high values), not
# measurements.
plt.subplot(3, 4, 7)
evasion_techniques = ['polymorphic_code', 'encryption', 'behavior_mimicry', 'steganography']
technique_count = len(evasion_techniques)
effectiveness_scores = np.random.beta(7, 3, technique_count)

plt.bar(evasion_techniques, effectiveness_scores, color='orange', alpha=0.7)
plt.title('Evasion Technique Effectiveness')
plt.xlabel('Technique')
plt.ylabel('Effectiveness Score')
plt.xticks(rotation=45)

# Panel 8: simulated drift of an attack signature over time
plt.subplot(3, 4, 8)
time_steps = np.arange(100)
signatures = {}
step_count = len(time_steps)
for i, scenario in enumerate(zero_day_scenarios[:3]):  # first three scenarios only
    # Noisy sine wave as the starting signature
    signature_noise = np.random.normal(0, 0.1, step_count)
    base_signature = np.sin(time_steps * 0.1) + signature_noise
    # Scale it by a random walk whose size follows the scenario's mutation rate
    mutation_rate = scenario.parameters.get("mutation_rate", 0.1)
    random_walk = np.cumsum(np.random.randn(step_count) * 0.01)
    evolved_signature = base_signature * (1 + mutation_rate * random_walk)

    plt.plot(time_steps, evolved_signature, label=scenario.name[:15], color=colors[i])

plt.title('Attack Signature Evolution')
plt.xlabel('Time Steps')
plt.ylabel('Signature Strength')
plt.legend()

# Panel 9: detection difficulty heatmap
# The matrix holds uniform random values, one row per scenario and one column
# per detection method; it is illustrative, not measured.
plt.subplot(3, 4, 9)
detection_matrix = np.random.rand(len(zero_day_scenarios), 5)  # 5 detection methods
detection_methods = ['Signature', 'Anomaly', 'Behavioral', 'ML', 'Ensemble']
scenario_names_short = [s.name[:15] for s in zero_day_scenarios]

sns.heatmap(
    detection_matrix,
    xticklabels=detection_methods,
    yticklabels=scenario_names_short,
    annot=True,
    cmap='RdYlGn_r',
    fmt='.2f',
)
plt.title('Detection Difficulty Matrix')
plt.xlabel('Detection Method')
plt.ylabel('Attack Scenario')

# Panel 10: grouped bars of random impact scores per scenario and category
plt.subplot(3, 4, 10)
impact_categories = ['Availability', 'Confidentiality', 'Integrity']
impact_scores = np.random.rand(len(zero_day_scenarios), len(impact_categories))

scenario_names_short = [s.name[:10] for s in zero_day_scenarios]
x = np.arange(len(scenario_names_short))
width = 0.25

# Shift each category's bars by one bar width so they sit side by side
for i, category in enumerate(impact_categories):
    plt.bar(x + width * i, impact_scores[:, i], width,
            label=category, alpha=0.8)

plt.title('Simulated Network Impact')
plt.xlabel('Attack Scenario')
plt.ylabel('Impact Score')
# Ticks go under the middle bar of each three-bar group
plt.xticks(x + width, scenario_names_short, rotation=45)
plt.legend()

# Panel 11: exponential decay of attack success probability
plt.subplot(3, 4, 11)
time_horizon = np.arange(30)  # 30 days
for i, scenario in enumerate(zero_day_scenarios[:3]):
    # Every curve starts at 0.8 and decays at a random rate in [0.05, 0.10)
    initial_success = 0.8
    decay_rate = 0.05 + np.random.uniform(0, 0.05)
    decay_curve = np.exp(-decay_rate * time_horizon)
    success_prob = initial_success * decay_curve

    plt.plot(time_horizon, success_prob, label=scenario.name[:15],
             color=colors[i], linewidth=2)

plt.title('Attack Success Probability Over Time')
plt.xlabel('Days Since Discovery')
plt.ylabel('Success Probability')
plt.legend()

# Panel 12: zero-day lifecycle drawn as one stacked horizontal bar
plt.subplot(3, 4, 12)
lifecycle_stages = ['Development', 'Deployment', 'Detection', 'Mitigation', 'Patch']
stage_durations = [30, 1, 5, 10, 15]  # Days
# Leading 0 makes entry i the start day of stage i; the last entry is the total
cumulative_durations = np.cumsum([0] + stage_durations)

colors_lifecycle = ['red', 'orange', 'yellow', 'lightgreen', 'green']
# zip stops at the five stages, so the trailing total is never used as a start
for stage, duration, stage_start, stage_color in zip(
    lifecycle_stages, stage_durations, cumulative_durations, colors_lifecycle
):
    plt.barh(0, duration, left=stage_start,
             color=stage_color, alpha=0.8, height=0.5)

    # Stage name centred on its segment
    plt.text(stage_start + duration/2, 0, stage,
             ha='center', va='center', fontsize=8)

plt.title('Zero-Day Attack Lifecycle')
plt.xlabel('Days')
plt.yticks([])
plt.xlim(0, cumulative_durations[-1])

plt.tight_layout()
plt.show()

# Closing summary for the simulation section
scenario_count = len(zero_day_scenarios)
sample_count = len(synthetic_features_matrix)

print("✅ Attack scenario simulation and analysis completed!")
print("💡 Key insights:")
print(f"   - Generated {scenario_count} unique zero-day scenarios")
print(f"   - Simulated {sample_count} traffic samples")
print("   - Each scenario shows distinct network patterns")
print("   - Evasion techniques create detection challenges")
print("   - Attack signatures evolve over time")