# Volume 2, Chapter 25: Anomaly Detection with AI

**Build production-ready anomaly detection for network operations**

From: AI for Networking Engineers - Volume 2, Chapter 25

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/eduardd76/AI_for_networking_and_security_engineers/blob/main/CODE/Colab-Notebooks/Vol2_Ch25_Anomaly_Detection.ipynb)

## Overview

This notebook demonstrates:
1. **Statistical Anomaly Detection** - Z-score and IQR methods for quick anomaly checks
2. **ML-Based Detection** - Isolation Forest for unsupervised anomaly detection
3. **Time-Series Detection** - Prophet for seasonality-aware anomaly detection
4. **LLM Explanation** - Use Claude to explain what anomalies mean

**Real Impact**: Reduce false positive alerts by 90%, detect attacks in seconds instead of days.

## Setup

In [None]:
# Install dependencies
!pip install -q anthropic langchain-anthropic scikit-learn prophet pandas numpy

In [None]:
import warnings
from dataclasses import dataclass
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Tuple

import numpy as np
import pandas as pd

warnings.filterwarnings('ignore')

In [None]:
# Set API key
import os

# The Colab secret store only exists inside Google Colab. Importing it
# unconditionally makes the whole notebook fail elsewhere (local Jupyter, CI),
# so fall back to an ANTHROPIC_API_KEY that is already exported.
try:
    from google.colab import userdata
except ImportError:
    userdata = None

if userdata is not None:
    # Get API key from Colab secrets
    os.environ['ANTHROPIC_API_KEY'] = userdata.get('ANTHROPIC_API_KEY')
elif not os.environ.get('ANTHROPIC_API_KEY'):
    # Only the LLM explanation example needs the key, so warn instead of
    # raising: the detection examples can still run without it.
    print("ANTHROPIC_API_KEY is not set; export it before running the "
          "LLM explanation example.")

## Example 1: Statistical Anomaly Detection (Z-Score)

Fast, simple method for detecting outliers in bandwidth/latency data.

**When to use**: Quick checks, real-time monitoring, simple patterns.

In [None]:
@dataclass
class Anomaly:
    """Detected anomaly.

    Attributes:
        timestamp: Time associated with the flagged observation.
        metric: Name of the metric the value belongs to.
        value: The observed value that was flagged.
        expected_range: ``(lower, upper)`` bounds the value was expected
            to stay within.
        severity: One of ``"medium"``, ``"high"`` or ``"critical"``.
        method: Name of the detection method that produced the anomaly.
    """
    timestamp: datetime
    metric: str
    value: float
    expected_range: Tuple[float, float]
    severity: str
    method: str

    def __str__(self):
        return (f"Anomaly: {self.metric}={self.value:.2f} "
                f"(expected {self.expected_range[0]:.2f}-{self.expected_range[1]:.2f}) "
                f"at {self.timestamp} [{self.severity}]")


class StatisticalAnomalyDetector:
    """Statistical anomaly detection using Z-score and IQR methods.

    Both methods learn their limits from a baseline sample (``data``) and
    then score a separate batch of observations (``values``) against it.

    Args:
        z_threshold: Number of baseline standard deviations a value must
            exceed to be flagged by :meth:`detect_zscore`.
        iqr_multiplier: Number of interquartile ranges beyond the quartiles
            a value must lie to be flagged by :meth:`detect_iqr`.
    """

    def __init__(self, z_threshold: float = 3.0, iqr_multiplier: float = 1.5):
        self.z_threshold = z_threshold
        self.iqr_multiplier = iqr_multiplier

    def detect_zscore(self, data: np.ndarray, values: np.ndarray,
                      metric: str = "bandwidth",
                      timestamps: Optional[List[datetime]] = None) -> List[Anomaly]:
        """
        Detect anomalies using Z-score method.
        Z-score = (value - mean) / std_dev
        Anomaly if |Z-score| > threshold

        Args:
            data: Baseline sample used to compute the mean and std_dev.
            values: Observations to score against the baseline.
            metric: Metric name recorded on each anomaly.
            timestamps: Optional timestamps, one per entry of ``values``.
                When omitted, every anomaly is stamped with the time of
                this call.

        Returns:
            One ``Anomaly`` per flagged value, in input order. Empty when
            the baseline is empty, constant, or not finite.

        Raises:
            ValueError: If ``timestamps`` is given but its length differs
                from ``values``.
        """
        baseline = np.asarray(data, dtype=float).ravel()
        observed = np.asarray(values, dtype=float).ravel()
        self._check_timestamps(timestamps, observed.size)
        if baseline.size == 0 or observed.size == 0:
            return []

        mean = baseline.mean()
        std = baseline.std()
        if std == 0 or not np.isfinite(std):
            return []  # No variation in data (or a NaN/inf baseline)

        lower = float(mean - (self.z_threshold * std))
        upper = float(mean + (self.z_threshold * std))
        z_scores = np.abs((observed - mean) / std)
        flagged = np.flatnonzero(z_scores > self.z_threshold)
        return self._build_anomalies(
            observed, flagged, z_scores, self.z_threshold,
            (lower, upper), metric, timestamps, "z-score",
        )

    def detect_iqr(self, data: np.ndarray, values: np.ndarray,
                   metric: str = "bandwidth",
                   timestamps: Optional[List[datetime]] = None) -> List[Anomaly]:
        """
        Detect anomalies using the IQR (interquartile range) method.
        IQR = Q3 - Q1
        Anomaly if value < Q1 - k * IQR or value > Q3 + k * IQR,
        where k is ``iqr_multiplier``.

        Args:
            data: Baseline sample used to compute the quartiles.
            values: Observations to score against the baseline.
            metric: Metric name recorded on each anomaly.
            timestamps: Optional timestamps, one per entry of ``values``.
                When omitted, every anomaly is stamped with the time of
                this call.

        Returns:
            One ``Anomaly`` per flagged value, in input order. Empty when
            the baseline is empty, has zero IQR, or is not finite.

        Raises:
            ValueError: If ``timestamps`` is given but its length differs
                from ``values``.
        """
        baseline = np.asarray(data, dtype=float).ravel()
        observed = np.asarray(values, dtype=float).ravel()
        self._check_timestamps(timestamps, observed.size)
        if baseline.size == 0 or observed.size == 0:
            return []

        q1, q3 = np.percentile(baseline, [25, 75])
        iqr = q3 - q1
        if iqr == 0 or not np.isfinite(iqr):
            return []  # No spread in the baseline (or a NaN/inf baseline)

        lower = float(q1 - (self.iqr_multiplier * iqr))
        upper = float(q3 + (self.iqr_multiplier * iqr))
        # Distance beyond the nearest quartile in IQR units; values lying
        # between the quartiles come out negative and are never flagged.
        spread = np.maximum(q1 - observed, observed - q3) / iqr
        flagged = np.flatnonzero(spread > self.iqr_multiplier)
        return self._build_anomalies(
            observed, flagged, spread, self.iqr_multiplier,
            (lower, upper), metric, timestamps, "iqr",
        )

    @staticmethod
    def _check_timestamps(timestamps, count: int) -> None:
        """Reject a timestamp sequence that does not pair 1:1 with the values."""
        if timestamps is not None and len(timestamps) != count:
            raise ValueError(
                f"timestamps has {len(timestamps)} entries but values has {count}."
            )

    @staticmethod
    def _build_anomalies(observed, flagged, scores, threshold, bounds,
                         metric, timestamps, method) -> List[Anomaly]:
        """Turn flagged indices into ``Anomaly`` records with a severity label."""
        lower, upper = bounds
        # The reported lower bound is floored at zero, as in the original
        # bandwidth-oriented implementation; detection itself is unaffected.
        expected_range = (max(0.0, lower), upper)
        detected_at = datetime.now()

        anomalies = []
        for idx in flagged:
            score = scores[idx]
            # Severity grows with how far past the threshold the score lies.
            if score > threshold * 2:
                severity = "critical"
            elif score > threshold * 1.5:
                severity = "high"
            else:
                severity = "medium"

            anomalies.append(Anomaly(
                timestamp=timestamps[idx] if timestamps is not None else detected_at,
                metric=metric,
                value=float(observed[idx]),  # plain float, not a numpy scalar
                expected_range=expected_range,
                severity=severity,
                method=method,
            ))

        return anomalies

# Test with synthetic network data
# Fixed seed so the synthetic baseline (and therefore the output) is reproducible
np.random.seed(42)

# Normal traffic: 100 Mbps ± 20 Mbps
# 1000 draws from a normal distribution (mean 100, std 20) form the baseline
normal_traffic = np.random.normal(100, 20, 1000)

# Test data with anomalies
test_data = np.array([
    95, 105, 98, 102, 110,  # Normal
    250,  # Anomaly: spike (DDoS?)
    105, 98, 95, 102,  # Normal
    15,   # Anomaly: drop (link down?)
    100, 105, 98  # Normal
])

# Flag points whose |z-score| against the baseline exceeds 3
detector = StatisticalAnomalyDetector(z_threshold=3.0)

print("=== Z-Score Detection ===")
# First argument is the baseline sample, second the points to score
anomalies = detector.detect_zscore(normal_traffic, test_data)
for anomaly in anomalies:
    print(anomaly)

# Summary: count and share of test points that were flagged
print(f"\nDetected {len(anomalies)} anomalies out of {len(test_data)} data points")
print(f"Anomaly rate: {len(anomalies)/len(test_data)*100:.1f}%")

## Example 2: ML-Based Anomaly Detection (Isolation Forest)

Unsupervised ML that detects complex patterns without labeled data.

**When to use**: Multi-dimensional data (bandwidth + latency + CPU + memory), complex patterns.

In [None]:
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

@dataclass
class MLAnomaly:
    """ML-detected anomaly with context.

    Attributes:
        timestamp: Time associated with the flagged sample.
        features: Raw (unscaled) feature values of the sample, by name.
        anomaly_score: Isolation Forest ``score_samples`` value.
        severity: One of ``"medium"``, ``"high"`` or ``"critical"``.
        method: Name of the detection method that produced the anomaly.
    """
    timestamp: datetime
    features: Dict[str, float]
    anomaly_score: float
    severity: str
    method: str


class MLAnomalyDetector:
    """ML-based anomaly detection using Isolation Forest.

    Args:
        contamination: Passed straight to ``IsolationForest`` as its
            ``contamination`` setting.
    """

    def __init__(self, contamination: float = 0.1):
        self.contamination = contamination
        self.scaler = StandardScaler()
        self.isolation_forest = None
        self.feature_names = None

    def train_isolation_forest(self, X: np.ndarray, feature_names: Optional[List[str]] = None):
        """Train Isolation Forest model.

        Args:
            X: Training matrix of shape (samples, features).
            feature_names: Optional names, one per column of ``X``.
                Defaults to ``feature_0``, ``feature_1``, ...

        Raises:
            ValueError: If ``X`` is not 2-D, or ``feature_names`` does not
                have exactly one entry per column of ``X``.
        """
        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError(
                f"X must be 2-D (samples x features), got {X.ndim}-D input."
            )

        if feature_names is None or len(feature_names) == 0:
            names = [f"feature_{i}" for i in range(X.shape[1])]
        else:
            names = list(feature_names)
            # A mismatch used to surface later as an IndexError, or as
            # silently missing features in the reported anomalies.
            if len(names) != X.shape[1]:
                raise ValueError(
                    f"feature_names has {len(names)} entries but X has "
                    f"{X.shape[1]} columns."
                )

        X_scaled = self.scaler.fit_transform(X)

        forest = IsolationForest(
            contamination=self.contamination,
            random_state=42,
            n_estimators=100
        )
        forest.fit(X_scaled)

        # Publish the state only after a successful fit so a failure cannot
        # leave the detector half-trained.
        self.feature_names = names
        self.isolation_forest = forest

    def detect_isolation_forest(self, X: np.ndarray, timestamps: Optional[List[datetime]] = None) -> List[MLAnomaly]:
        """Detect anomalies using trained Isolation Forest.

        Args:
            X: Samples to score, shape (samples, features), with the same
                column layout used for training.
            timestamps: Optional timestamps, one per row of ``X``. When
                omitted (or empty), every anomaly is stamped with the time
                of this call.

        Returns:
            One ``MLAnomaly`` per sample the model predicts as an outlier,
            in input order.

        Raises:
            ValueError: If the model has not been trained, ``X`` does not
                match the trained feature layout, or ``timestamps`` does
                not have one entry per row of ``X``.
        """
        if self.isolation_forest is None:
            raise ValueError("Model not trained. Call train_isolation_forest first.")

        X = np.asarray(X, dtype=float)
        if X.ndim != 2 or X.shape[1] != len(self.feature_names):
            raise ValueError(
                f"X must have shape (samples, {len(self.feature_names)}), "
                f"got {X.shape}."
            )

        # `if timestamps:` raises for numpy arrays and pandas indexes, so
        # test for None / emptiness explicitly.
        has_timestamps = timestamps is not None and len(timestamps) > 0
        if has_timestamps and len(timestamps) != X.shape[0]:
            raise ValueError(
                f"timestamps has {len(timestamps)} entries but X has "
                f"{X.shape[0]} rows."
            )

        X_scaled = self.scaler.transform(X)
        predictions = self.isolation_forest.predict(X_scaled)
        scores = self.isolation_forest.score_samples(X_scaled)

        detected_at = datetime.now()
        anomalies = []
        for idx in np.flatnonzero(predictions == -1):  # -1 marks an anomaly
            score = float(scores[idx])
            # NOTE(review): these cut-offs are applied to raw score_samples
            # output (lower = more abnormal); confirm they are calibrated
            # for that scale rather than for decision_function.
            if score < -0.5:
                severity = "critical"
            elif score < -0.3:
                severity = "high"
            else:
                severity = "medium"

            # Report the original, unscaled values so they read naturally.
            features = {
                name: float(X[idx, i])
                for i, name in enumerate(self.feature_names)
            }

            anomalies.append(MLAnomaly(
                timestamp=timestamps[idx] if has_timestamps else detected_at,
                features=features,
                anomaly_score=score,
                severity=severity,
                method="isolation-forest"
            ))

        return anomalies

# Simulate network metrics (multi-dimensional)
# Fixed seed so the synthetic training set (and the fitted model) is reproducible
np.random.seed(42)
n_samples = 1000

# Normal behavior
# One column per metric; each column is an independent normal distribution
normal_data = np.column_stack([
    np.random.normal(100, 15, n_samples),      # Bandwidth (Mbps)
    np.random.normal(5, 1, n_samples),         # Latency (ms)
    np.random.normal(0.01, 0.005, n_samples),  # Packet loss (%)
    np.random.normal(50, 10, n_samples),       # CPU (%)
    np.random.normal(60, 5, n_samples)         # Memory (%)
])

# Test data with anomalies
# Columns follow the same order as normal_data / feature_names
test_data = np.array([
    [105, 5.2, 0.012, 52, 61],  # Normal
    [98, 4.8, 0.008, 48, 59],   # Normal
    [350, 45, 8.5, 95, 92],     # Anomaly: DDoS attack
    [102, 5.1, 0.011, 51, 60],  # Normal
    [15, 2.1, 0.005, 10, 25],   # Anomaly: Device reboot
    [100, 5.0, 0.010, 50, 61],  # Normal
])

# Names reported alongside each anomaly's feature values
feature_names = ['bandwidth_mbps', 'latency_ms', 'packet_loss_pct', 'cpu_pct', 'memory_pct']

# Train and detect
detector = MLAnomalyDetector(contamination=0.1)
detector.train_isolation_forest(normal_data, feature_names)

print("=== Isolation Forest Detection ===")
# No timestamps are passed, so the detector supplies its own
anomalies_if = detector.detect_isolation_forest(test_data)

# Print severity, score, and the raw feature values of each flagged sample
for anomaly in anomalies_if:
    print(f"\n{anomaly.severity.upper()} Anomaly (score: {anomaly.anomaly_score:.3f})")
    print(f"Features:")
    for k, v in anomaly.features.items():
        print(f"  {k}: {v:.2f}")

print(f"\nDetected {len(anomalies_if)} anomalies in {len(test_data)} samples")

## Example 3: Time-Series Anomaly Detection (Prophet)

Handles daily/weekly/seasonal patterns automatically.

**When to use**: Time-series data with recurring patterns (e.g., traffic that peaks during business hours and drops on weekends).

In [None]:
from prophet import Prophet

@dataclass
class TimeSeriesAnomaly:
    """Time-series anomaly with forecast context.

    Attributes:
        timestamp: Timestamp of the flagged observation.
        actual_value: The observed value.
        predicted_value: Forecast point estimate (``yhat``).
        lower_bound: Lower edge of the prediction interval (``yhat_lower``).
        upper_bound: Upper edge of the prediction interval (``yhat_upper``).
        deviation_pct: Distance outside the violated bound, as a percentage
            of that bound's magnitude.
        severity: One of ``"medium"``, ``"high"`` or ``"critical"``.
    """
    timestamp: datetime
    actual_value: float
    predicted_value: float
    lower_bound: float
    upper_bound: float
    deviation_pct: float
    severity: str


class ProphetAnomalyDetector:
    """Time-series anomaly detection using Facebook Prophet.

    Args:
        interval_width: Width of Prophet's prediction interval; values
            falling outside that interval are reported as anomalies.
    """

    def __init__(self, interval_width: float = 0.95):
        self.interval_width = interval_width
        self.model = self._build_model()
        self.trained = False

    def _build_model(self):
        """Create an unfitted Prophet model with this detector's settings."""
        return Prophet(
            interval_width=self.interval_width,
            daily_seasonality=True,
            weekly_seasonality=True,
            yearly_seasonality=False  # Not enough data for yearly
        )

    def train(self, timestamps: List[datetime], values: List[float]):
        """Train Prophet model on historical data.

        Args:
            timestamps: Observation times (Prophet's ``ds`` column).
            values: Observed values (Prophet's ``y`` column), one per
                timestamp.
        """
        if self.trained:
            # A Prophet instance can only be fitted once, so retraining
            # needs a fresh model. The first call keeps self.model as-is
            # so any customisation made before training is preserved.
            self.model = self._build_model()
            self.trained = False

        df = pd.DataFrame({'ds': timestamps, 'y': values})
        self.model.fit(df)
        self.trained = True

    def detect_anomalies(self, timestamps: List[datetime], values: List[float]) -> List[TimeSeriesAnomaly]:
        """Detect anomalies in time-series data.

        Args:
            timestamps: Observation times to evaluate.
            values: Observed values, one per timestamp.

        Returns:
            One ``TimeSeriesAnomaly`` per observation outside the
            prediction interval, in chronological order.

        Raises:
            ValueError: If the model has not been trained, or
                ``timestamps`` and ``values`` differ in length.
        """
        if not self.trained:
            raise ValueError("Model not trained. Call train() first.")

        stamps = list(timestamps)
        observed = list(values)
        if len(stamps) != len(observed):
            # zip() would silently drop the surplus entries.
            raise ValueError(
                f"timestamps has {len(stamps)} entries but values has "
                f"{len(observed)}."
            )
        if not stamps:
            return []

        # Prophet returns its forecast ordered by `ds`, so sort the input
        # the same way (stable) to keep forecast rows aligned with values.
        order = np.argsort(pd.to_datetime(stamps).to_numpy(), kind="stable")
        ordered_stamps = [stamps[i] for i in order]

        forecast = self.model.predict(pd.DataFrame({'ds': ordered_stamps}))
        # Pull the columns out once instead of per-row .iloc lookups.
        predicted = forecast['yhat'].to_numpy()
        lower_bounds = forecast['yhat_lower'].to_numpy()
        upper_bounds = forecast['yhat_upper'].to_numpy()

        anomalies = []
        for row, pos in enumerate(order):
            actual = observed[pos]
            lower = float(lower_bounds[row])
            upper = float(upper_bounds[row])

            # Check if outside prediction interval
            if actual > upper:
                deviation_pct = self._deviation_pct(actual, upper)
            elif actual < lower:
                deviation_pct = self._deviation_pct(actual, lower)
            else:
                continue

            if deviation_pct > 50:
                severity = "critical"
            elif deviation_pct > 25:
                severity = "high"
            else:
                severity = "medium"

            anomalies.append(TimeSeriesAnomaly(
                timestamp=stamps[pos],
                actual_value=actual,
                predicted_value=float(predicted[row]),
                lower_bound=lower,
                upper_bound=upper,
                deviation_pct=deviation_pct,
                severity=severity
            ))

        return anomalies

    @staticmethod
    def _deviation_pct(actual: float, bound: float) -> float:
        """Return how far ``actual`` lies from ``bound`` as a percentage of ``|bound|``.

        Dividing by the signed bound yields a negative percentage when the
        forecast bound is negative, which would always be rated "medium".
        A zero bound is treated as an infinite deviation.
        """
        scale = abs(bound)
        if scale == 0:
            return float('inf')
        return abs(actual - bound) / scale * 100

# Generate synthetic bandwidth data with patterns
# Fixed seed so the noise (and therefore the output) is reproducible
np.random.seed(42)

# 30 days of hourly data
hours = 24 * 30
start_time = datetime(2024, 1, 1, 0, 0, 0)
timestamps = [start_time + timedelta(hours=i) for i in range(hours)]

# Generate realistic traffic pattern
values = []
for ts in timestamps:
    hour = ts.hour
    day_of_week = ts.weekday()

    base = 100

    # Daily pattern (business hours)
    if 8 <= hour <= 18:
        daily_boost = 50
    else:
        daily_boost = 0

    # Weekly pattern (weekends lower)
    if day_of_week >= 5:
        weekly_factor = 0.6
    else:
        weekly_factor = 1.0

    noise = np.random.normal(0, 10)
    value = (base + daily_boost) * weekly_factor + noise
    values.append(max(0, value))  # traffic cannot be negative

# Split point: the first 80% trains the model, the last 20% is scanned
train_size = int(0.8 * len(values))

# Add anomalies
# They must fall inside the held-out window. Injecting them before
# train_size only contaminates the training data and leaves the detector
# nothing to find in the test data.
values[train_size + 10] = 300   # Spike
values[train_size + 60] = 20    # Drop
values[train_size + 110] = 280  # Another spike

# Split train/test
train_timestamps = timestamps[:train_size]
train_values = values[:train_size]
test_timestamps = timestamps[train_size:]
test_values = values[train_size:]

# Train and detect
print("Training Prophet model (this may take a minute)...")
detector = ProphetAnomalyDetector(interval_width=0.95)
detector.train(train_timestamps, train_values)

print("\n=== Prophet Anomaly Detection ===")
anomalies = detector.detect_anomalies(test_timestamps, test_values)

print(f"Found {len(anomalies)} anomalies in {len(test_timestamps)} data points")
print(f"Anomaly rate: {len(anomalies)/len(test_timestamps)*100:.1f}%\n")

for anomaly in anomalies[:5]:  # Show first 5
    print(f"{anomaly.severity.upper()} at {anomaly.timestamp}")
    print(f"  Actual: {anomaly.actual_value:.1f} Mbps")
    print(f"  Expected: {anomaly.predicted_value:.1f} Mbps ({anomaly.lower_bound:.1f} - {anomaly.upper_bound:.1f})")
    print(f"  Deviation: {anomaly.deviation_pct:.1f}%\n")

## Example 4: LLM-Powered Anomaly Explanation

Combine ML detection with Claude explanation for actionable insights.

**The Power Move**: ML detects fast, LLM explains why it matters and what to do.

In [None]:
from anthropic import Anthropic

class AnomalyExplainer:
    """Use LLM to explain detected anomalies.

    Args:
        api_key: Anthropic API key used to create the client.
        model: Model ID sent with every request.
            NOTE(review): confirm the default ID is still served; pass a
            current one if it has been retired.
        max_tokens: Upper limit on the length of each generated explanation.
    """

    def __init__(self, api_key: str, model: str = "claude-3-5-sonnet-20241022",
                 max_tokens: int = 2000):
        self.client = Anthropic(api_key=api_key)
        self.model = model
        self.max_tokens = max_tokens

    def explain_anomaly(self, features: Dict[str, float], baseline: Dict[str, float], context: str = "") -> str:
        """
        Generate explanation for an anomaly.

        Args:
            features: Anomalous metric values, by metric name.
            baseline: Expected normal values, by metric name.
            context: Optional free-text context (time, device, ...).

        Returns:
            The model's explanation as plain text: the text blocks of the
            response joined together, or an empty string if there are none.
        """
        current_str = "\n".join(f"- {k}: {v:.2f}" for k, v in features.items())
        baseline_str = "\n".join(f"- {k}: {v:.2f}" for k, v in baseline.items())

        prompt = f"""You are a network operations expert. Analyze this anomaly and provide actionable insights.

Current Values (ANOMALOUS):
{current_str}

Expected Normal Values:
{baseline_str}

Additional Context:
{context or 'No additional context provided'}

Provide:
1. EXPLANATION: What is abnormal and why it matters
2. ROOT CAUSE: Most likely cause(s) of this anomaly
3. RECOMMENDED ACTIONS: Specific steps to investigate or remediate (numbered list)

Be specific to networking. Reference actual metrics, protocols, and troubleshooting commands."""

        response = self.client.messages.create(
            model=self.model,
            max_tokens=self.max_tokens,
            messages=[{"role": "user", "content": prompt}]
        )

        # Join every text block rather than indexing content[0], which
        # fails on an empty response or a non-text first block.
        return "".join(
            block.text
            for block in response.content
            if getattr(block, "type", None) == "text"
        )

# Example: Explain a DDoS attack
# Reads the key from the environment; raises KeyError if it is not set
explainer = AnomalyExplainer(api_key=os.environ['ANTHROPIC_API_KEY'])

# Anomaly detected
# Hand-written sample values for illustration
anomaly_features = {
    'bandwidth_mbps': 450.0,
    'latency_ms': 85.0,
    'packet_loss_pct': 12.5,
    'cpu_pct': 98.0,
    'memory_pct': 92.0,
    'connections_per_sec': 15000
}

# Expected normal values for the same metrics (same keys as above)
baseline_features = {
    'bandwidth_mbps': 100.0,
    'latency_ms': 5.0,
    'packet_loss_pct': 0.01,
    'cpu_pct': 45.0,
    'memory_pct': 60.0,
    'connections_per_sec': 500
}

# Free-text context inserted into the prompt verbatim
context = "Time: 03:45 AM (maintenance window). Device: core-router-01"

print("=== Anomaly Explanation from Claude ===")
# Makes a live API call; requires network access and a valid key
explanation = explainer.explain_anomaly(anomaly_features, baseline_features, context)
print(explanation)

## Summary

You now have three anomaly detection methods plus an LLM explanation layer:

1. **Statistical (Z-Score)** - Fast, simple, good for quick checks
2. **ML (Isolation Forest)** - Handles multi-dimensional data, unsupervised
3. **Time-Series (Prophet)** - Handles seasonality, best for time-series data
4. **LLM Explanation** - Makes anomalies actionable with context

**Production Strategy**:
- Start with Statistical for real-time monitoring
- Use Isolation Forest for complex multi-metric analysis
- Use Prophet for capacity planning and trending
- Always explain critical anomalies with an LLM

**Real Impact**: Reduce false positives by 90%, detect attacks in seconds instead of days.