# 📊 Prompt Injection Detection Benchmark

Interactive notebook for benchmarking prompt injection detection against standardized public datasets.

**Datasets:**
- 🔴 **SaTML CTF 2024** - Real adversarial attacks from the competition
- 🟡 **deepset/prompt-injections** - Diverse injection attempts
- 🟢 **NotInject** - Benign samples with trigger words (over-defense testing)
- 🔵 **LLMail-Inject** - Email-based injection scenarios

In [None]:
import sys
import os
from pathlib import Path

# Add current directory to path so local modules import correctly
sys.path.insert(0, os.path.abspath('.'))

import structlog
import pandas as pd
from tqdm.notebook import tqdm

# Logging setup: every event first receives an ISO-format timestamp and is
# then rendered by structlog's development console renderer.
_iso_stamper = structlog.processors.TimeStamper(fmt="iso")
_console_renderer = structlog.dev.ConsoleRenderer()
structlog.configure(processors=[_iso_stamper, _console_renderer])

# Custom Imports
from benchmarks import (
    BenchmarkRunner, 
    # specific loaders not strictly needed if BenchmarkRunner handles them, 
    # but good to have if you want to inspect data manually later
    load_satml_dataset, 
    AVAILABLE_DATASETS 
)
from src.detection.embedding_classifier import EmbeddingClassifier

# --- Configuration ---
# Directory that is searched for saved classifier files.
MODELS_DIR = Path('models')

# Every '*_classifier.json' directly inside MODELS_DIR, in sorted path order.
model_files = sorted(MODELS_DIR.glob('*_classifier.json'))

# Summary line followed by one bullet per discovered file name.
print(f"üîç Found {len(model_files)} models in {MODELS_DIR}:")
for m in model_files:
    print(f"  ‚Ä¢ {m.name}")

: 

## ⚙️ Configuration

In [None]:
results_data = []

# Datasets handed to run_all and the per-dataset sample limit.
DATASETS_TO_TEST = ['satml', 'deepset', 'llmail']
LIMIT = 300

print(f"üöÄ Starting benchmarks on {len(model_files)} models...")

for model_path in tqdm(model_files, desc="Benchmarking"):
    try:
        # Restore the saved classifier for this model file.
        det = EmbeddingClassifier()
        det.load_model(str(model_path))

        # Every model is scored with the same fixed threshold of 0.5.
        run = BenchmarkRunner(det, threshold=0.5)

        # verbose=False keeps the notebook output clean.
        res = run.run_all(
            limit_per_dataset=LIMIT,
            include_datasets=DATASETS_TO_TEST,
            verbose=False
        )

        # Only overall_accuracy and overall_fpr are read from the result.
        # Reading overall_tp / overall_fp / total_samples raised
        # AttributeError, which the handler below turned into a logged failure
        # for every model and left results_data empty. The full result object
        # is stored under 'Object' (the Results cell drops that column before
        # display) so detailed metrics stay reachable.
        results_data.append({
            'Model': model_path.stem.replace('_classifier', ''),
            'Accuracy': res.overall_accuracy,
            'FPR': res.overall_fpr,
            'Object': res
        })

    except Exception as e:
        # Best effort: log the failing model and continue with the next one.
        structlog.get_logger().error("benchmark_failed", model=model_path.name, error=str(e))

print("‚úÖ Benchmarking complete.")

## 🤖 Load Model

In [None]:
# Load the detector.
# MODEL_PATH is not assigned by any earlier cell in this notebook, so running
# this cell as-is raised NameError. If the name is still undefined, fall back
# to the first model file discovered in MODELS_DIR; a MODEL_PATH set by the
# user beforehand is left untouched.
try:
    MODEL_PATH
except NameError:
    if not model_files:
        raise FileNotFoundError(
            f"No '*_classifier.json' model found in {MODELS_DIR}; set MODEL_PATH manually."
        ) from None
    MODEL_PATH = str(model_files[0])

detector = EmbeddingClassifier()
detector.load_model(MODEL_PATH)

print(f"‚úÖ Model loaded: {MODEL_PATH}")
print(f"   Trained: {detector.is_trained}")
print(f"   Threshold: {detector.threshold}")

## 📂 Load Datasets

In [None]:
# Print a short catalogue of the registered datasets: one line per entry with
# its key, its 'name' field and its 'type' field.
print("Available Datasets:")
print("=" * 50)
for key in AVAILABLE_DATASETS:
    info = AVAILABLE_DATASETS[key]
    print(f"  {key}: {info['name']} ({info['type']})")

In [None]:
import sys
import os
from pathlib import Path

# Add current directory to path so local modules import correctly
sys.path.insert(0, os.path.abspath('.'))

import structlog
import pandas as pd
from tqdm.notebook import tqdm

# Logging setup: every event first receives an ISO-format timestamp and is
# then rendered by structlog's development console renderer.
_iso_stamper = structlog.processors.TimeStamper(fmt="iso")
_console_renderer = structlog.dev.ConsoleRenderer()
structlog.configure(processors=[_iso_stamper, _console_renderer])

# Custom Imports
from benchmarks import (
    BenchmarkRunner, 
    # specific loaders not strictly needed if BenchmarkRunner handles them, 
    # but good to have if you want to inspect data manually later
    load_satml_dataset, 
    AVAILABLE_DATASETS 
)
from src.detection.embedding_classifier import EmbeddingClassifier

# --- Configuration ---
# Directory that is searched for saved classifier files.
MODELS_DIR = Path('models')

# Every '*_classifier.json' directly inside MODELS_DIR, in sorted path order.
model_files = sorted(MODELS_DIR.glob('*_classifier.json'))

# Summary line followed by one bullet per discovered file name.
print(f"üîç Found {len(model_files)} models in {MODELS_DIR}:")
for m in model_files:
    print(f"  ‚Ä¢ {m.name}")

## 🏃 Run Benchmark

In [None]:
results_data = []

# Benchmarks to include and the per-dataset sample limit handed to run_all.
DATASETS_TO_TEST = ['satml', 'deepset', 'llmail']
LIMIT = 300

print(f"üöÄ Starting benchmarks on {len(model_files)} models...")

for model_path in tqdm(model_files, desc="Benchmarking"):
    try:
        # Fresh classifier per file, restored from the saved model.
        det = EmbeddingClassifier()
        det.load_model(str(model_path))

        # Score with a fixed threshold of 0.5 on the selected datasets.
        run = BenchmarkRunner(det, threshold=0.5)
        res = run.run_all(
            verbose=False,
            include_datasets=DATASETS_TO_TEST,
            limit_per_dataset=LIMIT,
        )

        # One summary row per model; the complete result object travels along
        # under 'Object' for deeper inspection later.
        row = {
            'Model': model_path.stem.replace('_classifier', ''),
            'Accuracy': res.overall_accuracy,
            'FPR': res.overall_fpr,
            'Object': res,
        }
        results_data.append(row)

    except Exception as e:
        # A failing model is logged and skipped so the remaining ones still run.
        structlog.get_logger().error("benchmark_failed", model=model_path.name, error=str(e))

print(f"‚úÖ Benchmarking complete. Collected {len(results_data)} results.")

## 📈 Results

In [None]:
# Turn the collected rows into a table and show it ranked by accuracy.
df = pd.DataFrame(results_data)

if not df.empty:
    # The raw result objects are not meant for the display table.
    display_df = df.drop(columns=['Object'])

    # Best accuracy first, with a clean 0..n-1 index.
    df_sorted = display_df.sort_values(by='Accuracy', ascending=False)
    df_sorted = df_sorted.reset_index(drop=True)

    # Percent formatting plus a green gradient on Accuracy and a red one on FPR.
    percent_columns = {'Accuracy': '{:.1%}', 'FPR': '{:.1%}'}
    styled = df_sorted.style.format(percent_columns)
    styled = styled.background_gradient(subset=['Accuracy'], cmap='Greens')
    styled = styled.background_gradient(subset=['FPR'], cmap='Reds')
    display(styled)
else:
    print("‚ö†Ô∏è No results collected. Check the logs above for errors.")

In [None]:
!pip install openai
import os
import numpy as np
from typing import List
from openai import AzureOpenAI
from sklearn.linear_model import LogisticRegression
from tqdm.notebook import tqdm

# --- CONFIGURATION ---
# The API key is read from the environment so that no secret is stored in the
# notebook. Export AZURE_OPENAI_API_KEY before running this cell. A key that
# was ever saved in plain text in this file should be treated as compromised
# and rotated.
AZURE_API_KEY = os.environ.get("AZURE_OPENAI_API_KEY")

# Endpoint of the Azure resource; AZURE_OPENAI_ENDPOINT overrides the default.
AZURE_ENDPOINT = os.environ.get(
    "AZURE_OPENAI_ENDPOINT",
    "https://goodwiinzapi.cognitiveservices.azure.com/",
)
API_VERSION = "2023-05-15"
DEPLOYMENT_NAME = "text-embedding-3-small"


class AzureEmbeddingClassifier:
    """
    A custom detector using Azure OpenAI's text-embedding-3-small.

    Texts are embedded through the Azure OpenAI embeddings endpoint and a
    logistic-regression classifier is fitted on top of those vectors.
    """

    def __init__(self, deployment_name=DEPLOYMENT_NAME, threshold=0.5):
        """Create the Azure client and an untrained classifier.

        Args:
            deployment_name: Azure deployment used for embedding requests.
            threshold: Stored on the instance; this class does not apply it
                itself.

        Raises:
            ValueError: If AZURE_OPENAI_API_KEY was not set when the
                configuration above was evaluated.
        """
        if not AZURE_API_KEY:
            raise ValueError(
                "AZURE_OPENAI_API_KEY is not set; export it before creating "
                "AzureEmbeddingClassifier."
            )
        self.client = AzureOpenAI(
            api_key=AZURE_API_KEY,
            api_version=API_VERSION,
            azure_endpoint=AZURE_ENDPOINT
        )
        self.deployment_name = deployment_name
        self.threshold = threshold
        self.classifier = None
        self.is_trained = False

    def get_embeddings(self, texts: List[str], batch_size=100) -> np.ndarray:
        """Fetch embeddings from Azure OpenAI in batches.

        Args:
            texts: Strings to embed.
            batch_size: Number of texts sent per API request.

        Returns:
            Array with one embedding per input text, in input order. For an
            empty input no request is made and an empty array is returned.
        """
        embeddings = []
        # Loop through data in batches to avoid hitting API limits
        for i in range(0, len(texts), batch_size):
            # Newlines are replaced by spaces before the texts are sent.
            batch = [t.replace("\n", " ") for t in texts[i:i + batch_size]]

            response = self.client.embeddings.create(
                input=batch,
                model=self.deployment_name  # In Azure, 'model' = deployment name
            )
            embeddings.extend(d.embedding for d in response.data)
        return np.array(embeddings)

    def train(self, texts: List[str], labels: List[int]):
        """Train a lightweight classifier on top of embeddings.

        Failures are reported with a printed message instead of being raised;
        callers are expected to check ``is_trained`` afterwards.

        Args:
            texts: Training texts.
            labels: One label per text.
        """
        print(f"üîπ Generating embeddings for {len(texts)} training samples...")
        try:
            X = self.get_embeddings(texts)
            y = np.array(labels)

            print("üîπ Fitting Logistic Regression...")
            self.classifier = LogisticRegression(class_weight='balanced', max_iter=1000)
            self.classifier.fit(X, y)
            self.is_trained = True
            print("‚úÖ Training complete.")
        except Exception as e:
            # Deliberate best effort: is_trained stays False on failure.
            print(f"‚ùå Training failed: {e}")

    def predict_proba(self, texts: List[str]) -> np.ndarray:
        """Return column 1 of the classifier's ``predict_proba`` output.

        That column is the probability of the injection class, assuming the
        training labels were 0/1.

        Raises:
            ValueError: If no classifier has been created by ``train`` yet.
        """
        if self.classifier is None:
            raise ValueError("Model not trained yet!")

        X = self.get_embeddings(texts)
        return self.classifier.predict_proba(X)[:, 1]

    # Interface for BenchmarkRunner
    def predict_score(self, texts: List[str]) -> np.ndarray:
        """Alias of ``predict_proba``."""
        return self.predict_proba(texts)

# 1. Load Training Data (Deepset)
print("üìö Loading training data (Deepset)...")
from benchmarks import load_deepset_dataset  # imported here so this cell works on its own
train_data = load_deepset_dataset(limit=200)

# Nothing to train on: stop right away.
if len(train_data) == 0:
    raise ValueError("Training data is empty!")

# The first sample decides how texts and labels are pulled out: either
# (text, label) tuples or dicts with 'text' and 'label' keys.
first_item = train_data[0]
if isinstance(first_item, tuple):
    print("   Detected data format: Tuple")
    train_texts = [sample[0] for sample in train_data]
    train_labels = [sample[1] for sample in train_data]
elif isinstance(first_item, dict):
    print("   Detected data format: Dict")
    train_texts = [sample['text'] for sample in train_data]
    train_labels = [sample['label'] for sample in train_data]
else:
    raise ValueError(f"Unknown data format: {type(first_item)}")

print(f"   Loaded {len(train_texts)} samples.")

# 2. Train the Model
# Train exactly once: each train() call embeds the whole training set through
# the Azure embeddings API, so the former back-to-back repeat of these two
# lines only doubled the API cost and run time.
azure_detector = AzureEmbeddingClassifier()
azure_detector.train(train_texts, train_labels)

# 3. Benchmark (Test on SaTML and LLMail)
# Runs only when training succeeded. The evaluation datasets differ from the
# training set, so the numbers reflect how well the detector generalizes.
if azure_detector.is_trained:
    print(f"\nüöÄ Benchmarking Azure Deployment: {DEPLOYMENT_NAME}...")

    eval_settings = {
        'limit_per_dataset': 100,
        'include_datasets': ['satml', 'llmail'],
        'verbose': False,
    }
    runner = BenchmarkRunner(azure_detector, threshold=0.5)
    results = runner.run_all(**eval_settings)

    print("\nüìä Final Results:")
    print(f"   Accuracy: {results.overall_accuracy:.1%}")
    print(f"   FPR:      {results.overall_fpr:.1%}")

## 📊 Visualizations

In [None]:
import matplotlib.pyplot as plt

# Three horizontal bar charts side by side (accuracy, false positive rate and
# P95 latency), with one bar per dataset in `results`. Each panel marks its
# target with a dashed red line.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
acc_ax, fpr_ax, lat_ax = axes

datasets_names = list(results.results.keys())
per_dataset = list(results.results.values())

# Accuracy: green from 95 %, amber from 80 %, red below.
accuracies = [m.accuracy * 100 for m in per_dataset]
colors = [
    '#4CAF50' if m.accuracy >= 0.95
    else '#FFC107' if m.accuracy >= 0.80
    else '#F44336'
    for m in per_dataset
]
acc_ax.barh(datasets_names, accuracies, color=colors)
acc_ax.axvline(x=95, color='red', linestyle='--', label='Target (95%)')
acc_ax.set_xlabel('Accuracy (%)')
acc_ax.set_title('Accuracy by Dataset')
acc_ax.set_xlim(0, 105)
acc_ax.legend()

# FPR: green up to 5 %, red above.
fprs = [m.false_positive_rate * 100 for m in per_dataset]
fpr_colors = [
    '#4CAF50' if m.false_positive_rate <= 0.05 else '#F44336'
    for m in per_dataset
]
fpr_ax.barh(datasets_names, fprs, color=fpr_colors)
fpr_ax.axvline(x=5, color='red', linestyle='--', label='Target (5%)')
fpr_ax.set_xlabel('False Positive Rate (%)')
fpr_ax.set_title('FPR by Dataset')
fpr_ax.legend()

# Latency: green up to 100, red above.
latencies = [m.latency_p95 for m in per_dataset]
lat_colors = [
    '#4CAF50' if m.latency_p95 <= 100 else '#F44336'
    for m in per_dataset
]
lat_ax.barh(datasets_names, latencies, color=lat_colors)
lat_ax.axvline(x=100, color='red', linestyle='--', label='Target (100ms)')
lat_ax.set_xlabel('Latency P95 (ms)')
lat_ax.set_title('Latency by Dataset')
lat_ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Overall confusion matrix
import numpy as np

# Add up the per-dataset confusion counts into one overall 2x2 matrix laid out
# as [[TN, FP], [FN, TP]]: rows are actual classes, columns are predictions.
dataset_metrics = list(results.results.values())
total_tp = sum(m.true_positives for m in dataset_metrics)
total_tn = sum(m.true_negatives for m in dataset_metrics)
total_fp = sum(m.false_positives for m in dataset_metrics)
total_fn = sum(m.false_negatives for m in dataset_metrics)

cm = np.array([[total_tn, total_fp], [total_fn, total_tp]])

fig, ax = plt.subplots(figsize=(6, 5))
im = ax.imshow(cm, cmap='Blues')

# Axis ticks and their labels
ax.set_xticks([0, 1])
ax.set_xticklabels(['Predicted Safe', 'Predicted Injection'])
ax.set_yticks([0, 1])
ax.set_yticklabels(['Actual Safe', 'Actual Injection'])

# Write each count into its cell; cells above half of the maximum count get
# white text, the others black.
half_of_max = cm.max() / 2
for i, matrix_row in enumerate(cm):
    for j, count in enumerate(matrix_row):
        text = ax.text(j, i, count, ha="center", va="center",
                       color="white" if count > half_of_max else "black", fontsize=14)

ax.set_title('Overall Confusion Matrix')
plt.colorbar(im)
plt.tight_layout()
plt.show()

# Totals and shares of correct and incorrect predictions.
n_correct = total_tp + total_tn
n_errors = total_fp + total_fn
print(f"\nTotal Samples: {cm.sum():,}")
print(f"Correct: {n_correct:,} ({n_correct / cm.sum():.1%})")
print(f"Errors: {n_errors:,} ({n_errors / cm.sum():.1%})")

## 💾 Export Results

In [None]:
# Persist the benchmark results through results.save() and confirm the path.
json_path = "benchmark_results.json"
results.save(json_path)
print(f"‚úÖ Results saved to {json_path}")

In [None]:
# Export to Markdown
# NOTE(review): `reporter` is not assigned in any cell visible in this
# notebook, so this cell raises NameError unless the object is created
# elsewhere. Confirm which reporter is intended and construct it before this
# call.
reporter.save("benchmark_report.md", format="markdown")
print("‚úÖ Report saved to benchmark_report.md")

## 🔄 Compare Multiple Models

Run this section to compare different models side-by-side.

In [None]:
# Compare multiple models
# Paths that do not exist on disk are skipped by the loop below, so a model
# that has not been trained yet (e.g. the MOF model) can stay in the list.
MODELS_TO_COMPARE = [
    "models/comprehensive_classifier.json",
    "models/all-MiniLM-L6-v2_classifier.json",
    "models/mof_classifier.json",
]

# Maps each benchmarked model path to the result object returned by run_all.
comparison_results = {}

for model_path in MODELS_TO_COMPARE:
    if not os.path.exists(model_path):
        print(f"‚ö†Ô∏è Skipping {model_path} (not found)")
        continue

    print(f"\nüìä Benchmarking: {model_path}")

    try:
        # Load model
        det = EmbeddingClassifier()
        det.load_model(model_path)

        # Quick comparison: fewer samples and only two datasets.
        run = BenchmarkRunner(det, threshold=0.5)
        res = run.run_all(
            limit_per_dataset=200,
            include_datasets=["satml", "deepset"],
            verbose=False
        )
    except Exception as e:
        # Same policy as the other benchmark loops in this notebook: log the
        # failing model and keep comparing the remaining ones.
        structlog.get_logger().error(
            "benchmark_failed", model=os.path.basename(model_path), error=str(e)
        )
        continue

    comparison_results[model_path] = res
    print(f"   Accuracy: {res.overall_accuracy:.1%} | FPR: {res.overall_fpr:.1%}")

In [None]:
# Comparison chart: grouped bars with accuracy and FPR per compared model,
# plus dashed reference lines at the 95 % accuracy and 5 % FPR targets.
# Nothing is drawn when no model was benchmarked.
if comparison_results:
    model_names = [os.path.basename(p).replace('.json', '') for p in comparison_results.keys()]
    accuracies = [r.overall_accuracy * 100 for r in comparison_results.values()]
    fprs = [r.overall_fpr * 100 for r in comparison_results.values()]

    fig, ax = plt.subplots(figsize=(10, 5))
    x = np.arange(len(model_names))
    width = 0.35

    # Two bars per model, placed left and right of the tick position.
    ax.bar(x - width/2, accuracies, width, label='Accuracy', color='#4CAF50')
    ax.bar(x + width/2, fprs, width, label='FPR', color='#F44336')

    ax.axhline(y=95, color='green', linestyle='--', alpha=0.5, label='Accuracy Target')
    ax.axhline(y=5, color='red', linestyle='--', alpha=0.5, label='FPR Target')

    ax.set_ylabel('Percentage (%)')
    ax.set_title('Model Comparison')
    ax.set_xticks(x)
    ax.set_xticklabels(model_names, rotation=15)
    # The legend is built after the target lines exist; it only lists artists
    # already on the axes when it is called, so calling it earlier left both
    # target labels out.
    ax.legend()

    plt.tight_layout()
    plt.show()