# Agent Testing Notebook

This notebook allows you to test each agent in the workflow individually with editable prompts.

## Workflow Steps:
0. **Junk Filter** - Filters content using ContentFilter
1. **LLM Rank Article** - Ranks article for huntability (1-10)
2. **Extract Agent** - Extracts behavioral observables and IOCs
3. **Generate SIGMA** - Generates SIGMA detection rules
4. **Similarity Search** - Searches for similar existing rules
5. **Promote to Queue** - Promotes rules to queue if similarity is low

In [3]:
# Enable text wrapping in notebook output and code cells.
# The cell source must keep one statement per line: when everything is joined
# onto the leading "#" line the whole cell becomes a comment and nothing runs.
from IPython.display import HTML, display

display(HTML("""<style>
    /* Wrap output text */
    .output_text {
        white-space: pre-wrap !important;
        word-wrap: break-word !important;
        overflow-wrap: break-word !important;
    }
    .jp-OutputArea-output {
        white-space: pre-wrap !important;
        word-wrap: break-word !important;
        overflow-wrap: break-word !important;
    }
    pre {
        white-space: pre-wrap !important;
        word-wrap: break-word !important;
        overflow-wrap: break-word !important;
    }
    .jp-CodeCell-output pre {
        white-space: pre-wrap !important;
        word-wrap: break-word !important;
    }

    /* Wrap code cell source */
    .CodeMirror-line {
        white-space: pre-wrap !important;
        word-wrap: break-word !important;
        overflow-wrap: break-word !important;
    }
    .jp-CodeMirror-editor {
        white-space: pre-wrap !important;
        word-wrap: break-word !important;
    }
    .CodeMirror {
        word-wrap: break-word !important;
        overflow-wrap: break-word !important;
    }

    /* Wrap in classic notebook */
    .input_area {
        white-space: pre-wrap !important;
        word-wrap: break-word !important;
    }
    .CodeMirror-lines {
        white-space: pre-wrap !important;
        word-wrap: break-word !important;
    }
</style>"""))
print("‚úÖ Text wrapping enabled for notebook output and code cells")

In [4]:
# Install missing dependencies.
# The cell source must keep one statement per line: when everything is joined
# onto the leading "#" line the whole cell becomes a comment and nothing runs.
import importlib
import subprocess
import sys


def install_package(package):
    """Install ``package`` with pip for the interpreter running this kernel.

    Tries a ``--user`` install first; if pip exits with an error, retries
    without ``--user`` and with ``--break-system-packages``.

    Args:
        package: Name handed to ``pip install``.

    Raises:
        subprocess.CalledProcessError: If the fallback install also fails.
    """
    pip_install = [sys.executable, '-m', 'pip', 'install', package]
    # Try --user first, fallback to --break-system-packages if needed
    try:
        subprocess.check_call(pip_install + ['--user', '--quiet'],
                              stderr=subprocess.DEVNULL)
    except subprocess.CalledProcessError:
        # Fallback for environments that don't support --user
        subprocess.check_call(pip_install + ['--quiet', '--break-system-packages'],
                              stderr=subprocess.DEVNULL)


# Install and import each required package. The imported module is bound under
# its own name in the notebook namespace, exactly as a plain `import` would do.
for package_name in ('pgvector', 'sqlalchemy', 'sentence_transformers'):
    try:
        globals()[package_name] = importlib.import_module(package_name)
        print(f'‚úÖ {package_name} already installed')
    except ImportError:
        print(f'‚ö†Ô∏è  Installing {package_name}...')
        install_package(package_name)
        # Invalidate caches and re-import
        importlib.invalidate_caches()
        globals()[package_name] = importlib.import_module(package_name)
        print(f'‚úÖ {package_name} installed and imported')

# Final cache invalidation to ensure all imports work
importlib.invalidate_caches()
print('‚úÖ Import caches invalidated')

In [13]:
# Setup and imports
import sys
import os
from pathlib import Path
import json
import asyncio
from typing import Dict, Any, Optional
from datetime import datetime
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Try to load .env file if python-dotenv is available
try:
    from dotenv import load_dotenv
    env_path = Path('.env')
    if env_path.exists():
        # Bind the return value: with ast_node_interactivity = "all" a bare call
        # would echo its result (a stray "True") into the cell output.
        dotenv_loaded = load_dotenv(env_path)
        print("‚úÖ Loaded environment variables from .env")
    else:
        print("‚ö†Ô∏è  No .env file found, using system environment variables")
except ImportError:
    print("‚ö†Ô∏è  python-dotenv not installed, using system environment variables only")
    print("   Install with: pip install python-dotenv")

# Set default LMStudio model values if not already set.
# These can be overridden by setting environment variables. Each per-step
# variable falls back to LMSTUDIO_MODEL, then to the built-in default below.
DEFAULT_LMSTUDIO_MODEL = 'mistralai/mistral-7b-instruct-v0.3'
for model_env_var in ('LMSTUDIO_MODEL_RANK', 'LMSTUDIO_MODEL_EXTRACT', 'LMSTUDIO_MODEL_SIGMA'):
    if not os.getenv(model_env_var):
        os.environ[model_env_var] = os.getenv('LMSTUDIO_MODEL', DEFAULT_LMSTUDIO_MODEL)
        print(f"‚ö†Ô∏è  {model_env_var} not set, using: {os.environ[model_env_var]}")

if not os.getenv('LMSTUDIO_API_URL'):
    os.environ['LMSTUDIO_API_URL'] = 'http://localhost:1234/v1'
    print(f"‚ö†Ô∏è  LMSTUDIO_API_URL not set, using: {os.environ['LMSTUDIO_API_URL']}")

# Add project root to path.
# The project imports below are all `from src...`, so prefer the closest
# directory (the working directory or one of its parents) that contains `src/`.
# When none does, fall back to the working directory itself.
working_dir = Path.cwd()
project_root = next(
    (candidate for candidate in (working_dir, *working_dir.parents)
     if (candidate / 'src').is_dir()),
    working_dir,
)
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

# Database and workflow imports
from src.database.async_manager import AsyncDatabaseManager
from src.database.models import ArticleTable
from src.utils.content_filter import ContentFilter
from src.services.llm_service import LLMService
from src.services.sigma_generation_service import SigmaGenerationService
# Install sentence_transformers if needed.
# The dependency-install cell near the top of the notebook contains the same
# check; it is repeated here immediately before the RAGService import.
try:
    import sentence_transformers
    print('‚úÖ sentence_transformers already installed')
except ImportError:
    import sys
    import subprocess
    print('‚ö†Ô∏è Installing sentence_transformers...')
    try:
        # First attempt: per-user install with the kernel's own interpreter.
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sentence_transformers', '--user', '--quiet'], stderr=subprocess.DEVNULL)
    except subprocess.CalledProcessError:
        # Second attempt: drop --user and pass --break-system-packages instead.
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sentence_transformers', '--quiet', '--break-system-packages'], stderr=subprocess.DEVNULL)
    import importlib
    # Refresh the import finders' caches before importing the new package.
    importlib.invalidate_caches()
    import sentence_transformers
    print('‚úÖ sentence_transformers installed')

from src.services.rag_service import RAGService

# Initialize services.
# Every service is constructed without arguments; these notebook-level objects
# are the ones the step functions in later cells call.
# NOTE(review): presumably each service reads its settings from the environment
# variables prepared earlier in this cell — confirm in the service modules.
async_db_manager = AsyncDatabaseManager()
llm_service = LLMService()
content_filter = ContentFilter()
sigma_service = SigmaGenerationService()
rag_service = RAGService()

# Report the models the LLM service resolved for each step.
print("‚úÖ Services initialized")
print(f"   Ranking Model: {llm_service.model_rank}")
print(f"   Extraction Model: {llm_service.model_extract}")
print(f"   SIGMA Model: {llm_service.model_sigma}")

True

‚úÖ Loaded environment variables from .env
‚úÖ sentence_transformers already installed
‚úÖ Services initialized
   Ranking Model: deepseek/deepseek-r1-0528-qwen3-8b
   Extraction Model: deepseek/deepseek-r1-0528-qwen3-8b
   SIGMA Model: deepseek/deepseek-r1-0528-qwen3-8b


In [14]:
# Configuration - Edit these values
# ARTICLE_ID is the database id loaded by the "Load article" cell.
# JUNK_FILTER_THRESHOLD is forwarded to ContentFilter.filter_content as min_confidence.
# RANKING_THRESHOLD is compared with the ranking score to compute `should_continue`.
ARTICLE_ID = 2042  # Change this to test different articles
JUNK_FILTER_THRESHOLD = 0.8  # Confidence threshold for junk filter
RANKING_THRESHOLD = 6.0  # Minimum ranking score to continue

# Model selection
# Only SIGMA_MODEL is read by a later cell (passed as `ai_model` to the SIGMA
# generation service). RANKING_MODEL and EXTRACTION_MODEL are not referenced by
# any other cell in this notebook.
RANKING_MODEL = 'lmstudio'  # Model for ranking
EXTRACTION_MODEL = 'lmstudio'  # Model for extraction
SIGMA_MODEL = 'lmstudio'  # Model for SIGMA generation

print(f"üìã Configuration:")
print(f"   Article ID: {ARTICLE_ID}")
print(f"   Junk Filter Threshold: {JUNK_FILTER_THRESHOLD}")
print(f"   Ranking Threshold: {RANKING_THRESHOLD}")

üìã Configuration:
   Article ID: 2042
   Junk Filter Threshold: 0.8
   Ranking Threshold: 6.0


In [15]:
# Load article
async def load_article(article_id: int):
    """Load an article and its source name from the database as a plain dict.

    Args:
        article_id: Primary key handed to ``async_db_manager.get_article``.

    Returns:
        Dict with the keys ``id``, ``title``, ``content``, ``source_id``,
        ``source_name``, ``url``, ``published_at`` and ``metadata``.
        ``title``, ``content`` and ``url`` are always strings and ``metadata``
        is always a dict, so later cells can slice, measure and ``.get`` them
        without checking for ``None``.

    Raises:
        ValueError: If ``get_article`` returns nothing for ``article_id``.
    """
    article = await async_db_manager.get_article(article_id)
    if not article:
        raise ValueError(f"Article {article_id} not found")

    source = await async_db_manager.get_source(article.source_id)

    return {
        'id': article.id,
        # Missing text fields are normalised to '' (later cells call len()/slice them).
        'title': article.title or '',
        'content': article.content or '',
        'source_id': article.source_id,
        'source_name': source.name if source else 'Unknown',
        'url': article.canonical_url or '',
        'published_at': article.published_at,
        # Missing metadata is normalised to {} (the junk filter cell chains .get on it).
        'metadata': article.article_metadata or {}
    }

# Load the configured article into the notebook-level `article` dict, which the
# following cells read (content, title, source name, URL and metadata).
article = await load_article(ARTICLE_ID)
# Preview only: the title is cut to its first 80 characters and "..." is always appended.
print(f"‚úÖ Loaded Article {ARTICLE_ID}: {article['title'][:80]}...")
print(f"   Content length: {len(article['content'])} characters")
print(f"   Source: {article['source_name']}")

Database session error: [Errno 8] nodename nor servname provided, or not known
Traceback (most recent call last):
  File "/Users/starlord/CTIScraper/src/database/async_manager.py", line 85, in get_session
    yield session
  File "/Users/starlord/CTIScraper/src/database/async_manager.py", line 709, in get_article
    result = await session.execute(
             ^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sqlalchemy/ext/asyncio/session.py", line 454, in execute
    result = await greenlet_spawn(
             ^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sqlalchemy/util/_concurrency_py3k.py", line 190, in greenlet_spawn
    result = context.throw(*sys.exc_info())
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sqlalchemy/orm/session.py", line 2262, in execute
    return self._

ValueError: Article 2042 not found

## Step 0: Junk Filter

Filters content using ContentFilter to remove junk/non-huntable content.

In [None]:
# Step 0: Junk Filter
async def test_junk_filter(content: str, threshold: float = 0.8, article_id: int = None):
    """Test junk filter step."""
    print("üîç Running Junk Filter...")
    
    hunt_score = article.get('metadata', {}).get('threat_hunting_score', 0) if article else 0
    
    filter_result = content_filter.filter_content(
        content,
        min_confidence=threshold,
        hunt_score=hunt_score,
        article_id=article_id or ARTICLE_ID
    )
    
    result = {
        'filtered': filter_result.is_huntable,
        'confidence': filter_result.confidence,
        'original_length': len(content),
        'filtered_length': len(filter_result.filtered_content) if filter_result.filtered_content else 0,
        'filtered_content': filter_result.filtered_content or content,
        'chunks_removed': len(filter_result.removed_chunks) if filter_result.removed_chunks else 0
    }
    
    print(f"‚úÖ Junk Filter Result:")
    print(f"   Filtered (is_huntable): {result['filtered']}")
    print(f"   Confidence: {result['confidence']:.3f}")
    print(f"   Original length: {result['original_length']:,} chars")
    print(f"   Filtered length: {result['filtered_length']:,} chars")
    print(f"   Chunks removed: {result['chunks_removed']}")
    
    return result

# Run junk filter on the loaded article's content using the configured threshold.
junk_filter_result = await test_junk_filter(
    article['content'],
    threshold=JUNK_FILTER_THRESHOLD,
    article_id=ARTICLE_ID
)

# Store filtered content for next steps
# (the ranking, extraction and SIGMA cells all pass `filtered_content` as their content).
filtered_content = junk_filter_result['filtered_content']

## Step 1: LLM Rank Article

Ranks the article for huntability. **Edit the prompt below** to customize ranking behavior.

In [None]:
# Editable Ranking Prompt
# The ranking cell writes this template to a temporary file and hands its path
# to llm_service.rank_article as `prompt_template_path`.
# The template contains the placeholders {title}, {source}, {url},
# {content_length} and {content}.
# NOTE(review): presumably the LLM service fills these placeholders — confirm
# which names it supports before adding new ones.
RANKING_PROMPT = """As a cybersecurity expert specializing in threat hunting and detection engineering, analyze this threat intelligence article for its usefulness to security professionals.

**Article Title:** {title}
**Source:** {source}
**URL:** {url}
**Content Length:** {content_length} characters

**Analysis Criteria:**
1. **Technical Depth:** Does the article provide specific technical details, commands, or procedures?
2. **Actionable Intelligence:** Can security teams immediately act on this information?
3. **Detection Potential:** Does it contain indicators or behaviors that can be detected?
4. **Threat Hunting Value:** Is this useful for proactive threat hunting activities?
5. **Operational Impact:** How relevant is this for day-to-day security operations?

**Scoring Guidelines:**
- **9-10:** Excellent - Highly actionable, specific technical details, immediate operational value
- **7-8:** Good - Useful information with some technical specifics
- **5-6:** Moderate - Some value but limited technical depth
- **3-4:** Limited - Minimal actionable intelligence
- **1-2:** Poor - Mostly strategic/general information with little operational value

**Output Format:**
**HUNTABILITY SCORE: [1-10]**

**KEY FINDINGS:**
[List the most important technical details, commands, or indicators]

**ACTIONABLE INTELLIGENCE:**
[Specific steps security teams can take]

**DETECTION OPPORTUNITIES:**
[What can be monitored or detected]

**THREAT HUNTING VALUE:**
[How this supports proactive hunting activities]

**OPERATIONAL RECOMMENDATIONS:**
[Specific actions for security teams]

Please analyze the following content:

{content}"""

# Show only the first 200 characters as a sanity check of the current prompt.
print("üìù Current Ranking Prompt:")
print(RANKING_PROMPT[:200] + "...")

In [None]:
# Step 1: LLM Rank Article
async def test_rank_article(
    title: str,
    content: str,
    source: str,
    url: str,
    prompt_template: str = None
):
    """Test ranking step with custom prompt."""
    print("üìä Running LLM Ranking...")
    
    # Use custom prompt if provided, otherwise use default
    if prompt_template:
        # Save to temp file for LLM service
        import tempfile
        with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as f:
            f.write(prompt_template)
            temp_path = f.name
        
        ranking_result = await llm_service.rank_article(
            title=title,
            content=content,
            source=source,
            url=url,
            prompt_template_path=temp_path
        )
        
        # Clean up temp file
        os.unlink(temp_path)
    else:
        ranking_result = await llm_service.rank_article(
            title=title,
            content=content,
            source=source,
            url=url
        )
    
    score = ranking_result['score']
    reasoning = ranking_result.get('reasoning', 'No reasoning provided')
    should_continue = score >= RANKING_THRESHOLD
    
    print(f"‚úÖ Ranking Result:")
    print(f"   Score: {score:.1f}/10")
    print(f"   Threshold: {RANKING_THRESHOLD}/10")
    print(f"   Should Continue: {should_continue}")
    print(f"\n   Reasoning:\n{reasoning}")
    
    return {
        'score': score,
        'reasoning': reasoning,
        'should_continue': should_continue
    }

# Run ranking on the junk-filtered content with the editable RANKING_PROMPT.
# The returned `should_continue` flag is informational here: the later cells in
# this notebook run regardless of its value.
ranking_result = await test_rank_article(
    title=article['title'],
    content=filtered_content,
    source=article['source_name'],
    url=article['url'],
    prompt_template=RANKING_PROMPT
)

## Step 2: Extract Agent

Extracts behavioral observables and IOCs. **Edit the prompts below** to customize extraction behavior.

In [None]:
# Editable ExtractAgent Prompt (JSON config)
# The extraction cell serialises this dict to a temporary file and passes that
# file's path to llm_service.extract_behaviors as `prompt_file_path`.
EXTRACT_AGENT_CONFIG = {
    "role": "You are a detection engineer LLM. Your task is to extract telemetry-aware attacker techniques and observables that are useful to detection engineers and threat hunters.",
    "objective": "Extract telemetry-based observables (command-line executions, process chains, service/registry modifications, file path usage, event log manipulation). Output unique and discrete entries only.",
    "exclusions": {
        "do_not_extract": [
            "Atomic IOCs like single IP addresses, domains, or file hashes",
            "One-off URLs or email addresses without recognizable structure or patterns"
        ],
        "do_extract": [
            "Command-line executions (especially chained or obfuscated)",
            "Parent ‚Üí child process chains",
            "Registry key/value modification patterns",
            "Service manipulation (creation, deletion, status change)",
            "Suspicious file paths or locations (Temp dirs, uncommon drive paths)",
            "Event log deletion or manipulation",
            "Encoded or obfuscated values"
        ]
    },
    "output_format": {
        "behavioral_observables": "Array of unique observables with tags (e.g., process_cmdline, registry_pattern, service_command)",
        "observable_list": "Array of raw observable strings (deduplicated, plaintext)",
        "detection_queries": "Optional array of KQL/Sigma-like query fragments, if evident",
        "url": "Source URL of the original content",
        "content": "Concise extracted raw text that includes only the observables (e.g., attacker commands, registry paths, etc.)",
        "discrete_huntables_count": "Integer value representing the number of unique discrete observables extracted"
    },
    "platform_coverage": {
        "valid_sources": [
            "Windows: Sysmon, Security Logs",
            "Linux: auditd, Syslog",
            "macOS: EndpointSecurity, Unified Logs",
            "Cloud: AWS CloudTrail, Azure Activity Logs, GCP Audit Logs"
        ]
    },
    "instructions": "Read the threat report. Extract **only** telemetry-relevant attacker behaviors and observables that can be captured by EDR/logs. All observables must be unique. Remove any duplicates. Use exact strings or <placeholder> where appropriate. Output must be a **valid JSON object only** (no markdown, no explanations)."
}

# Editable ExtractAgent Instructions Template
# The template contains the placeholders {title}, {url}, {content} and {prompt_config}.
# NOTE(review): the extraction cell writes this template to a temporary file but
# passes only the config file's path to the LLM service, so edits made here may
# not reach the model — confirm how the service locates its instructions.
# NOTE(review): the quadruple braces render as single braces only if the text is
# formatted twice; a single str.format() pass leaves "{{" and "}}" — confirm
# against the service.
EXTRACT_AGENT_INSTRUCTIONS = """Title: {title}

URL: {url}

Content:

{content}

Extract telemetry-aware attacker behaviors and observables.

{prompt_config}

CRITICAL: Output your response as a valid JSON object only. Begin with {{{{ and end with }}}}. Do not include reasoning, explanations, or markdown outside the JSON object."""

print("üìù ExtractAgent Config:")
print(json.dumps(EXTRACT_AGENT_CONFIG, indent=2))

In [None]:
# Step 2: Extract Agent
async def test_extract_agent(
    title: str,
    content: str,
    url: str,
    extract_config: dict = None,
    instructions_template: str = None
):
    """Test extraction step with custom prompts."""
    print("üî¨ Running Extract Agent...")
    
    # Create temp prompt files
    import tempfile
    
    # Save config to temp file
    with tempfile.NamedTemporaryFile(mode='w', suffix='_ExtractAgent', delete=False) as f:
        json.dump(extract_config or EXTRACT_AGENT_CONFIG, f, indent=2)
        config_path = f.name
    
    # Save instructions template
    with tempfile.NamedTemporaryFile(mode='w', suffix='_ExtractAgentInstructions.txt', delete=False) as f:
        f.write(instructions_template or EXTRACT_AGENT_INSTRUCTIONS)
        instructions_path = f.name
    
    try:
        extraction_result = await llm_service.extract_behaviors(
            content=content,
            title=title,
            url=url,
            prompt_file_path=config_path
        )
        
        print(f"‚úÖ Extraction Result:")
        print(f"   Discrete Huntables: {extraction_result.get('discrete_huntables_count', 0)}")
        
        # Display observable_list (like curl command output)
        observable_list = extraction_result.get('observable_list', [])
        if observable_list:
            print(f"\n   üìã Observable List ({len(observable_list)} items):")
            for i, obs in enumerate(observable_list[:50], 1):  # Show first 50
                print(f"      {i:2d}. {obs}")
            if len(observable_list) > 50:
                print(f"      ... and {len(observable_list) - 50} more")
        
        # Display behavioral observables
        behavioral_obs = extraction_result.get('behavioral_observables', [])
        if behavioral_obs:
            print(f"\n   üéØ Behavioral Observables ({len(behavioral_obs)} items):")
            for i, obs in enumerate(behavioral_obs[:20], 1):  # Show first 20
                print(f"      {i:2d}. {obs}")
            if len(behavioral_obs) > 20:
                print(f"      ... and {len(behavioral_obs) - 20} more")
        
        # Display detection queries
        detection_queries = extraction_result.get('detection_queries', [])
        if detection_queries:
            print(f"\n   üîç Detection Queries ({len(detection_queries)} items):")
            for i, query in enumerate(detection_queries[:10], 1):  # Show first 10
                print(f"      {i:2d}. {query}")
            if len(detection_queries) > 10:
                print(f"      ... and {len(detection_queries) - 10} more")
        
        return extraction_result
        
    finally:
        # Clean up temp files
        os.unlink(config_path)
        os.unlink(instructions_path)

# Run extraction on the junk-filtered content with the editable config and
# instructions template defined in the previous cell.
extraction_result = await test_extract_agent(
    title=article['title'],
    content=filtered_content,
    url=article['url'],
    extract_config=EXTRACT_AGENT_CONFIG,
    instructions_template=EXTRACT_AGENT_INSTRUCTIONS
)

## Step 3: Generate SIGMA Rules

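Generates SIGMA detection rules. The prompt below is provided for reference and editing, but note that `test_generate_sigma` does not currently pass it to `SigmaGenerationService`, which loads its prompt from `src/prompts/`. Edits made here only take effect once the service (or its prompt file) is changed to use them.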

In [None]:
# Editable SIGMA Generation Prompt
# The template contains the placeholders {title}, {source}, {content} and {url}.
# The generation cell does not pass this text to the service (see the note inside
# test_generate_sigma about the service loading its prompt from src/prompts/).
# The doubled backslash in the example renders as a single backslash in the
# prompt text.
SIGMA_GENERATION_PROMPT = """Generate a SIGMA detection rule in valid YAML format.

Article: {title}
Source: {source}

Content:
{content}

CRITICAL: Output ONLY valid YAML. No explanatory text. No markdown blocks.

Example format (copy this structure exactly):

title: Suspicious PowerShell Execution
id: 12345678-1234-1234-1234-123456789abc
description: Detects suspicious PowerShell commands
logsource:
  category: process_creation
  product: windows
detection:
  selection:
    Image|endswith: '\\powershell.exe'
    CommandLine|contains: 'bypass'
  condition: selection
level: high
status: experimental
tags:
  - attack.execution
  - attack.t1059.001
references:
  - {url}

IMPORTANT FORMATTING RULES:
1. logsource MUST be indented with 2 spaces under the key
2. detection MUST be indented with 2 spaces under the key
3. tags MUST be a list with "- " prefix
4. Use lowercase for all field names
5. Start output with "title:" - no text before it"""

# Show only the first 200 characters as a sanity check of the current prompt.
print("üìù SIGMA Generation Prompt:")
print(SIGMA_GENERATION_PROMPT[:200] + "...")

In [None]:
# Step 3: Generate SIGMA Rules
async def test_generate_sigma(
    title: str,
    content: str,
    source: str,
    url: str,
    custom_prompt: str = None
):
    """Test SIGMA generation with custom prompt."""
    print("‚ö° Running SIGMA Generation...")
    
    # Note: SigmaGenerationService uses prompt_loader which loads from src/prompts/
    # For custom prompts, we'd need to modify the service or temporarily replace the prompt file
    
    generation_result = await sigma_service.generate_sigma_rules(
        article_title=title,
        article_content=content,
        source_name=source,
        url=url,
        ai_model=SIGMA_MODEL,
        max_attempts=3,
        min_confidence=0.9
    )
    
    rules = generation_result.get('rules', [])
    errors = generation_result.get('errors', [])
    
    print(f"‚úÖ SIGMA Generation Result:")
    print(f"   Rules Generated: {len(rules)}")
    
    if rules:
        print(f"\n   üìã SIGMA Rules:")
        for i, rule in enumerate(rules[:5], 1):  # Show first 5
            rule_title = rule.get('title', 'Untitled')
            rule_id = rule.get('id', 'No ID')
            print(f"      {i}. {rule_title} (ID: {rule_id})")
        if len(rules) > 5:
            print(f"      ... and {len(rules) - 5} more")
    
    if errors:
        print(f"\n   ‚ö†Ô∏è  Errors:")
        for error in errors[:3]:
            print(f"      - {error}")
    
    return generation_result

# Run SIGMA generation on the junk-filtered content.
# No custom prompt is passed here, so SIGMA_GENERATION_PROMPT is not involved in this call.
sigma_result = await test_generate_sigma(
    title=article['title'],
    content=filtered_content,
    source=article['source_name'],
    url=article['url']
)

## Step 4: Similarity Search

Searches for similar existing SIGMA rules in the repository.

In [None]:
# Step 4: Similarity Search
async def test_similarity_search(
    sigma_rules: list,
    max_results: int = 10
):
    """Test similarity search for generated SIGMA rules."""
    print("üîé Running Similarity Search...")
    
    if not sigma_rules:
        print("‚ö†Ô∏è  No SIGMA rules to search")
        return []
    
    all_results = []
    
    for rule in sigma_rules:
        rule_title = rule.get('title', 'Untitled')
        rule_description = rule.get('description', '')
        
        # Create query from rule title and description
        query = f"{rule_title} {rule_description}".strip()
        
        # Search for similar rules
        try:
            search_results = await rag_service.search_similar_sigma_rules(
                query=query,
                limit=max_results
            )
            
            all_results.append({
                'rule_title': rule_title,
                'similar_rules': search_results
            })
            
            print(f"\n   Rule: {rule_title}")
            print(f"   Similar rules found: {len(search_results)}")
            
            if search_results:
                max_sim = max((r.get('similarity', 0) for r in search_results), default=0)
                print(f"   Max similarity: {max_sim:.3f}")
                
                # Show top 3 results
                for i, result in enumerate(search_results[:3], 1):
                    sim = result.get('similarity', 0)
                    title = result.get('title', 'Unknown')
                    print(f"      {i}. {title} (similarity: {sim:.3f})")
        
        except Exception as e:
            print(f"   ‚ö†Ô∏è  Error searching for '{rule_title}': {e}")
    
    return all_results

# Run similarity search
# Fall back to an empty rule list when `sigma_result` has not been created in
# this session (at cell level, locals() is the notebook namespace).
sigma_rules = sigma_result.get('rules', []) if 'sigma_result' in locals() else []
similarity_results = await test_similarity_search(sigma_rules, max_results=10)

## Step 5: Promote to Queue

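Selects the rules that qualify for promotion: a rule is queued when its highest similarity to existing rules is below the threshold (i.e. it is unique enough). This cell only evaluates the promotion logic and returns the qualifying rules; it does not write anything to a queue.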

In [None]:
# Step 5: Promote to Queue
def test_promote_to_queue(
    sigma_rules: list,
    similarity_results: list,
    max_similarity_threshold: float = 0.7
):
    """Test queue promotion logic."""
    print("üì§ Running Queue Promotion Logic...")
    
    if not sigma_rules:
        print("‚ö†Ô∏è  No SIGMA rules to promote")
        return []
    
    queued_rules = []
    
    for i, rule in enumerate(sigma_rules):
        rule_title = rule.get('title', 'Untitled')
        
        # Get similarity results for this rule
        similarity_info = similarity_results[i] if i < len(similarity_results) else None
        
        if similarity_info and similarity_info.get('similar_rules'):
            max_sim = max((r.get('similarity', 0) for r in similarity_info['similar_rules']), default=0)
        else:
            max_sim = 0.0
        
        # Promote if similarity is below threshold
        if max_sim < max_similarity_threshold:
            queued_rules.append({
                'rule': rule,
                'max_similarity': max_sim,
                'reason': 'Low similarity to existing rules'
            })
            print(f"   ‚úÖ Queued: {rule_title} (max similarity: {max_sim:.3f})")
        else:
            print(f"   ‚ùå Skipped: {rule_title} (max similarity: {max_sim:.3f} >= {max_similarity_threshold})")
    
    print(f"\n‚úÖ Total rules queued: {len(queued_rules)}/{len(sigma_rules)}")
    
    return queued_rules

# Run queue promotion
# Uses the `sigma_rules` and `similarity_results` produced by the similarity
# search cell; rules whose max similarity is below 0.7 are selected.
queued_rules = test_promote_to_queue(
    sigma_rules,
    similarity_results,
    max_similarity_threshold=0.7
)

## Summary: Complete Workflow Results

Display comprehensive results similar to `trigger_workflow.py` output.

In [None]:
# Display comprehensive results (similar to trigger_workflow.py)
def display_workflow_results(
    article: dict,
    junk_filter_result: dict,
    ranking_result: dict,
    extraction_result: dict,
    sigma_result: dict,
    similarity_results: list,
    queued_rules: list
):
    """Print a consolidated, human-readable report of every workflow step.

    Args:
        article: Dict providing ``id``, ``title`` and ``source_name``.
        junk_filter_result: Dict providing ``filtered``, ``confidence``,
            ``original_length`` and ``filtered_length``.
        ranking_result: Dict providing ``score`` and ``should_continue``.
        extraction_result: Dict optionally providing ``discrete_huntables_count``,
            ``observable_list`` and ``behavioral_observables``.
        sigma_result: Dict optionally providing ``rules``.
        similarity_results: Per-rule dicts optionally providing ``similar_rules``.
        queued_rules: Rules selected for the queue (only the count is shown).

    Returns:
        None. The report is written to standard output.
    """
    banner = "=" * 80

    def _numbered_preview(heading, entries, cap, width):
        # Numbered listing capped at `cap` entries, followed by the overflow count.
        if not entries:
            return
        print(f"\n   {heading} ({len(entries)} items):")
        for position, entry in enumerate(entries[:cap], 1):
            print(f"      {position:{width}d}. {entry}")
        overflow = len(entries) - cap
        if overflow > 0:
            print(f"      ... and {overflow} more")

    print(banner)
    print("üìä COMPLETE WORKFLOW RESULTS")
    print(banner)

    # Article info
    print(f"\nüì∞ Article:")
    for label, key in (("ID", 'id'), ("Title", 'title'), ("Source", 'source_name')):
        print(f"   {label}: {article[key]}")

    # Junk Filter
    print(f"\nüîç Junk Filter:")
    print(f"   Filtered: {junk_filter_result['filtered']}")
    print(f"   Confidence: {junk_filter_result['confidence']:.3f}")
    length_before = junk_filter_result['original_length']
    length_after = junk_filter_result['filtered_length']
    print(f"   Length: {length_before:,} ‚Üí {length_after:,} chars")

    # Ranking
    print(f"\nüìà Ranking:")
    print(f"   Score: {ranking_result['score']:.1f}/10")
    print(f"   Should Continue: {ranking_result['should_continue']}")

    # Extraction
    print(f"\nüî¨ Extraction:")
    print(f"   Discrete Huntables: {extraction_result.get('discrete_huntables_count', 0)}")
    # Show first 100 observables, numbered three digits wide
    _numbered_preview("üìã Observable List", extraction_result.get('observable_list', []), 100, 3)
    # Show first 50 behavioral observables, numbered two digits wide
    _numbered_preview("üéØ Behavioral Observables", extraction_result.get('behavioral_observables', []), 50, 2)

    # SIGMA Rules
    generated = sigma_result.get('rules', [])
    print(f"\n‚ö° SIGMA Rules Generated: {len(generated)}")
    for position, generated_rule in enumerate(generated[:5], 1):
        print(f"   {position}. {generated_rule.get('title', 'Untitled')}")
    if len(generated) > 5:
        print(f"   ... and {len(generated) - 5} more")

    # Similarity Search
    if similarity_results:
        print(f"\nüîé Similarity Search:")
        matches_per_rule = [entry.get('similar_rules', []) for entry in similarity_results]
        print(f"   Total Similar Rules Found: {sum(len(matches) for matches in matches_per_rule)}")
        # Best score per rule, skipping rules without any match.
        best_scores = [
            max((match.get('similarity', 0) for match in matches), default=0)
            for matches in matches_per_rule
            if matches
        ]
        if best_scores:
            print(f"   Max Similarity: {max(best_scores):.3f}")

    # Queue Promotion
    print(f"\nüì§ Queue Promotion:")
    print(f"   Rules Queued: {len(queued_rules)}")

    print("\n" + banner)

# Display results
# Runs only when the junk filter, ranking and extraction results exist in this
# session; the later stages fall back to empty placeholders when their cells
# have not produced a result. `article` itself is not checked and must exist.
if 'junk_filter_result' in locals() and 'ranking_result' in locals() and 'extraction_result' in locals():
    display_workflow_results(
        article,
        junk_filter_result,
        ranking_result,
        extraction_result,
        sigma_result if 'sigma_result' in locals() else {'rules': []},
        similarity_results if 'similarity_results' in locals() else [],
        queued_rules if 'queued_rules' in locals() else []
    )

## Export Results

Export results to JSON for further analysis.

In [None]:
# Export results to JSON
def export_results(
    article: dict,
    junk_filter_result: dict,
    ranking_result: dict,
    extraction_result: dict,
    sigma_result: dict,
    similarity_results: list,
    queued_rules: list,
    filename: str = None
):
    """Export workflow results to a JSON file.

    Args:
        article: Dict providing ``id`` and ``title``.
        junk_filter_result: Result of the junk filter step.
        ranking_result: Result of the ranking step.
        extraction_result: Result of the extraction step.
        sigma_result: Result of the SIGMA generation step.
        similarity_results: Result of the similarity search step.
        queued_rules: Result of the queue promotion step.
        filename: Target path. When ``None``, a name of the form
            ``workflow_results_<article id>_<local YYYYmmdd_HHMMSS>.json`` is
            created in the working directory.

    Returns:
        The path the results were written to.
    """
    # Local import: only `datetime` is imported at notebook level.
    from datetime import timezone

    results = {
        'article_id': article['id'],
        'article_title': article['title'],
        # Timezone-aware UTC timestamp (ISO 8601 with "+00:00");
        # datetime.utcnow() is deprecated and returns a naive value.
        'timestamp': datetime.now(timezone.utc).isoformat(),
        'junk_filter_result': junk_filter_result,
        'ranking_result': ranking_result,
        'extraction_result': extraction_result,
        'sigma_result': sigma_result,
        'similarity_results': similarity_results,
        'queued_rules': queued_rules
    }

    if filename is None:
        filename = f"workflow_results_{article['id']}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"

    with open(filename, 'w', encoding='utf-8') as f:
        # default=str stringifies values json cannot encode natively (e.g. datetimes).
        json.dump(results, f, indent=2, default=str)

    print(f"‚úÖ Results exported to: {filename}")
    return filename

# Export if all results are available
# "All" means the junk filter, ranking and extraction results; the later stages
# fall back to empty placeholders when their cells have not produced a result.
# No filename is passed, so export_results generates a timestamped name.
if all(var in locals() for var in ['junk_filter_result', 'ranking_result', 'extraction_result']):
    export_filename = export_results(
        article,
        junk_filter_result,
        ranking_result,
        extraction_result,
        sigma_result if 'sigma_result' in locals() else {'rules': [], 'errors': []},
        similarity_results if 'similarity_results' in locals() else [],
        queued_rules if 'queued_rules' in locals() else []
    )