# CommandLine Observables Testing

Test different models (LLMs and embedding models) for counting CommandLine observables in CTI articles.

**Scope**: Only CommandLinePatterns category

**Model Types**:
- LLM models: LMStudio, Anthropic, OpenAI
- Embedding models: CTI-BERT (pattern-based extraction with semantic validation)

## 1. Setup and Imports

In [None]:
import sys
import os
import json
import asyncio
import re
from pathlib import Path
from typing import Dict, List, Any, Optional
from datetime import datetime
import pandas as pd
import httpx

# Resolve the project root so that `src.*` imports work no matter where the
# notebook kernel was started.
# Default location - update this path if your project is located elsewhere.
PROJECT_ROOT = Path('/Users/starlord/CTIScraper')

# Alternative: auto-detect from the current working directory.
if not PROJECT_ROOT.exists():
    current = Path.cwd()
    if current.name == 'notebooks':
        # Kernel started inside the notebooks/ subdirectory of the project.
        PROJECT_ROOT = current.parent
    else:
        # Check the current directory and up to four ancestors for src/database.
        candidates = [current, *current.parents][:5]
        detected_root = next(
            (path for path in candidates if (path / 'src' / 'database').exists()),
            None,
        )
        if detected_root is None:
            # Falling back to the default path here would only make os.chdir()
            # fail below with a less helpful message, so stop with a clear one.
            raise FileNotFoundError(
                f"Could not locate the project root: {PROJECT_ROOT} does not exist "
                f"and no 'src/database' directory was found at or above {current}. "
                "Update PROJECT_ROOT in this cell."
            )
        PROJECT_ROOT = detected_root

# Change to project root directory so relative paths used by the project resolve.
os.chdir(PROJECT_ROOT)

# Add to Python path so the `src` package is importable.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

# Project imports (must come after the sys.path adjustment above).
from src.database.manager import DatabaseManager
from src.database.models import ArticleTable
from src.utils.content_filter import ContentFilter
from src.services.llm_service import LLMService

print("✅ Successfully imported modules")
print(f"Project root: {PROJECT_ROOT}")
print(f"Working directory: {os.getcwd()}")
print(f"src exists: {(PROJECT_ROOT / 'src').exists()}")


In [None]:
# Sanity-check the environment prepared by the setup cell: report the paths in
# use and confirm that the project package can be imported.
import sys
from pathlib import Path

src_dir = PROJECT_ROOT / 'src'
database_dir = src_dir / 'database'
first_path_entry = sys.path[0] if sys.path else 'empty'

print(f"Current working directory: {Path.cwd()}")
print(f"Project root: {PROJECT_ROOT}")
print(f"sys.path[0]: {first_path_entry}")
print(f"src exists: {src_dir.exists()}")
print(f"src/database exists: {database_dir.exists()}")

# Import once more; on failure print hints instead of raising.
try:
    from src.database.manager import DatabaseManager
except ImportError as import_error:
    print(f"❌ Import failed: {import_error}")
    print("\nTroubleshooting:")
    print(f"  - Check if {src_dir} exists")
    print(f"  - Check if {database_dir} exists")
    print(f"  - Current working directory: {Path.cwd()}")
    print(f"  - Try running: cd {PROJECT_ROOT} && jupyter notebook")
else:
    print("✅ DatabaseManager imported successfully")

## 2. Configuration

In [None]:
# LLM model configurations.
# Each key is the short name used throughout this notebook; each value holds:
#   model_name  - identifier sent to the provider
#   provider    - which backend branch to use ('lmstudio', 'anthropic', 'openai')
#   description - human-readable label used in result tables
LLM_MODELS = {
    # LMStudio models
    'deepseek-r1-qwen3-8b': {
        'model_name': 'deepseek/deepseek-r1-0528-qwen3-8b',
        'provider': 'lmstudio',
        'description': 'DeepSeek R1 Qwen3 8B (reasoning)'
    },
    'mistral-7b': {
        'model_name': 'mistralai/mistral-7b-instruct-v0.3',
        'provider': 'lmstudio',
        'description': 'Mistral 7B Instruct'
    },
    'qwen2-7b': {
        'model_name': 'qwen2-7b-instruct',
        'provider': 'lmstudio',
        'description': 'Qwen2 7B Instruct'
    },
    'llama-3.1-8b': {
        'model_name': 'meta-llama-3.1-8b-instruct',
        'provider': 'lmstudio',
        'description': 'Llama 3.1 8B Instruct'
    },
    'granite-4-h-tiny': {
        # Publisher prefix is 'ibm/' (was misspelled 'bm/', which does not
        # resolve to a model).
        'model_name': 'ibm/granite-4-h-tiny',
        'provider': 'lmstudio',
        'description': 'Granite 4H Tiny'
    },

    # Anthropic models
    'claude-sonnet-4-5': {
        'model_name': 'claude-sonnet-4-5',
        'provider': 'anthropic',
        'description': 'Claude Sonnet 4.5'
    },
    'claude-haiku-4-5': {
        'model_name': 'claude-haiku-4-5',
        'provider': 'anthropic',
        'description': 'Claude Haiku 4.5'
    },

    # OpenAI models
    'gpt-4o-mini': {
        'model_name': 'gpt-4o-mini',
        'provider': 'openai',
        'description': 'GPT-4o Mini'
    },
    'gpt-5-mini': {
        'model_name': 'gpt-5-mini',
        'provider': 'openai',
        'description': 'GPT-5 Mini'
    },
    'gpt-5.1': {
        'model_name': 'gpt-5.1',
        'provider': 'openai',
        'description': 'GPT-5.1'
    }
}

# Embedding model configurations (same key/value layout, without a provider).
EMBEDDING_MODELS = {
    'cti-bert': {
        'model_name': 'ibm-research/CTI-BERT',
        'description': 'CTI-BERT (pattern + embedding validation)'
    }
}

# Test parameters shared by every run in this notebook.
TEMPERATURE = 0.0            # sampling temperature passed to the LLM calls
SEED = 42                    # seed passed to providers that accept one
JUNK_FILTER_THRESHOLD = 0.8  # min_confidence passed to ContentFilter.filter_content

print(f"Available LLM models: {len(LLM_MODELS)}")
print(f"Available embedding models: {len(EMBEDDING_MODELS)}")

## 3. Select Articles to Test

In [None]:
# Option 1: Select by article IDs
ARTICLE_IDS = [1937, 1909, 1866, 1860, 1794]  # Modify this list

# Option 2: Select by URLs (uncomment and modify)
# ARTICLE_URLS = [
#     "https://thedfirreport.com/2025/08/05/from-bing-search-to-ransomware-bumblebee-and-adaptixc2-deliver-akira/",
#     # Add more URLs here
# ]


def _article_to_record(article):
    """Convert an ArticleTable row into the plain dict used by the test cells."""
    return {
        'id': article.id,
        'title': article.title,
        'url': article.canonical_url,
        'content': article.content or ""
    }


# Load articles from database
db_manager = DatabaseManager()
db_session = db_manager.get_session()

articles = []
missing_articles = []  # requested IDs/URLs that had no matching row
try:
    # IDs take precedence; URLs are only used when no ID list is defined/non-empty.
    if 'ARTICLE_IDS' in locals() and ARTICLE_IDS:
        lookups = [(ArticleTable.id, article_id) for article_id in ARTICLE_IDS]
    elif 'ARTICLE_URLS' in locals() and ARTICLE_URLS:
        lookups = [(ArticleTable.canonical_url, url) for url in ARTICLE_URLS]
    else:
        lookups = []

    for column, wanted in lookups:
        article = db_session.query(ArticleTable).filter(column == wanted).first()
        if not article:
            # Remember the miss so it can be reported instead of silently skipped.
            missing_articles.append(wanted)
            continue
        articles.append(_article_to_record(article))
finally:
    # Always release the session, even if a query fails.
    db_session.close()

print(f"Loaded {len(articles)} articles:")
for article in articles:
    # Guard against a missing title so the summary never crashes on slicing.
    print(f"  [{article['id']}] {(article['title'] or '')[:60]}...")

if missing_articles:
    print(f"⚠️  Not found in database: {missing_articles}")

## 4. Select Models to Test

In [None]:
# LLM models to benchmark - edit this list (keys come from LLM_MODELS).
LLM_MODELS_TO_TEST = [
    'gpt-4o-mini',
    'claude-sonnet-4-5',
    'deepseek-r1-qwen3-8b',
    'mistral-7b',
    # Add more model keys from LLM_MODELS dict above
]

# Embedding models to benchmark - edit this list (keys come from EMBEDDING_MODELS).
EMBEDDING_MODELS_TO_TEST = [
    'cti-bert',
    # Add more embedding models if available
]

# Collect any selected keys that are not present in the configuration dicts.
invalid_llm = [key for key in LLM_MODELS_TO_TEST if key not in LLM_MODELS]
invalid_embedding = [key for key in EMBEDDING_MODELS_TO_TEST if key not in EMBEDDING_MODELS]
selection_is_valid = not (invalid_llm or invalid_embedding)

# Report unknown keys together with the keys that would have been accepted.
if invalid_llm:
    print(f"⚠️  Invalid LLM models: {invalid_llm}")
    print(f"Available LLM models: {list(LLM_MODELS.keys())}")
if invalid_embedding:
    print(f"⚠️  Invalid embedding models: {invalid_embedding}")
    print(f"Available embedding models: {list(EMBEDDING_MODELS.keys())}")

# Only print the run summary when every selected key is known.
if selection_is_valid:
    print(f"✅ Testing {len(LLM_MODELS_TO_TEST)} LLM models and {len(EMBEDDING_MODELS_TO_TEST)} embedding models:")
    for llm_key in LLM_MODELS_TO_TEST:
        llm_cfg = LLM_MODELS[llm_key]
        print(f"  LLM: {llm_key} - {llm_cfg['description']} ({llm_cfg['provider']})")
    for embedding_key in EMBEDDING_MODELS_TO_TEST:
        embedding_cfg = EMBEDDING_MODELS[embedding_key]
        print(f"  Embedding: {embedding_key} - {embedding_cfg['description']}")

## 5. Load Functions

The functions are available in `scripts/test_commandline_observables_standalone.py`.
You can either:
1. Import them from the script
2. Copy the function code into cells below

In [None]:
# Option 1: Import the counting functions from the standalone script instead of
# defining them in this notebook. The script lives in PROJECT_ROOT / 'scripts',
# which is put on sys.path first.
# Uncomment to use:
# import sys
# sys.path.insert(0, str(PROJECT_ROOT / 'scripts'))
# from test_commandline_observables_standalone import (
#     count_commandline_with_llm,
#     count_commandline_with_ctibert
# )

# Option 2: Define the functions directly in the notebook cells that follow.
# This cell itself only prints a reminder; it does not define anything.
print("Functions should be defined in cells below or imported from standalone script")

## 6. LLM-Based CommandLine Counting Function

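The cell below contains a copy of the `count_commandline_with_llm` function from `scripts/test_commandline_observables_standalone.py`. Run it to define the function in the notebook, or skip it if you imported the function in the previous section.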

In [None]:
def _extract_json_text(response_text: str) -> str:
    """Return the part of an LLM reply most likely to be the JSON payload.

    Prefers the body of a ```json fence, then of any ``` fence, and otherwise
    the span from the first '{' to the last '}'. The text is returned unchanged
    when none of these is found.
    """
    for fence in ("```json", "```"):
        fence_pos = response_text.find(fence)
        if fence_pos < 0:
            continue
        body_start = fence_pos + len(fence)
        body_end = response_text.find("```", body_start)
        if body_end < 0:
            # Unterminated fence (e.g. truncated output): take the rest of the
            # text rather than slicing with -1, which would drop the last char.
            body_end = len(response_text)
        return response_text[body_start:body_end].strip()

    first_brace = response_text.find('{')
    last_brace = response_text.rfind('}')
    if first_brace >= 0 and last_brace > first_brace:
        return response_text[first_brace:last_brace + 1]
    return response_text


def _json_from_reasoning(reasoning_content: str) -> str:
    """Return the last {...} object in reasoning text if it holds the expected key, else ''."""
    last_brace = reasoning_content.rfind('}')
    if last_brace <= 0:
        return ''
    json_start = reasoning_content.rfind('{', 0, last_brace)
    if json_start < 0:
        return ''
    candidate = reasoning_content[json_start:last_brace + 1]
    try:
        parsed = json.loads(candidate)
    except ValueError:
        # Not valid JSON - nothing usable in the reasoning trace.
        return ''
    if isinstance(parsed, dict) and 'CommandLinePatterns' in parsed:
        return candidate
    return ''


def _parse_commandline_count(response_text: str):
    """Parse a model reply into a (count, parse_success, error) tuple."""
    try:
        parsed = json.loads(_extract_json_text(response_text))
    except json.JSONDecodeError as e:
        return None, False, f"JSON parse error: {str(e)}"
    if not isinstance(parsed, dict):
        return None, False, f"Parse error: expected a JSON object, got {type(parsed).__name__}"

    # A missing key is treated as zero observables.
    count = parsed.get('CommandLinePatterns', 0)
    if isinstance(count, str) and count.isdigit():
        count = int(count)
    return count, True, None


async def count_commandline_with_llm(
    article_content: str,
    model_key: str,
    temperature: float = 0.0,
    seed: int = 42,
    junk_filter_threshold: float = 0.8,
    article_id: Optional[int] = None
) -> Dict[str, Any]:
    """Count CommandLine observables using specified LLM model.

    The article is passed through ContentFilter, then sent with a
    CommandLine-only counting prompt to the provider configured for
    ``model_key`` in ``LLM_MODELS`` (OpenAI, Anthropic or LMStudio).

    Args:
        article_content: Raw article text.
        model_key: Key into ``LLM_MODELS``; an unknown key raises ``KeyError``.
        temperature: Sampling temperature sent to the provider (not sent for
            'gpt-5-mini').
        seed: Seed sent to OpenAI and LMStudio when not ``None``.
        junk_filter_threshold: ``min_confidence`` passed to the content filter.
        article_id: Optional article ID forwarded to the content filter.

    Returns:
        A dict that always has ``count`` (``None`` on failure), ``parse_success``
        and ``error``. Successful calls also include ``raw_response``,
        ``reasoning_content`` and ``usage``; unexpected exceptions add
        ``traceback``.
    """

    model_config = LLM_MODELS[model_key]
    model_name = model_config['model_name']
    provider = model_config['provider']

    # Apply junk filter
    content_filter = ContentFilter()
    hunt_score = 0

    filter_result = content_filter.filter_content(
        article_content,
        min_confidence=junk_filter_threshold,
        hunt_score=hunt_score,
        article_id=article_id
    )
    # Fall back to the unfiltered text if the filter removed everything.
    filtered_content = filter_result.filtered_content or article_content

    # Build focused prompt for CommandLine only
    system_content = """You are an expert at identifying command-line observables in threat intelligence articles.

Your task: Count ONLY command-line patterns (executed command strings, flags, parameters) in the provided article.

Command-line patterns include:
- Executed command strings (e.g., "powershell.exe -enc", "cmd /c whoami")
- Command flags and parameters
- Script execution commands
- Shell commands

Do NOT count:
- Process names alone (without command-line)
- File paths (unless part of a command)
- Registry keys
- Network connections
- Other observables

Output ONLY valid JSON in this exact format:
{"CommandLinePatterns": <integer>, "Total": <integer>}

CRITICAL: Output ONLY the JSON object. No markdown, no explanations, no other text."""

    user_content = f"Article:\n\n{filtered_content}\n\nCount command-line observables and output JSON."

    # Call appropriate API based on provider
    response_text = ''
    reasoning_content = ''
    usage_info = {}

    try:
        if provider == 'openai':
            openai_api_key = os.getenv("OPENAI_API_KEY")
            if not openai_api_key:
                return {'error': 'OpenAI API key not configured', 'count': None, 'parse_success': False}

            # GPT-5 family models take max_completion_tokens instead of max_tokens.
            is_gpt5 = model_name.startswith('gpt-5')
            token_param = "max_completion_tokens" if is_gpt5 else "max_tokens"

            async with httpx.AsyncClient(timeout=300.0) as client:
                payload = {
                    "model": model_name,
                    token_param: 2000,
                    "messages": [
                        {"role": "system", "content": system_content},
                        {"role": "user", "content": user_content}
                    ]
                }
                if model_name != 'gpt-5-mini':
                    payload["temperature"] = temperature
                if seed is not None:
                    payload["seed"] = seed

                response = await client.post(
                    "https://api.openai.com/v1/chat/completions",
                    headers={
                        "Authorization": f"Bearer {openai_api_key}",
                        "Content-Type": "application/json"
                    },
                    json=payload
                )

                if response.status_code != 200:
                    try:
                        error_detail = response.json().get('error', {}).get('message', response.text)
                    except (ValueError, AttributeError):
                        # Error body was not the expected JSON shape; report it raw.
                        error_detail = response.text
                    return {'error': f"OpenAI API error: {error_detail}", 'count': None, 'parse_success': False}

                result = response.json()
                # `choices` may be empty and `content` may be null; normalise both.
                first_choice = (result.get('choices') or [{}])[0]
                response_text = (first_choice.get('message') or {}).get('content') or ''
                usage_info = result.get('usage', {})

        elif provider == 'anthropic':
            anthropic_api_key = os.getenv("ANTHROPIC_API_KEY")
            if not anthropic_api_key:
                return {'error': 'Anthropic API key not configured', 'count': None, 'parse_success': False}

            async with httpx.AsyncClient(timeout=300.0) as client:
                response = await client.post(
                    "https://api.anthropic.com/v1/messages",
                    headers={
                        "x-api-key": anthropic_api_key,
                        "Content-Type": "application/json",
                        "anthropic-version": "2023-06-01"
                    },
                    json={
                        "model": model_name,
                        "max_tokens": 2000,
                        "temperature": temperature,
                        "system": system_content,
                        "messages": [
                            {"role": "user", "content": user_content}
                        ]
                    }
                )

                if response.status_code != 200:
                    return {'error': f"Anthropic API error: {response.text}", 'count': None, 'parse_success': False}

                result = response.json()
                first_part = (result.get('content') or [{}])[0]
                response_text = first_part.get('text') or ''
                usage_info = result.get('usage', {})

        elif provider == 'lmstudio':
            # LLMService is steered through LMSTUDIO_MODEL_EXTRACT here, so
            # override the variable temporarily and restore it afterwards.
            original_extract = os.getenv("LMSTUDIO_MODEL_EXTRACT")

            try:
                os.environ["LMSTUDIO_MODEL_EXTRACT"] = model_name

                llm_service = LLMService()
                llm_service.temperature = temperature
                llm_service.seed = seed

                actual_model_name = llm_service.model_extract

                messages = [
                    {"role": "system", "content": system_content},
                    {"role": "user", "content": user_content}
                ]

                messages = llm_service._convert_messages_for_model(messages, actual_model_name)

                # Reasoning models get a larger token budget.
                is_reasoning_model = 'r1' in actual_model_name.lower() or 'reasoning' in actual_model_name.lower()
                max_tokens = 2000 if is_reasoning_model else 1500

                payload = {
                    "model": actual_model_name,
                    "messages": messages,
                    "max_tokens": max_tokens,
                    "temperature": temperature,
                    "top_p": llm_service.top_p,
                }

                if seed is not None:
                    payload["seed"] = seed

                result = await llm_service._post_lmstudio_chat(
                    payload,
                    model_name=actual_model_name,
                    timeout=300.0,
                    failure_context="CommandLine counting"
                )

                choice = (result.get('choices') or [{}])[0]
                message = choice.get('message') or {}
                reasoning_content = message.get('reasoning_content') or ''
                response_text = message.get('content') or ''
                usage_info = result.get('usage', {})

            finally:
                # Restore exactly what was there before, including an empty string.
                if original_extract is not None:
                    os.environ["LMSTUDIO_MODEL_EXTRACT"] = original_extract
                else:
                    os.environ.pop("LMSTUDIO_MODEL_EXTRACT", None)

        else:
            return {'error': f"Unsupported provider: {provider}", 'count': None, 'parse_success': False}

        # Extract JSON from reasoning if the model produced no regular content
        if reasoning_content and not response_text:
            response_text = _json_from_reasoning(reasoning_content)

        # Parse JSON
        count = None
        parse_success = False
        error = None
        if response_text:
            count, parse_success, error = _parse_commandline_count(response_text)

        return {
            'count': count,
            'parse_success': parse_success,
            'raw_response': response_text,
            'reasoning_content': reasoning_content if reasoning_content else None,
            'usage': usage_info,
            'error': error
        }

    except Exception as e:
        # Top-level boundary: report the failure in the result dict so one bad
        # call does not abort a whole test run.
        import traceback
        return {
            'error': str(e),
            'count': None,
            'parse_success': False,
            'traceback': traceback.format_exc()
        }

# ============================================================================
# CTI-BERT PATTERN-BASED COUNTING FUNCTION
# ============================================================================


## 7. CTI-BERT Pattern-Based CommandLine Counting Function

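The cell below contains a copy of the `count_commandline_with_ctibert` function from `scripts/test_commandline_observables_standalone.py`. Run it to define the function in the notebook, or skip it if you imported the function in section 5.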

In [None]:
def count_commandline_with_ctibert(
    article_content: str,
    model_key: str = 'cti-bert',
    junk_filter_threshold: float = 0.8,
    article_id: Optional[int] = None,
    use_embedding_validation: bool = True
) -> Dict[str, Any]:
    """Count CommandLine observables using CTI-BERT pattern matching + optional embedding validation.

    Counting is regex-based: the filtered article is scanned with a fixed set
    of command-line patterns and whitespace-normalised matches are
    de-duplicated. When ``use_embedding_validation`` is true, the similarity
    between the first 1000 characters and a fixed cybersecurity phrase is used
    to set a confidence value; it does not change the count.

    Args:
        article_content: Raw article text.
        model_key: Accepted for symmetry with the LLM counter; not used here.
        junk_filter_threshold: ``min_confidence`` passed to the content filter.
        article_id: Optional article ID forwarded to the content filter.
        use_embedding_validation: Whether to compute the embedding-based
            confidence. CTI-BERT and torch are only imported when this is true.

    Returns:
        On success a dict with ``count``, ``validated_count`` (currently equal
        to ``count``), ``confidence``, ``parse_success``, ``raw_matches`` (first
        10 unique matches), ``total_matches_found``, ``error`` and
        ``validation_error`` (message if the embedding step failed, else
        ``None``). If validation is requested but its imports fail, a dict with
        ``error``, ``count=None`` and ``parse_success=False``.
    """

    if use_embedding_validation:
        # Only needed for the embedding step, so pattern-only runs work
        # without torch / CTI-BERT installed.
        try:
            from src.utils.ctibert_ner_extractor import CTIBERTNERExtractor
            import torch
        except ImportError as e:
            return {
                'error': f'CTI-BERT import failed: {str(e)}',
                'count': None,
                'parse_success': False
            }

    # Apply junk filter
    content_filter = ContentFilter()
    hunt_score = 0

    filter_result = content_filter.filter_content(
        article_content,
        min_confidence=junk_filter_threshold,
        hunt_score=hunt_score,
        article_id=article_id
    )
    # Fall back to the unfiltered text if the filter removed everything.
    filtered_content = filter_result.filtered_content or article_content

    # Command-line pattern regexes
    command_patterns = [
        # PowerShell commands
        r'powershell\.exe\s+(?:-enc|-command|-c|-encodedcommand|-e|-executionpolicy|\s+[^\s]+)',
        r'pwsh\s+(?:-enc|-command|-c|-encodedcommand|-e|-executionpolicy|\s+[^\s]+)',

        # CMD commands
        r'cmd\.exe\s+/[cC]\s+[^\n]+',
        r'cmd\s+/[cC]\s+[^\n]+',

        # Bash/Shell commands. The second pattern is anchored at the start of
        # the shell's name (sh, bash, zsh, ...): an unanchored 'sh' also matched
        # the tail of 'bash -c ...' and counted each such command twice.
        r'bash\s+-c\s+[^\n]+',
        r'\b\w*sh\s+-c\s+[^\n]+',

        # Generic executable with flags
        r'\b[a-zA-Z0-9_\-]+\.exe\s+(?:-[a-zA-Z]+|/[a-zA-Z]+|\s+[^\s]+){1,}',

        # Script execution
        r'python\s+[^\s]+\.py\s+[^\n]+',
        r'python3\s+[^\s]+\.py\s+[^\n]+',
        r'perl\s+[^\s]+\.pl\s+[^\n]+',
        r'ruby\s+[^\s]+\.rb\s+[^\n]+',

        # Common command patterns
        r'\b(?:wmic|reg|schtasks|sc|net|netsh|bcdedit|dism|sfc)\s+[^\n]+',

        # Base64 encoded commands
        r'powershell.*-enc\s+[A-Za-z0-9+/=]{20,}',
    ]

    # Collect matches, de-duplicating on whitespace-normalised text while
    # preserving first-seen order. Very short matches (<= 5 chars) are dropped.
    unique_commands = []
    seen = set()
    for pattern in command_patterns:
        for match in re.finditer(pattern, filtered_content, re.IGNORECASE | re.MULTILINE):
            normalized = re.sub(r'\s+', ' ', match.group(0).strip())
            if normalized not in seen and len(normalized) > 5:
                seen.add(normalized)
                unique_commands.append(normalized)

    count = len(unique_commands)

    # Optional: use CTI-BERT embeddings to derive a confidence value.
    validated_count = count  # no per-command validation is performed
    confidence = 1.0
    validation_error = None

    if use_embedding_validation and count > 0:
        try:
            extractor = CTIBERTNERExtractor(use_gpu=True)

            # Generate embedding for article content (for context)
            content_sample = filtered_content[:1000]
            content_embedding = extractor._get_embedding(content_sample)

            # Generate embedding for cybersecurity context
            cti_context_embedding = extractor._get_embedding("cybersecurity threat intelligence malware attack command execution")

            # Calculate similarity
            similarity = torch.nn.functional.cosine_similarity(
                content_embedding.unsqueeze(0),
                cti_context_embedding.unsqueeze(0)
            ).item()

            # Adjust confidence based on similarity
            if similarity < 0.3:
                confidence = 0.7
            elif similarity < 0.5:
                confidence = 0.85
            else:
                confidence = 0.95

        except Exception as e:
            # Validation is best-effort: keep the regex count, lower the
            # confidence, and record why validation failed instead of hiding it.
            confidence = 0.8
            validation_error = f"{type(e).__name__}: {e}"

    return {
        'count': count,
        'validated_count': validated_count,
        'confidence': confidence,
        'parse_success': True,
        'raw_matches': unique_commands[:10],
        'total_matches_found': len(unique_commands),
        'error': None,
        'validation_error': validation_error
    }

# ============================================================================
# MAIN EXECUTION
# ============================================================================


## 8. Run Tests

In [None]:
# Run tests for all article-model combinations
results = []

# Test LLM models
for article in articles:
    for model_key in LLM_MODELS_TO_TEST:
        print(f"Testing Article {article['id']} with LLM {model_key}...", end=" ")
        
        result = await count_commandline_with_llm(
            article_content=article['content'],
            model_key=model_key,
            temperature=TEMPERATURE,
            seed=SEED,
            junk_filter_threshold=JUNK_FILTER_THRESHOLD,
            article_id=article['id']
        )
        
        results.append({
            'article_id': article['id'],
            'article_title': article['title'],
            'article_url': article['url'],
            'model_key': model_key,
            'model_description': LLM_MODELS[model_key]['description'],
            'model_type': 'llm',
            'provider': LLM_MODELS[model_key]['provider'],
            'commandline_count': result.get('count'),
            'parse_success': result.get('parse_success', False),
            'error': result.get('error'),
            'raw_response': result.get('raw_response', '')[:200],
            'usage': result.get('usage', {})
        })
        
        status = "✅" if result.get('parse_success') else "❌"
        count = result.get('count', 'N/A')
        print(f"{status} Count: {count}")

# Test embedding models
for article in articles:
    for model_key in EMBEDDING_MODELS_TO_TEST:
        print(f"Testing Article {article['id']} with Embedding {model_key}...", end=" ")
        
        result = count_commandline_with_ctibert(
            article_content=article['content'],
            model_key=model_key,
            junk_filter_threshold=JUNK_FILTER_THRESHOLD,
            article_id=article['id'],
            use_embedding_validation=True
        )
        
        results.append({
            'article_id': article['id'],
            'article_title': article['title'],
            'article_url': article['url'],
            'model_key': model_key,
            'model_description': EMBEDDING_MODELS[model_key]['description'],
            'model_type': 'embedding',
            'provider': 'cti-bert',
            'commandline_count': result.get('count'),
            'validated_count': result.get('validated_count'),
            'confidence': result.get('confidence'),
            'parse_success': result.get('parse_success', False),
            'error': result.get('error'),
            'raw_matches': result.get('raw_matches', []),
            'usage': {}
        })
        
        status = "✅" if result.get('parse_success') else "❌"
        count = result.get('count', 'N/A')
        confidence = result.get('confidence', 'N/A')
        print(f"{status} Count: {count} (confidence: {confidence})")

print(f"\n✅ Completed {len(results)} tests")

## 9. Display Results

In [None]:
def _show_count_pivot(frame, heading):
    """Pivot CommandLine counts (articles x models), print a banner, display and return the table."""
    pivot = frame.pivot_table(
        index=['article_id', 'article_title'],
        columns='model_description',
        values='commandline_count',
        aggfunc='first'
    )

    print("\n" + "="*80)
    print(heading)
    print("="*80)
    display(pivot)
    return pivot


# Create DataFrame for easy viewing
df = pd.DataFrame(results)

# Separate LLM and embedding results
if df.empty:
    # With no results there is no 'model_type' column to filter on; indexing
    # it would raise KeyError, so use empty frames instead.
    llm_df = df.copy()
    embedding_df = df.copy()
    print("No results to display - run the tests first.")
else:
    llm_df = df[df['model_type'] == 'llm'].copy()
    embedding_df = df[df['model_type'] == 'embedding'].copy()

# Pivot table for LLM models
if len(llm_df) > 0:
    llm_pivot = _show_count_pivot(llm_df, "LLM MODELS - COMMANDLINE COUNTS BY ARTICLE")

# Pivot table for embedding models
if len(embedding_df) > 0:
    embedding_pivot = _show_count_pivot(embedding_df, "EMBEDDING MODELS - COMMANDLINE COUNTS BY ARTICLE")

# Combined comparison (only meaningful when both model types produced rows)
if len(llm_df) > 0 and len(embedding_df) > 0:
    combined_pivot = _show_count_pivot(df, "COMBINED COMPARISON - ALL MODELS")

## 10. Export Results

In [None]:
# Persist this run under outputs/notebook_results as a timestamped JSON file
# (config + raw results) and a CSV file (the results DataFrame).
output_dir = PROJECT_ROOT / "outputs" / "notebook_results"
output_dir.mkdir(parents=True, exist_ok=True)

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_file = output_dir / f"commandline_observables_{timestamp}.json"
# Same file stem as the JSON export, different extension.
csv_file = output_file.with_suffix(".csv")

# Record the parameters of the run alongside the results.
test_config = {
    'temperature': TEMPERATURE,
    'seed': SEED,
    'junk_filter_threshold': JUNK_FILTER_THRESHOLD,
    'llm_models_tested': LLM_MODELS_TO_TEST,
    'embedding_models_tested': EMBEDDING_MODELS_TO_TEST,
    'article_ids': [entry['id'] for entry in articles]
}
export_data = {
    'timestamp': timestamp,
    'test_config': test_config,
    'results': results
}

# JSON export
with output_file.open('w') as handle:
    json.dump(export_data, handle, indent=2)
print(f"✅ Results exported to: {output_file}")

# CSV export
df.to_csv(csv_file, index=False)
print(f"✅ CSV exported to: {csv_file}")