# LLM Testing Notebook
Test different LLM models (served by a local LMStudio instance) against articles fetched from the local web API

In [1]:
import httpx
import os
from datetime import datetime
import csv

# Fetch a single article record from the local articles API
def get_article_from_api(article_id):
    """Retrieve one article from the local web API.

    Args:
        article_id: Identifier interpolated into the ``/api/articles/<id>`` path.

    Returns:
        The decoded JSON body when the API answers with HTTP 200. ``None``
        when it answers with any other status, or when the request or the
        JSON decoding raises; a diagnostic line is printed in both cases.
    """
    endpoint = f"http://localhost:8001/api/articles/{article_id}"
    try:
        reply = httpx.get(endpoint, timeout=10.0)
        if reply.status_code != 200:
            print(f"❌ API Error: {reply.status_code} - {reply.text}")
            return None
        return reply.json()
    except Exception as err:
        # Every failure inside the try block is reported the same way
        print(f"❌ Connection error: {err}")
        return None

# Smoke test: fetch a known article to confirm the API is reachable
print("Testing API connection...")
article = get_article_from_api(68)
if not article:
    print("❌ Connection failed")
else:
    print(f"✅ Connected! Article: {article['title'][:50]}...")
import time


Testing API connection...
✅ Connected! Article: Velociraptor WSUS Exploitation, Pt. I: WSUS-Up?...


In [2]:
# Ask LMStudio which models it currently exposes
def get_lmstudio_models():
    """List the model ids reported by the local LMStudio server.

    Returns:
        The ``"id"`` of every entry in the ``"data"`` array returned by
        ``GET /models`` (an absent ``"data"`` key yields an empty list).
        An empty list is also returned, after printing a diagnostic, when the
        server answers with a non-200 status or anything in the call raises.
    """
    base_url = "http://localhost:1234/v1"
    try:
        reply = httpx.get(f"{base_url}/models", timeout=5.0)
        if reply.status_code != 200:
            print(f"❌ LMStudio error: {reply.status_code}")
            return []
        catalog = reply.json()
        model_ids = []
        for entry in catalog.get("data", []):
            model_ids.append(entry["id"])
        return model_ids
    except Exception as err:
        print(f"❌ LMStudio connection error: {err}")
        return []

# Test LMStudio
print("Testing LMStudio connection...")
models = get_lmstudio_models()
if models:
    print(f"✅ Found {len(models)} models")
    print(f"First model: {models[0]}")
else:
    # get_lmstudio_models() returns [] on every failure path, so an empty
    # list must not be reported with a success marker.
    print("⚠️ No models available from LMStudio")

Testing LMStudio connection...
✅ Found 20 models
First model: openai/gpt-oss-20b


In [3]:
# Test LLM call
def test_llm(model, prompt, article):
    lmstudio_url = "http://localhost:1234/v1"
    
    full_prompt = f"""Article Title: {article['title']}
Article URL: {article.get('canonical_url', '')}

Article Content:
{article['content'][:4000]}...

Task: {prompt}"""
    
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": full_prompt}],
        "temperature": 0.7,
        "max_tokens": 1000
    }
    
    start = time.perf_counter()
    try:
        response = httpx.post(f"{lmstudio_url}/chat/completions", json=payload, timeout=60.0)
        duration = time.perf_counter() - start
        if response.status_code == 200:
            result = response.json()
            content = result["choices"][0]["message"]["content"]
            return {"status": "success", "response": content, "length": len(content), "duration_sec": duration}
        else:
            return {"status": "error", "response": f"API Error: {response.status_code}", "length": 0, "duration_sec": duration}
    except Exception as e:
        duration = time.perf_counter() - start
        return {"status": "error", "response": f"Exception: {str(e)}", "length": 0, "duration_sec": duration}


🤖 Testing model: openai/gpt-oss-20b
✅ SUCCESS!
Response: **Extracted Command‑Line Statements**

```json
{
  "cmdlines": [
    "\"C:\\Program Files\\Velociraptor\\Velociraptor.exe\" --config \"C:\\Program Files\\Velociraptor\\/client.config.yaml\" service ru...


# Enhanced LLM Testing
Select multiple models, custom articles, and get CSV output

In [4]:
from pathlib import Path

# Load default CmdlineExtract prompt from repo files; fall back to a short default if missing.
def load_cmdline_prompt():
    """Return the CmdlineExtract prompt text.

    Looks for ``src/prompts/CmdlineExtract`` and then ``prompts/CmdlineExtract``
    (relative to the working directory) and returns the stripped contents of
    the first one that is a regular file. When neither is, a built-in default
    prompt is returned.
    """
    candidates = [
        Path("src/prompts/CmdlineExtract"),
        Path("prompts/CmdlineExtract"),
    ]
    for p in candidates:
        # is_file() skips a directory that happens to carry the prompt's name
        # (read_text would raise on it); the explicit encoding keeps the text
        # independent of the platform's default.
        if p.is_file():
            return p.read_text(encoding="utf-8").strip()
    return """You are a specialized extraction agent focused on extracting explicit Windows command-line observables from threat intelligence articles. Extract only literal Windows command lines with an executable/system utility plus at least one argument/switch/parameter/pipeline/redirection. Respond with a JSON object: {"cmdline_items": [...], "count": <int>, "qa_corrections": {"removed": [], "added": [], "summary": "None."}}. Use double backslashes for Windows paths."""

# Load CmdLine QA prompt (mirrors workflow QA agent)
def load_cmdline_qa_prompt():
    """Return the CmdLine QA prompt text.

    Looks for ``src/prompts/CmdLineQA`` and then ``prompts/CmdLineQA``
    (relative to the working directory) and returns the stripped contents of
    the first one that is a regular file. When neither is, a built-in default
    prompt is returned.
    """
    candidates = [
        Path("src/prompts/CmdLineQA"),
        Path("prompts/CmdLineQA"),
    ]
    for p in candidates:
        # is_file() skips a directory that happens to carry the prompt's name
        # (read_text would raise on it); the explicit encoding keeps the text
        # independent of the platform's default.
        if p.is_file():
            return p.read_text(encoding="utf-8").strip()
    return "Review extracted command-lines. If any are invalid or missing, provide added/removed entries in qa_corrections and return the corrected list. Respond with JSON matching {\"cmdline_items\": [...], \"count\": <int>, \"qa_corrections\": {\"removed\": [], \"added\": [], \"summary\": \"None.\"}}."

# Resolve both prompts once at cell execution; they are used as the initial
# text of the Cmd and QA prompt widgets.
CMDLINE_PROMPT = load_cmdline_prompt()
CMDLINE_QA_PROMPT = load_cmdline_qa_prompt()


In [5]:
import pandas as pd
from datetime import datetime
import ipywidgets as widgets
from IPython.display import display, clear_output

# One row per model call, shared by the extract and QA runs
results_df = pd.DataFrame(columns=["timestamp", "agent", "article_id", "article_title", "model", "prompt", "response", "response_length", "duration_sec", "status"])

# Models discovered by the LMStudio cell, or nothing when that cell was not run
_model_options = models if "models" in globals() else []


def _model_picker(label):
    """Build a multi-select list over the discovered models."""
    return widgets.SelectMultiple(
        options=_model_options,
        description=label,
        layout=widgets.Layout(width="100%", height="150px"),
    )


def _prompt_box(text, hint, label):
    """Build a full-width text area pre-filled with a prompt."""
    return widgets.Textarea(
        value=text,
        placeholder=hint,
        description=label,
        layout=widgets.Layout(width="100%", height="120px"),
    )


def _action_button(label, style, hint):
    """Build a button with the given caption, style and tooltip."""
    return widgets.Button(description=label, button_style=style, tooltip=hint)


# UI Widgets
article_input = widgets.Text(
    value="68",
    placeholder="Enter article ID",
    description="Article ID:",
    layout=widgets.Layout(width="200px"),
)

model_select = _model_picker("Cmd Models:")
prompt_input = _prompt_box(CMDLINE_PROMPT, "Enter your prompt here...", "Cmd Prompt:")

qa_model_select = _model_picker("QA Models:")
qa_prompt_input = _prompt_box(CMDLINE_QA_PROMPT, "Enter QA prompt...", "QA Prompt:")

run_button = _action_button("Run Cmd Tests", "primary", "Run LLM tests with selected parameters")
qa_run_button = _action_button("Run QA Tests", "info", "Run QA tests with selected parameters")
csv_button = _action_button("Save CSV", "success", "Save results to CSV file")

output_area = widgets.Output()

print("Enhanced testing interface ready!")

# Most recent successful extraction outputs, kept so the QA run can review them
last_cmdline_outputs = []


Enhanced testing interface ready!


In [6]:
# Enhanced testing functions

def run_enhanced_tests(article_id, selected_models, prompt):
    """Run the Cmdline Extract prompt against each selected model for one article.

    Fetches the article, calls ``test_llm`` once per model and appends one row
    per call to the global ``results_df``. The global ``last_cmdline_outputs``
    is reset on entry and then filled with the successful responses.
    Progress is printed; nothing is returned.

    Args:
        article_id: Article identifier passed to ``get_article_from_api``.
        selected_models: Sequence of model ids to query.
        prompt: Task prompt handed to ``test_llm``.
    """
    global results_df, last_cmdline_outputs
    # Reset before fetching, so a failed fetch also discards earlier outputs
    last_cmdline_outputs = []

    print(f"Getting article {article_id}...")
    article = get_article_from_api(article_id)
    if not article:
        print(f"❌ Article {article_id} not found")
        return

    total = len(selected_models)
    title = article['title']
    print(f"✅ Article: {title[:60]}...")
    print(f"📏 Content: {len(article['content'])} chars")
    print(f"🤖 Testing {total} models × 1 prompt (Cmdline Extract)")
    print()

    for position, model in enumerate(selected_models, 1):
        short_name = model.split('/')[-1]
        print(f"🧠 Cmd Agent Model {position}/{total}: {short_name}")

        outcome = test_llm(model, prompt, article)
        elapsed = outcome.get('duration_sec', 0.0)

        row = {
            "timestamp": datetime.now().isoformat(),
            "agent": "CmdlineExtract",
            "article_id": article_id,
            "article_title": title[:100],
            "model": model,
            "prompt": prompt,
            "response": outcome['response'],
            "response_length": outcome['length'],
            "duration_sec": elapsed,
            "status": outcome['status'],
        }
        results_df.loc[len(results_df)] = row

        if outcome['status'] != 'success':
            print(f"    ❌ {outcome['response']}")
            continue

        # Only successful outputs are kept for the QA run
        last_cmdline_outputs.append({"model": model, "response": outcome['response']})
        print(f"    ✅ Success ({outcome['length']} chars) in {elapsed:.2f}s")

    print(f"📊 Completed {total} Cmd tests!")
    print(f"Results stored in DataFrame ({len(results_df)} total rows)")


def run_cmdline_qa_tests(article_id, selected_models, prompt):
    """Run the CmdLine QA prompt against each selected model for one article.

    The QA prompt is extended with the responses held in the global
    ``last_cmdline_outputs`` (or a placeholder sentence when there are none).
    Each model is called through ``test_llm`` up to twice: a second attempt is
    made when the first is not a success or its response is blank. One row per
    model, built from the last attempt, is appended to the global
    ``results_df``. Progress is printed; nothing is returned.

    Args:
        article_id: Article identifier passed to ``get_article_from_api``.
        selected_models: Sequence of model ids to query.
        prompt: QA instructions placed before the extraction context.
    """
    global results_df, last_cmdline_outputs

    print(f"Getting article {article_id} for QA...")
    article = get_article_from_api(article_id)
    if not article:
        print(f"❌ Article {article_id} not found")
        return

    if last_cmdline_outputs:
        extraction_context = "\n\n".join(
            f"Model: {item['model']}\nOutput:\n{item['response']}"
            for item in last_cmdline_outputs
        )
    else:
        # The run still proceeds, with a placeholder instead of extractor output
        print("⚠️ No extractor outputs available. Run Cmdline Extract tests first.")
        extraction_context = "No extractor outputs captured."

    # Identical for every model, so it is assembled once up front
    qa_prompt_with_context = f"{prompt}\n\nExtracted command-lines to evaluate:\n{extraction_context}"
    max_attempts = 2
    total = len(selected_models)

    print(f"🤖 Testing {total} models × 1 prompt (CmdLine QA)")
    print()

    for position, model in enumerate(selected_models, 1):
        short_name = model.split('/')[-1]
        print(f"🧠 QA Agent Model {position}/{total}: {short_name}")

        outcome = None
        for attempt in range(1, max_attempts + 1):
            outcome = test_llm(model, qa_prompt_with_context, article)
            usable = outcome.get('status') == 'success' and outcome.get('response', '').strip()
            if usable or attempt == max_attempts:
                break
            print(f"    ⚠️ QA attempt {attempt} returned empty/failed (status={outcome.get('status')}, len={len(outcome.get('response',''))}); retrying...")

        row = {
            "timestamp": datetime.now().isoformat(),
            "agent": "CmdLineQA",
            "article_id": article_id,
            "article_title": article['title'][:100],
            "model": model,
            "prompt": qa_prompt_with_context,
            "response": outcome.get('response', ''),
            "response_length": outcome.get('length', 0),
            "duration_sec": outcome.get('duration_sec', 0.0),
            "status": outcome.get('status'),
        }
        results_df.loc[len(results_df)] = row

        if outcome.get('status') != 'success':
            print(f"    ❌ {outcome.get('response','')}")
        else:
            print(f"    ✅ QA Success ({outcome.get('length',0)} chars) in {outcome.get('duration_sec',0):.2f}s")

    print(f"📊 Completed {total} QA tests!")
    print(f"Results stored in DataFrame ({len(results_df)} total rows)")

# Button handlers
def on_run_clicked(b):
    """Click handler that starts a Cmdline Extract run from the widget values.

    Clears the output area, reads the article id, the selected models and the
    prompt, and calls ``run_enhanced_tests`` with them. If any of the three is
    empty, the first missing one is reported and nothing is run.

    Args:
        b: The clicked button (unused).
    """
    with output_area:
        clear_output()
        article_id = article_input.value.strip()
        selected_models = list(model_select.value)
        prompt = prompt_input.value.strip()

        # Checked in this order; the first empty input aborts the run
        required = (
            (article_id, "❌ Please enter an article ID"),
            (selected_models, "❌ Please select at least one model"),
            (prompt, "❌ Please enter a prompt"),
        )
        for supplied, complaint in required:
            if not supplied:
                print(complaint)
                return

        print("🚀 Starting Cmdline Extract tests...")
        run_enhanced_tests(article_id, selected_models, prompt)


def on_qa_run_clicked(b):
    """Click handler that starts a CmdLine QA run from the widget values.

    Clears the output area, reads the article id, the selected QA models and
    the QA prompt, and calls ``run_cmdline_qa_tests`` with them. If any of the
    three is empty, the first missing one is reported and nothing is run.

    Args:
        b: The clicked button (unused).
    """
    with output_area:
        clear_output()
        article_id = article_input.value.strip()
        selected_models = list(qa_model_select.value)
        prompt = qa_prompt_input.value.strip()

        # Checked in this order; the first empty input aborts the run
        required = (
            (article_id, "❌ Please enter an article ID"),
            (selected_models, "❌ Please select at least one QA model"),
            (prompt, "❌ Please enter a QA prompt"),
        )
        for supplied, complaint in required:
            if not supplied:
                print(complaint)
                return

        print("🚀 Starting CmdLine QA tests...")
        run_cmdline_qa_tests(article_id, selected_models, prompt)


def on_csv_clicked(b):
    """Click handler that writes the accumulated results to a CSV file.

    The file is named ``llm_comparison_<YYYYmmdd_HHMMSS>.csv`` and written
    without the index column. When ``results_df`` is empty a message is
    printed and no file is written.

    Args:
        b: The clicked button (unused).
    """
    with output_area:
        if not results_df.empty:
            stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            filename = f"llm_comparison_{stamp}.csv"
            results_df.to_csv(filename, index=False)
            print(f"✅ Saved {len(results_df)} results to {filename}")
        else:
            print("❌ No results to save")

# Wire each button to its click handler
for _button, _handler in (
    (run_button, on_run_clicked),
    (qa_run_button, on_qa_run_clicked),
    (csv_button, on_csv_clicked),
):
    _button.on_click(_handler)

print("Button handlers connected!")


Button handlers connected!


In [7]:
# Stack the controls top-to-bottom and render them
ui_children = [
    widgets.HTML("<h3>🚀 Enhanced LLM Testing</h3>"),
    article_input,
    # Extraction section: model picker, prompt, run + save buttons
    widgets.HTML("<strong>Cmdline Extract</strong>"),
    model_select,
    prompt_input,
    widgets.HBox([run_button, csv_button]),
    # QA section: model picker, prompt, run button
    widgets.HTML("<strong>CmdLine QA</strong>"),
    qa_model_select,
    qa_prompt_input,
    qa_run_button,
    # Shared output area that the button handlers print into
    output_area,
]
display(widgets.VBox(ui_children))

print("Enhanced UI ready! Select your options and click Run Tests.")


VBox(children=(HTML(value='<h3>🚀 Enhanced LLM Testing</h3>'), Text(value='68', description='Article ID:', layo…

Enhanced UI ready! Select your options and click Run Tests.


In [None]:
# Show everything collected so far, or a hint when nothing has been run yet
if results_df.empty:
    print("No results yet. Run some tests first!")
else:
    print(f"📊 Current results: {len(results_df)} tests")
    display(results_df)