# LLM Testing Notebook
Test different LLM models against articles fetched from the local web API

In [15]:
!pip install sentence-transformers

import httpx
import json
import os
from datetime import datetime
import csv

# Function to get article from API
def get_article_from_api(article_id):
    """Fetch one article from the local articles API.

    Sends a GET to ``http://localhost:8001/api/articles/<article_id>`` with a
    10 second timeout.

    Args:
        article_id: Identifier interpolated into the request URL.

    Returns:
        The decoded JSON body on HTTP 200. ``None`` on any other status code
        or when the request / JSON decoding raises; failures are printed,
        never raised.
    """
    endpoint = f"http://localhost:8001/api/articles/{article_id}"
    try:
        reply = httpx.get(endpoint, timeout=10.0)
        if reply.status_code != 200:
            print(f"‚ùå API Error: {reply.status_code} - {reply.text}")
            return None
        return reply.json()
    except Exception as err:
        print(f"‚ùå Connection error: {err}")
        return None

# Smoke-test the article API with article 68 and report the outcome.
print("Testing API connection...")
article = get_article_from_api(68)
print(f"‚úÖ Connected! Article: {article['title'][:50]}..." if article else "‚ùå Connection failed")
import time



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Testing API connection...
‚úÖ Connected! Article: Velociraptor WSUS Exploitation, Pt. I: WSUS-Up?...


In [16]:
# Get available models from LMStudio
def get_lmstudio_models():
    """List the model IDs exposed by the local LM Studio server.

    Queries ``http://localhost:1234/v1/models`` with a 5 second timeout.

    Returns:
        A list with the ``id`` of every entry under the response's ``data``
        key. An empty list on a non-200 status or on any exception; the
        failure is printed rather than raised.
    """
    base_url = "http://localhost:1234/v1"
    try:
        reply = httpx.get(f"{base_url}/models", timeout=5.0)
        if reply.status_code != 200:
            print(f"‚ùå LMStudio error: {reply.status_code}")
            return []
        listing = reply.json()
        model_ids = []
        for entry in listing.get("data", []):
            model_ids.append(entry["id"])
        return model_ids
    except Exception as err:
        print(f"‚ùå LMStudio connection error: {err}")
        return []

# Query LM Studio once; `models` is reused later to populate the model pickers.
print("Testing LMStudio connection...")
models = get_lmstudio_models()
model_count = len(models)
print(f"‚úÖ Found {model_count} models")
if model_count > 0:
    print(f"First model: {models[0]}")

Testing LMStudio connection...
‚úÖ Found 20 models
First model: qwen/qwen3-coder-30b


In [17]:
# LangFuse tracing (optional)
import sys
from pathlib import Path
from contextlib import nullcontext

# Put the resolved current working directory on sys.path so that `src.*`
# imports resolve when the notebook is started from the repository root.
REPO_ROOT = Path(".").resolve()
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))

# LangFuse is optional: any failure while importing the project helpers turns
# tracing off instead of breaking the notebook.
try:
    from src.utils.langfuse_client import (
        is_langfuse_enabled,
        trace_llm_call,
        log_llm_completion,
        log_llm_error,
        trace_workflow_execution,
        log_workflow_step,
    )
except Exception as lf_import_err:
    print(f"‚ö†Ô∏è LangFuse not available in notebook environment: {lf_import_err}")
    # Fallback stub: always reports tracing as disabled.
    def is_langfuse_enabled():
        return False
    # The helpers become None; callers test them for truthiness before use.
    trace_llm_call = log_llm_completion = log_llm_error = trace_workflow_execution = log_workflow_step = None

# Shared mutable state populated by ensure_langfuse_context()
# (article, execution, session and trace identifiers).
langfuse_context = {}

def ensure_langfuse_context(article_id):
    """Return the shared LangFuse context for ``article_id``, building it if needed.

    Args:
        article_id: Article identifier as entered by the user (any type);
            an ``int()`` conversion is attempted and stored separately.

    Returns:
        ``None`` when LangFuse is disabled. Otherwise the module-level
        ``langfuse_context`` dict, reused as-is when it already holds a
        trace ID for the same ``article_id``, or refreshed with new
        execution, session and trace identifiers derived from the current
        Unix time in whole seconds.
    """
    if not is_langfuse_enabled():
        return None
    import hashlib
    import time

    # Reuse the cached context when it already describes this article.
    same_article = langfuse_context.get("article_id") == article_id
    if same_article and langfuse_context.get("trace_id"):
        return langfuse_context

    try:
        numeric_id = int(article_id)
    except Exception:
        numeric_id = None

    stamp = int(time.time())
    langfuse_context.update(
        article_id=article_id,
        article_id_int=numeric_id,
        execution_id=stamp,
        session_id=f"notebook_article_{article_id}_{stamp}",
        trace_id=hashlib.md5(f"workflow_exec_{stamp}".encode()).hexdigest(),
    )
    return langfuse_context


In [18]:
# Test LLM call with optional LangFuse tracing
def test_llm(model, prompt, article, call_name="llm_call", langfuse_ctx=None):
    """Send one article-grounded prompt to LM Studio and summarize the outcome.

    The user message contains the article title, URL, the first 4000
    characters of the content and the task prompt. The request goes to
    ``http://localhost:1234/v1/chat/completions`` with temperature 0.7,
    ``max_tokens`` 1000 and a 60 second timeout.

    Args:
        model: Model ID passed through in the request payload.
        prompt: Task text appended after the article.
        article: Mapping with ``title`` and ``content`` keys; ``canonical_url``,
            ``id`` and ``article_id`` are read when present.
        call_name: Name attached to the LangFuse generation and metadata.
        langfuse_ctx: Context from ``ensure_langfuse_context``; tracing is
            used only when this is truthy and LangFuse is enabled.

    Returns:
        Dict with ``status`` ("success" or "error"), ``response`` (model text
        or an error description), ``length`` and ``duration_sec``. Exceptions
        raised while sending or handling the request are reported in the
        dict rather than raised.
    """
    lmstudio_url = "http://localhost:1234/v1"

    full_prompt = f"""Article Title: {article["title"]}
Article URL: {article.get("canonical_url", "")}

Article Content:
{article["content"][:4000]}...

Task: {prompt}"""

    messages = [{"role": "user", "content": full_prompt}]
    payload = dict(model=model, messages=messages, temperature=0.7, max_tokens=1000)

    tracing = langfuse_ctx if (langfuse_ctx and is_langfuse_enabled()) else None
    lf_metadata = dict(
        messages=messages,
        call_name=call_name,
        article_id=article.get("id") or article.get("article_id"),
        article_title=article.get("title", ""),
    )

    def _outcome(status, text, size, elapsed):
        # Single place that defines the shape of the returned summary.
        return {"status": status, "response": text, "length": size, "duration_sec": elapsed}

    started = time.perf_counter()
    if tracing and trace_llm_call:
        generation_cm = trace_llm_call(
            name=call_name,
            model=model,
            execution_id=tracing.get("execution_id"),
            article_id=tracing.get("article_id_int"),
            trace_id=tracing.get("trace_id"),
            session_id=tracing.get("session_id"),
            metadata=lf_metadata
        )
    else:
        # No tracing: `generation` is simply None inside the with-block.
        generation_cm = nullcontext(None)

    with generation_cm as generation:
        try:
            response = httpx.post(f"{lmstudio_url}/chat/completions", json=payload, timeout=60.0)
            elapsed = time.perf_counter() - started
            if response.status_code != 200:
                error_msg = f"API Error: {response.status_code}"
                if tracing and log_llm_error:
                    log_llm_error(generation, Exception(error_msg), metadata=lf_metadata)
                return _outcome("error", error_msg, 0, elapsed)

            content = response.json()["choices"][0]["message"]["content"]
            if tracing and log_llm_completion:
                log_llm_completion(generation, messages, content, metadata=lf_metadata)
            return _outcome("success", content, len(content), elapsed)
        except Exception as exc:
            # Re-measure: the failure may have happened before `elapsed` was set.
            elapsed = time.perf_counter() - started
            if tracing and log_llm_error:
                log_llm_error(generation, exc, metadata=lf_metadata)
            return _outcome("error", f"Exception: {str(exc)}", 0, elapsed)


# Enhanced LLM Testing
Select multiple models, custom articles, and get CSV output

In [19]:
from pathlib import Path

# Load default CmdlineExtract prompt from repo files; fall back to a short default if missing.
def load_cmdline_prompt():
    """Return the CmdlineExtract prompt text.

    Looks for ``src/prompts/CmdlineExtract`` and then ``prompts/CmdlineExtract``
    relative to the working directory and returns the stripped contents of the
    first one that exists. If neither exists, a short built-in prompt is
    returned instead.
    """
    for location in ("src/prompts/CmdlineExtract", "prompts/CmdlineExtract"):
        prompt_file = Path(location)
        if prompt_file.exists():
            return prompt_file.read_text().strip()
    return """You are a specialized extraction agent focused on extracting explicit Windows command-line observables from threat intelligence articles. Extract only literal Windows command lines with an executable/system utility plus at least one argument/switch/parameter/pipeline/redirection. Respond with a JSON object: {"cmdline_items": [...], "count": <int>, "qa_corrections": {"removed": [], "added": [], "summary": "None."}}. Use double backslashes for Windows paths."""

# Load CmdLine QA prompt (mirrors workflow QA agent)
def load_cmdline_qa_prompt():
    """Return the CmdLineQA prompt text.

    Looks for ``src/prompts/CmdLineQA`` and then ``prompts/CmdLineQA`` relative
    to the working directory and returns the stripped contents of the first
    one that exists. If neither exists, a built-in one-paragraph prompt is
    returned instead.
    """
    for location in ("src/prompts/CmdLineQA", "prompts/CmdLineQA"):
        prompt_file = Path(location)
        if prompt_file.exists():
            return prompt_file.read_text().strip()
    return "Review extracted command-lines. If any are invalid or missing, provide added/removed entries in qa_corrections and return the corrected list. Respond with JSON matching {\"cmdline_items\": [...], \"count\": <int>, \"qa_corrections\": {\"removed\": [], \"added\": [], \"summary\": \"None.\"}}."

# Resolve both prompts once when this cell runs; they are the default text of
# the prompt text areas defined later in the notebook.
CMDLINE_PROMPT = load_cmdline_prompt()
CMDLINE_QA_PROMPT = load_cmdline_qa_prompt()

# Extract count from model response JSON (only when properly formatted)
def extract_count(response_text):
    """Extract the command-line count from a model's JSON response.

    The response may be bare JSON or JSON wrapped in a Markdown code fence
    (an opening fence line, optionally followed by a language tag, and a
    closing fence line).

    Args:
        response_text: Raw model output.

    Returns:
        The integer ``count`` field when present; otherwise the length of the
        ``cmdline_items`` list; otherwise ``None``. ``None`` is also returned
        for empty or non-string input, unparseable JSON, and JSON whose top
        level is not an object (e.g. a bare list or number).
    """
    if not response_text or not isinstance(response_text, str):
        return None
    text = response_text.strip()
    if text.startswith("```"):
        # Drop the opening fence line and, if present, the closing fence line.
        lines = text.split("\n")[1:]
        if lines and lines[-1].strip().startswith("```"):
            lines = lines[:-1]
        text = "\n".join(lines).strip()
    try:
        data = json.loads(text)
    except (ValueError, RecursionError):
        return None
    # Valid JSON is not necessarily an object; lists/numbers have no .get().
    if not isinstance(data, dict):
        return None
    count = data.get("count")
    # bool is a subclass of int; a JSON true/false is not a usable count.
    if isinstance(count, int) and not isinstance(count, bool):
        return count
    items = data.get("cmdline_items")
    if isinstance(items, list):
        return len(items)
    return None


In [20]:
import pandas as pd
from datetime import datetime
import ipywidgets as widgets
from IPython.display import display, clear_output

# Global results: one row is appended per model run by both test runners, and
# the "Save CSV" handler writes this frame to disk.
results_df = pd.DataFrame(columns=["timestamp", "agent", "article_id", "article_title", "model", "prompt", "response", "response_length", "duration_sec", "status", "count"])

# UI Widgets
# Article ID is free text; the button handlers strip whitespace before use.
article_input = widgets.Text(
    value="68",
    placeholder="Enter article ID",
    description="Article ID:",
    layout=widgets.Layout(width="200px")
)

# Model picker for the extraction agent. Falls back to an empty option list
# when `models` has not been defined by an earlier cell.
model_select = widgets.SelectMultiple(
    options=models if "models" in globals() else [],
    description="Cmd Models:",
    layout=widgets.Layout(width="100%", height="150px")
)

# Extraction prompt, pre-filled with the loaded CmdlineExtract prompt.
prompt_input = widgets.Textarea(
    value=CMDLINE_PROMPT,
    placeholder="Enter your prompt here...",
    description="Cmd Prompt:",
    layout=widgets.Layout(width="100%", height="120px")
)

# Model picker for the QA agent (same option source as the extraction picker).
qa_model_select = widgets.SelectMultiple(
    options=models if "models" in globals() else [],
    description="QA Models:",
    layout=widgets.Layout(width="100%", height="150px")
)

# QA prompt, pre-filled with the loaded CmdLineQA prompt.
qa_prompt_input = widgets.Textarea(
    value=CMDLINE_QA_PROMPT,
    placeholder="Enter QA prompt...",
    description="QA Prompt:",
    layout=widgets.Layout(width="100%", height="120px")
)

# Action buttons; their click handlers are attached in a later cell.
run_button = widgets.Button(
    description="Run Cmd Tests",
    button_style="primary",
    tooltip="Run LLM tests with selected parameters"
)

qa_run_button = widgets.Button(
    description="Run QA Tests",
    button_style="info",
    tooltip="Run QA tests with selected parameters"
)

csv_button = widgets.Button(
    description="Save CSV",
    button_style="success",
    tooltip="Save results to CSV file"
)

# All handler output is routed into this widget.
output_area = widgets.Output()

print("Enhanced testing interface ready!")

# Holds latest cmdline extraction outputs for QA chaining
# (list of {"model": ..., "response": ...} dicts from successful runs).
last_cmdline_outputs = []


Enhanced testing interface ready!


In [21]:
# Enhanced testing functions

def run_enhanced_tests(article_id, selected_models, prompt):
    """Run the CmdlineExtract prompt against each selected model and record results.

    For every model one row is appended to the global ``results_df``. When the
    call succeeded, the raw response is also stored in the global
    ``last_cmdline_outputs`` so the QA step can evaluate it; that list is
    reset at the start of every run.

    When LangFuse is enabled the run executes inside the
    ``trace_workflow_execution`` context manager. It is entered through an
    ``ExitStack`` so that an exception raised mid-run is handed to the context
    manager's ``__exit__``, and so that ``__exit__`` is only called if
    ``__enter__`` actually succeeded.

    Args:
        article_id: Article identifier passed to the article API.
        selected_models: Iterable of model IDs to test, in order.
        prompt: Extraction prompt sent to every model.

    Returns:
        None. Progress and errors are printed.
    """
    global results_df, last_cmdline_outputs
    from contextlib import ExitStack

    last_cmdline_outputs = []
    lf_ctx = ensure_langfuse_context(article_id)

    with ExitStack() as stack:
        trace = None
        if lf_ctx and trace_workflow_execution:
            trace = stack.enter_context(
                trace_workflow_execution(
                    execution_id=lf_ctx.get("execution_id"),
                    article_id=lf_ctx.get("article_id_int") or 0,
                    session_id=lf_ctx.get("session_id"),
                    user_id="notebook_llm_tester"
                )
            )

        print(f"Getting article {article_id}...")
        article = get_article_from_api(article_id)
        if not article:
            print(f"‚ùå Article {article_id} not found")
            return

        print(f"‚úÖ Article: {article['title'][:60]}...")
        print(f"üìè Content: {len(article['content'])} chars")
        print(f"ü§ñ Testing {len(selected_models)} models √ó 1 prompt (Cmdline Extract)")
        print()

        for i, model in enumerate(selected_models, 1):
            model_name = model.split('/')[-1]
            print(f"üß† Cmd Agent Model {i}/{len(selected_models)}: {model_name}")

            result = test_llm(model, prompt, article, call_name="cmdline_extract", langfuse_ctx=lf_ctx)
            # Parse the count once; it feeds both the results row and the trace step.
            count = extract_count(result.get('response', ''))

            results_df.loc[len(results_df)] = {
                "timestamp": datetime.now().isoformat(),
                "agent": "CmdlineExtract",
                "article_id": article_id,
                "article_title": article['title'][:100],
                "model": model,
                "prompt": prompt,
                "response": result['response'],
                "response_length": result['length'],
                "count": count,
                "duration_sec": result.get('duration_sec', 0.0),
                "status": result['status']
            }

            if trace and log_workflow_step:
                log_workflow_step(
                    trace,
                    f"cmdline_extract_{model_name}",
                    step_result={"status": result['status'], "count": count, "duration_sec": result.get('duration_sec', 0.0)},
                    metadata={"model": model}
                )

            if result['status'] == 'success':
                # Only successful outputs are offered to the QA agent.
                last_cmdline_outputs.append({
                    "model": model,
                    "response": result['response']
                })
                print(f"    ‚úÖ Success ({result['length']} chars) in {result.get('duration_sec',0):.2f}s")
            else:
                print(f"    ‚ùå {result['response']}")

        print(f"üìä Completed {len(selected_models)} Cmd tests!")
        print(f"Results stored in DataFrame ({len(results_df)} total rows)")


def run_cmdline_qa_tests(article_id, selected_models, prompt):
    """Run the CmdLine QA prompt against each selected model and record results.

    The QA prompt is extended with the extractor outputs stored in the global
    ``last_cmdline_outputs``. If that list is empty a warning is printed and
    the run continues with a placeholder context. Each model gets up to two
    attempts; the last attempt's result is appended to the global
    ``results_df``.

    When LangFuse is enabled the run executes inside the
    ``trace_workflow_execution`` context manager. It is entered through an
    ``ExitStack`` so that an exception raised mid-run is handed to the context
    manager's ``__exit__``, and so that ``__exit__`` is only called if
    ``__enter__`` actually succeeded.

    Args:
        article_id: Article identifier passed to the article API.
        selected_models: Iterable of model IDs to test, in order.
        prompt: QA prompt; the extractor outputs are appended to it.

    Returns:
        None. Progress and errors are printed.
    """
    global results_df, last_cmdline_outputs
    from contextlib import ExitStack

    lf_ctx = ensure_langfuse_context(article_id)

    with ExitStack() as stack:
        trace = None
        if lf_ctx and trace_workflow_execution:
            trace = stack.enter_context(
                trace_workflow_execution(
                    execution_id=lf_ctx.get("execution_id"),
                    article_id=lf_ctx.get("article_id_int") or 0,
                    session_id=lf_ctx.get("session_id"),
                    user_id="notebook_llm_tester"
                )
            )

        print(f"Getting article {article_id} for QA...")
        article = get_article_from_api(article_id)
        if not article:
            print(f"‚ùå Article {article_id} not found")
            return

        if not last_cmdline_outputs:
            print("‚ö†Ô∏è No extractor outputs available. Run Cmdline Extract tests first.")

        # NOTE(review): last_cmdline_outputs is not keyed by article, so it may
        # come from a different article than `article_id` -- confirm intended.
        extraction_context = "\n\n".join(
            [f"Model: {item['model']}\nOutput:\n{item['response']}" for item in last_cmdline_outputs]
        ) if last_cmdline_outputs else "No extractor outputs captured."

        # The combined prompt is identical for every model; build it once.
        qa_prompt_with_context = f"""{prompt}

Extracted command-lines to evaluate:
{extraction_context}"""

        print(f"ü§ñ Testing {len(selected_models)} models √ó 1 prompt (CmdLine QA)")
        print()

        if trace and log_workflow_step:
            log_workflow_step(
                trace,
                "cmdline_qa_context",
                step_result={
                    "has_extractor_outputs": bool(last_cmdline_outputs),
                    "extractor_models": [item['model'] for item in last_cmdline_outputs]
                },
                metadata={"agent": "CmdLineQA"}
            )

        max_attempts = 2
        for i, model in enumerate(selected_models, 1):
            model_name = model.split('/')[-1]
            print(f"üß† QA Agent Model {i}/{len(selected_models)}: {model_name}")

            result = None
            for attempt in range(1, max_attempts + 1):
                result = test_llm(model, qa_prompt_with_context, article, call_name="cmdline_qa", langfuse_ctx=lf_ctx)
                # Accept the first successful, non-blank response.
                if result.get('status') == 'success' and result.get('response', '').strip():
                    break
                if attempt < max_attempts:
                    print(f"    ‚ö†Ô∏è QA attempt {attempt} returned empty/failed (status={result.get('status')}, len={len(result.get('response',''))}); retrying...")

            # Parse the count once; it feeds both the results row and the trace step.
            count = extract_count(result.get('response', ''))

            results_df.loc[len(results_df)] = {
                "timestamp": datetime.now().isoformat(),
                "agent": "CmdLineQA",
                "article_id": article_id,
                "article_title": article['title'][:100],
                "model": model,
                "prompt": qa_prompt_with_context,
                "response": result.get('response', ''),
                "count": count,
                "response_length": result.get('length', 0),
                "duration_sec": result.get('duration_sec', 0.0),
                "status": result.get('status')
            }

            if trace and log_workflow_step:
                log_workflow_step(
                    trace,
                    f"cmdline_qa_{model_name}",
                    step_result={"status": result.get('status'), "count": count, "duration_sec": result.get('duration_sec', 0.0)},
                    metadata={"model": model}
                )

            if result.get('status') == 'success':
                print(f"    ‚úÖ QA Success ({result.get('length',0)} chars) in {result.get('duration_sec',0):.2f}s")
            else:
                print(f"    ‚ùå {result.get('response','')}")

        print(f"üìä Completed {len(selected_models)} QA tests!")
        print(f"Results stored in DataFrame ({len(results_df)} total rows)")

# Button handlers
def on_run_clicked(b):
    """Click handler for "Run Cmd Tests".

    Clears the output area, validates the article ID, model selection and
    prompt (printing the first problem found and stopping), then starts the
    extraction run.

    Args:
        b: The clicked button (unused).
    """
    with output_area:
        clear_output()
        article_id = article_input.value.strip()
        selected_models = list(model_select.value)
        prompt = prompt_input.value.strip()

        # Checked in this order; the first empty field aborts the run.
        required = (
            (article_id, "‚ùå Please enter an article ID"),
            (selected_models, "‚ùå Please select at least one model"),
            (prompt, "‚ùå Please enter a prompt"),
        )
        for value, complaint in required:
            if not value:
                print(complaint)
                return

        print("üöÄ Starting Cmdline Extract tests...")
        run_enhanced_tests(article_id, selected_models, prompt)


def on_qa_run_clicked(b):
    """Click handler for "Run QA Tests".

    Clears the output area, validates the article ID, QA model selection and
    QA prompt (printing the first problem found and stopping), then starts
    the QA run.

    Args:
        b: The clicked button (unused).
    """
    with output_area:
        clear_output()
        article_id = article_input.value.strip()
        selected_models = list(qa_model_select.value)
        prompt = qa_prompt_input.value.strip()

        # Checked in this order; the first empty field aborts the run.
        required = (
            (article_id, "‚ùå Please enter an article ID"),
            (selected_models, "‚ùå Please select at least one QA model"),
            (prompt, "‚ùå Please enter a QA prompt"),
        )
        for value, complaint in required:
            if not value:
                print(complaint)
                return

        print("üöÄ Starting CmdLine QA tests...")
        run_cmdline_qa_tests(article_id, selected_models, prompt)


def on_csv_clicked(b):
    """Click handler for "Save CSV".

    Writes ``results_df`` (without the index) to a timestamped
    ``llm_comparison_<YYYYmmdd_HHMMSS>.csv`` in the working directory, or
    prints a notice when there are no results yet.

    Args:
        b: The clicked button (unused).
    """
    with output_area:
        if not results_df.empty:
            stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            filename = f"llm_comparison_{stamp}" + ".csv"
            results_df.to_csv(filename, index=False)
            print(f"‚úÖ Saved {len(results_df)} results to {filename}")
            return

        print("‚ùå No results to save")

# Connect handlers
# NOTE(review): on_click registers an additional callback each time it is
# called, so re-running this cell may attach duplicates -- confirm against
# the ipywidgets Button documentation.
run_button.on_click(on_run_clicked)
qa_run_button.on_click(on_qa_run_clicked)
csv_button.on_click(on_csv_clicked)

print("Button handlers connected!")


Button handlers connected!


In [22]:
# Display enhanced UI
# Layout, top to bottom: article ID, extraction controls (models, prompt,
# run + save buttons), QA controls (models, prompt, run button), shared output.
display(widgets.VBox([
    widgets.HTML("<h3>üöÄ Enhanced LLM Testing</h3>"),
    article_input,
    widgets.HTML("<strong>Cmdline Extract</strong>"),
    model_select,
    prompt_input,
    widgets.HBox([run_button, csv_button]),
    widgets.HTML("<strong>CmdLine QA</strong>"),
    qa_model_select,
    qa_prompt_input,
    qa_run_button,
    output_area
]))

print("Enhanced UI ready! Select your options and click Run Tests.")


VBox(children=(HTML(value='<h3>üöÄ Enhanced LLM Testing</h3>'), Text(value='68', description='Article ID:', layo‚Ä¶

Enhanced UI ready! Select your options and click Run Tests.


In [23]:
# Show the accumulated results table, or a hint when nothing has run yet.
if results_df.empty:
    print("No results yet. Run some tests first!")
else:
    print(f"üìä Current results: {len(results_df)} tests")
    display(results_df)

No results yet. Run some tests first!


In [24]:
# Hybrid cmdline extraction on Article 68 using current YAML patterns
import json
from pathlib import Path
import subprocess

from src.extractors.regex_windows import extract_candidate_lines
from src.extractors.encoder_classifier import classify_candidates
from src.extractors.hybrid_cmdline_extractor import literal_filter, is_qa_enabled
from src.extractors.qa_validator import qa_validate

def get_article_content(article_id: int) -> str:
    """Read the ``content`` column of one article directly from Postgres.

    Runs ``psql`` inside the ``cti_postgres`` container through ``docker exec``.

    Args:
        article_id: Value matched against ``articles.id``. It is coerced with
            ``int()`` before being placed in the SQL text, so a non-numeric
            value can never reach psql.

    Returns:
        psql's stdout, decoded and stripped of surrounding whitespace.

    Raises:
        ValueError, TypeError: ``article_id`` cannot be converted to an int.
        subprocess.CalledProcessError: the docker/psql command exited non-zero.
    """
    safe_id = int(article_id)  # the SQL below is string-built; allow integers only
    cmd = [
        "docker", "exec", "-i", "cti_postgres",
        "psql", "-U", "cti_user", "-d", "cti_scraper",
        "-t", "-A", "-c", f"SELECT content FROM articles WHERE id={safe_id};"
    ]
    return subprocess.check_output(cmd).decode().strip()

def run_hybrid(article_id: int = 68):
    """Run the hybrid command-line extraction pipeline on one article.

    Stages, in order: regex candidate extraction, classifier filtering,
    literal filtering against the article text, and QA validation when
    ``is_qa_enabled()`` reports it as on.

    Args:
        article_id: Article whose content is fetched and processed.

    Returns:
        Dict with the final items (``cmdline_items``), their ``count``, the
        ``qa_enabled`` flag, and the intermediate ``candidates``, ``filtered``
        and ``literal`` stage outputs.
    """
    article_text = get_article_content(article_id)
    raw_hits = extract_candidate_lines(article_text)
    kept_by_classifier = classify_candidates(raw_hits)
    kept_literal = literal_filter(kept_by_classifier, article_text)

    qa_on = is_qa_enabled()
    if qa_on:
        final_items = qa_validate(kept_literal, article_text)
    else:
        final_items = kept_literal

    return dict(
        cmdline_items=final_items,
        count=len(final_items),
        qa_enabled=qa_on,
        candidates=raw_hits,
        filtered=kept_by_classifier,
        literal=kept_literal,
    )

# Run the pipeline for article 68; the bare `result` renders the dict as the cell output.
result = run_hybrid(68)
result


{'cmdline_items': [],
 'count': 0,
 'qa_enabled': False,
 'candidates': ['WSUSService.exe before running the Windows installer (as seen via a Service Control Manager/7036 message stating Windows Installer,running ) and then installing a malicious MSI package from s3[.]wasabisys[.]com . Figure 1 illustrates the results of searching for the domain on VirusTotal. Figure 1: VirusTotal search for the domain wasabisys[.]com linked to the malicious MSI package As seen in the Windows Event Log records below, the threat actor then installed Velociraptor, which was configured to communicate with the endpoint update[.]githubtestbak[.]workers[.]dev . MsiInstaller/1040;https://s3.wasabisys.com/kiessler/v4.msi,3348,(NULL),(NULL),(NULL),(NULL), Service Control Manager/7045;Velociraptor Service,"C:\\Program Files\\Velociraptor\\Velociraptor.exe" --config "C:\\Program Files\\Velociraptor\\/client.config.yaml" service run ,user mode service,auto start,LocalSystem Service Control Manager/7036;Velocirapto

In [25]:
from src.extractors import regex_windows
from src.extractors.regex_windows import extract_candidate_lines
import subprocess

def _get_article_content(article_id: int) -> str:
    """Read the ``content`` column of one article directly from Postgres.

    Runs ``psql`` inside the ``cti_postgres`` container through ``docker exec``.

    Args:
        article_id: Value matched against ``articles.id``. It is coerced with
            ``int()`` before being placed in the SQL text, so a non-numeric
            value can never reach psql.

    Returns:
        psql's stdout, decoded and stripped of surrounding whitespace.

    Raises:
        ValueError, TypeError: ``article_id`` cannot be converted to an int.
        subprocess.CalledProcessError: the docker/psql command exited non-zero.
    """
    safe_id = int(article_id)  # the SQL below is string-built; allow integers only
    cmd = [
        'docker','exec','-i','cti_postgres',
        'psql','-U','cti_user','-d','cti_scraper',
        '-t','-A','-c', f'SELECT content FROM articles WHERE id={safe_id};'
    ]
    return subprocess.check_output(cmd).decode().strip()

def split_log_blob(text: str) -> str:
    """Replace every ';', '"' and ',' with a newline (existing newlines are kept)."""
    to_newline = str.maketrans({';': '\n', '"': '\n', ',': '\n'})
    return text.translate(to_newline)

# Fetch article 68, split it on log separators, then extract candidates from
# the split text.
content = _get_article_content(68)
preprocessed = split_log_blob(content)
candidates = extract_candidate_lines(preprocessed)

# Print the built-in patterns, the externally loaded patterns, and the
# resulting candidates for side-by-side inspection.
print(regex_windows.DEFAULT_PATTERNS)
print(regex_windows._load_external_patterns())
print(candidates)


{'exe_with_args': '(?:"?[A-Za-z]:\\\\+[^"\\s]+\\.\\w{3,4}"?(?:\\s+[^\\r\\n]+))', 'bare_exe_with_args': '(?:[A-Za-z0-9_\\-]+\\.exe)(?:\\s+[^\\r\\n]+)', 'powershell': '(?:powershell(?:\\.exe)?)\\s+[^\\r\\n]+', 'system32_utils': '(?:"?C:\\\\+Windows\\\\+System32\\\\+(?:net|ipconfig|setspn|quser)\\.exe"?\\s+[^\\r\\n]+)', 'quoted_with_spaces': '"[A-Za-z]:\\\\+[^"\\r\\n]+?\\.\\w{3,4}"(?:\\s+[^\\r\\n]+)'}
{'exe_with_args': '"?[A-Za-z]:\\\\\\\\[^"\\r\\n]+\\.\\w{2,5}"?(?:\\s+[^\\r\\n]+)', 'bare_exe_with_args': '(?i)(?:^|\\s)([A-Za-z0-9_\\-]+\\.exe)(?:\\s+[^\\r\\n]+)', 'powershell': '(?i)(?:"?[A-Za-z]:\\\\\\\\[^"\\r\\n]*\\\\(?:powershell|pwsh)(?:\\.exe)?"?)\\s+[^\\r\\n]+', 'system32_utils': '(?i)"?C:\\\\\\\\Windows\\\\\\\\System32\\\\\\\\(?:net|ipconfig|setspn|quser)\\.exe"?\\s+[^\\r\\n]+', 'quoted_with_spaces': '"[A-Za-z]:\\\\\\\\[^"\\r\\n]+\\.\\w{2,5}"(?:\\s+[^\\r\\n]+)'}
['WSUSService.exe before running the Windows installer (as seen via a Service Control Manager/7036 message stating Windows 

In [44]:
# Hybrid cmdline extraction with YAML overrides + log-line splitting (updated)
import subprocess
from src.extractors.regex_windows import extract_candidate_lines
from src.extractors.encoder_classifier import classify_candidates
from src.extractors.hybrid_cmdline_extractor import literal_filter, is_qa_enabled
from src.extractors.qa_validator import qa_validate

def get_article_content(article_id: int) -> str:
    """Read the ``content`` column of one article directly from Postgres.

    Runs ``psql`` inside the ``cti_postgres`` container through ``docker exec``.

    Args:
        article_id: Value matched against ``articles.id``. It is coerced with
            ``int()`` before being placed in the SQL text, so a non-numeric
            value can never reach psql.

    Returns:
        psql's stdout, decoded and stripped of surrounding whitespace.

    Raises:
        ValueError, TypeError: ``article_id`` cannot be converted to an int.
        subprocess.CalledProcessError: the docker/psql command exited non-zero.
    """
    safe_id = int(article_id)  # the SQL below is string-built; allow integers only
    cmd = [
        'docker', 'exec', '-i', 'cti_postgres',
        'psql', '-U', 'cti_user', '-d', 'cti_scraper',
        '-t', '-A', '-c', f"SELECT content FROM articles WHERE id={safe_id};"
    ]
    return subprocess.check_output(cmd).decode().strip()

def split_log_blob(text: str) -> str:
    """Break long log rows apart so the regexes can match embedded commands.

    Every ';', '"' and ',' becomes a newline; existing newlines are kept.
    """
    return text.replace(';', '\n').replace('"', '\n').replace(',', '\n')

def run_hybrid(article_id: int = 68):
    """Run the hybrid extraction pipeline on one article, with log-line splitting.

    Candidates are extracted from the separator-split text, while the
    literal filter and optional QA validation receive the original,
    unsplit article content.

    Args:
        article_id: Article whose content is fetched and processed.

    Returns:
        Dict with the ``qa_enabled`` flag, the intermediate ``candidates``,
        ``filtered`` and ``literal`` stage outputs, the final
        ``cmdline_items`` and their ``count``.
    """
    article_text = get_article_content(article_id)
    split_text = split_log_blob(article_text)

    raw_hits = extract_candidate_lines(split_text)
    kept_by_classifier = classify_candidates(raw_hits)
    kept_literal = literal_filter(kept_by_classifier, article_text)

    qa_on = is_qa_enabled()
    if qa_on:
        final_items = qa_validate(kept_literal, article_text)
    else:
        final_items = kept_literal

    return dict(
        qa_enabled=qa_on,
        candidates=raw_hits,
        filtered=kept_by_classifier,
        literal=kept_literal,
        cmdline_items=final_items,
        count=len(final_items),
    )

# Run the pipeline for article 243; the bare `result` renders the dict as the cell output.
result = run_hybrid(243)
result


{'qa_enabled': False,
 'candidates': ['node.exe\n) and actor_process_command_line contains',
  'node.exe\n ) and actor_process_command_line contains',
  'curl.exe\n\n\nwget',
  'wget.exe\n\n\nwhoami',
  'arp.exe\n\n\nat.exe',
  'hostname.exe\n\n\nnbstat.exe',
  'netsh.exe\n\n\nnetstat.exe',
  'nslookup.exe\n\n\nping.exe',
  'query.exe\n\n\nsysteminfo.exe',
  'tasklist.exe\n\n\ntraceroute.exe',
  'whoami.exe\n\n\nwhois.exe',
  'quser.exe\n\n\nqwinsta.exe',
  'nltest.exe\n\n\ncsvde.exe',
  'wevtutil.exe\n\n\ndriverquery.exe',
  'nbtscan.exe\n\n\nntdsutil.exe',
  'vssadmin.exe\n\n\ndsquery.exe',
  'adfind.exe\n\n\nklist.exe',
  'vssvc.exe\n) | comp count_distinct(action_process_image_name) as num_procs',
  'curl.exe\n \n \nwget',
  'wget.exe\n \n \nwhoami',
  'arp.exe\n \n \nat.exe',
  'hostname.exe\n \n \nnbstat.exe',
  'netsh.exe\n \n \nnetstat.exe',
  'nslookup.exe\n \n \nping.exe',
  'query.exe\n \n \nsysteminfo.exe',
  'tasklist.exe\n \n \ntraceroute.exe',
  'whoami.exe\n \n \nwhois.

In [27]:
# Inspect current regex patterns and hybrid pipeline outputs for article 68
from src.extractors import regex_windows
from src.extractors.regex_windows import extract_candidate_lines
from src.extractors.encoder_classifier import classify_candidates
from src.extractors.hybrid_cmdline_extractor import literal_filter, is_qa_enabled
from src.extractors.qa_validator import qa_validate
import subprocess

def _get_article_content(article_id: int) -> str:
    """Read the ``content`` column of one article directly from Postgres.

    Runs ``psql`` inside the ``cti_postgres`` container through ``docker exec``.

    Args:
        article_id: Value matched against ``articles.id``. It is coerced with
            ``int()`` before being placed in the SQL text, so a non-numeric
            value can never reach psql.

    Returns:
        psql's stdout, decoded and stripped of surrounding whitespace.

    Raises:
        ValueError, TypeError: ``article_id`` cannot be converted to an int.
        subprocess.CalledProcessError: the docker/psql command exited non-zero.
    """
    safe_id = int(article_id)  # the SQL below is string-built; allow integers only
    cmd = [
        'docker', 'exec', '-i', 'cti_postgres',
        'psql', '-U', 'cti_user', '-d', 'cti_scraper',
        '-t', '-A', '-c', f"SELECT content FROM articles WHERE id={safe_id};"
    ]
    return subprocess.check_output(cmd).decode().strip()

def split_log_blob(text: str) -> str:
    """Replace every ';', '"' and ',' with a newline (existing newlines are kept)."""
    # Splitting on a separator and re-joining with '\n' equals a plain replace.
    for separator in (';', '"', ','):
        text = '\n'.join(text.split(separator))
    return text

# Fetch article 68 and build the separator-split view used for candidate extraction.
content = _get_article_content(68)
preprocessed = split_log_blob(content)

# Show the built-in patterns next to the externally loaded ones.
print('DEFAULT_PATTERNS:', regex_windows.DEFAULT_PATTERNS)
print('YAML_PATTERNS   :', regex_windows._load_external_patterns())

# Run each pipeline stage separately so every intermediate result can be printed.
# Candidates come from the split text; later stages see the original content.
candidates = extract_candidate_lines(preprocessed)
filtered = classify_candidates(candidates)
literal = literal_filter(filtered, content)
qa_enabled = is_qa_enabled()
final = qa_validate(literal, content) if qa_enabled else literal

print('\nCandidates (raw):', candidates)
print('After classifier:', filtered)
print('After literal   :', literal)
print('QA enabled      :', qa_enabled)
print('Final cmdlines  :', final)
print('Count           :', len(final))


DEFAULT_PATTERNS: {'exe_with_args': '(?:"?[A-Za-z]:\\\\+[^"\\s]+\\.\\w{3,4}"?(?:\\s+[^\\r\\n]+))', 'bare_exe_with_args': '(?:[A-Za-z0-9_\\-]+\\.exe)(?:\\s+[^\\r\\n]+)', 'powershell': '(?:powershell(?:\\.exe)?)\\s+[^\\r\\n]+', 'system32_utils': '(?:"?C:\\\\+Windows\\\\+System32\\\\+(?:net|ipconfig|setspn|quser)\\.exe"?\\s+[^\\r\\n]+)', 'quoted_with_spaces': '"[A-Za-z]:\\\\+[^"\\r\\n]+?\\.\\w{3,4}"(?:\\s+[^\\r\\n]+)'}
YAML_PATTERNS   : {'exe_with_args': '"?[A-Za-z]:\\\\\\\\[^"\\r\\n]+\\.\\w{2,5}"?(?:\\s+[^\\r\\n]+)', 'bare_exe_with_args': '(?i)(?:^|\\s)([A-Za-z0-9_\\-]+\\.exe)(?:\\s+[^\\r\\n]+)', 'powershell': '(?i)(?:"?[A-Za-z]:\\\\\\\\[^"\\r\\n]*\\\\(?:powershell|pwsh)(?:\\.exe)?"?)\\s+[^\\r\\n]+', 'system32_utils': '(?i)"?C:\\\\\\\\Windows\\\\\\\\System32\\\\\\\\(?:net|ipconfig|setspn|quser)\\.exe"?\\s+[^\\r\\n]+', 'quoted_with_spaces': '"[A-Za-z]:\\\\\\\\[^"\\r\\n]+\\.\\w{2,5}"(?:\\s+[^\\r\\n]+)'}

Candidates (raw): ['WSUSService.exe before running the Windows installer (as seen via 

In [45]:
from pathlib import Path
import subprocess

def fetch_article(article_id: int) -> str:
    """Read the ``content`` column of one article directly from Postgres.

    Runs ``psql`` inside the ``cti_postgres`` container through ``docker exec``.

    Args:
        article_id: Value matched against ``articles.id``. It is coerced with
            ``int()`` before being placed in the SQL text, so a non-numeric
            value can never reach psql.

    Returns:
        psql's stdout, decoded and stripped of surrounding whitespace.

    Raises:
        ValueError, TypeError: ``article_id`` cannot be converted to an int.
        subprocess.CalledProcessError: the docker/psql command exited non-zero.
    """
    safe_id = int(article_id)  # the SQL below is string-built; allow integers only
    cmd = [
        'docker','exec','-i','cti_postgres',
        'psql','-U','cti_user','-d','cti_scraper',
        '-t','-A','-c', f"SELECT content FROM articles WHERE id={safe_id};"
    ]
    return subprocess.check_output(cmd).decode().strip()

def split_log_blob(text: str) -> str:
    """Break log blobs on separators so the regexes see individual commands.

    Every ';', '"', ',', '[', ']', '(' and ')' becomes a newline; existing
    newlines are kept.
    """
    to_newline = str.maketrans({sep: '\n' for sep in ';",[]()'})
    return text.translate(to_newline)


# Fetch article 243, split it on the extended separator set, and list every
# regex candidate on its own bullet.
# `extract_candidate_lines` is not imported in this cell; it relies on an
# earlier cell having imported it.
content = fetch_article(243)
pre = split_log_blob(content)
candidates = extract_candidate_lines(pre)
print(f"\nFound {len(candidates)} candidates:\n")
for c in candidates:
    print("-", c)



Found 40 candidates:

- node.exe

 and actor_process_command_line contains
- node.exe
 
 and actor_process_command_line contains
- curl.exe


wget
- wget.exe


whoami
- arp.exe


at.exe
- hostname.exe


nbstat.exe
- netsh.exe


netstat.exe
- nslookup.exe


ping.exe
- query.exe


systeminfo.exe
- tasklist.exe


traceroute.exe
- whoami.exe


whois.exe
- quser.exe


qwinsta.exe
- nltest.exe


csvde.exe
- wevtutil.exe


driverquery.exe
- nbtscan.exe


ntdsutil.exe
- vssadmin.exe


dsquery.exe
- adfind.exe


klist.exe
- vssvc.exe

 | comp count_distinct
- curl.exe
 
 
wget
- wget.exe
 
 
whoami
- arp.exe
 
 
at.exe
- hostname.exe
 
 
nbstat.exe
- netsh.exe
 
 
netstat.exe
- nslookup.exe
 
 
ping.exe
- query.exe
 
 
systeminfo.exe
- tasklist.exe
 
 
traceroute.exe
- whoami.exe
 
 
whois.exe
- quser.exe
 
 
qwinsta.exe
- nltest.exe
 
 
csvde.exe
- wevtutil.exe
 
 
driverquery.exe
- nbtscan.exe
 
 
ntdsutil.exe
- vssadmin.exe
 
 
dsquery.exe
- adfind.exe
 
 
klist.exe
- vssvc.exe
 
 | comp co

In [51]:
# Play with regex patterns and see extraction output
import re
import subprocess
from src.extractors import regex_windows

def fetch_article(article_id: int) -> str:
    """Read the ``content`` column of one article directly from Postgres.

    Runs ``psql`` inside the ``cti_postgres`` container through ``docker exec``.

    Args:
        article_id: Value matched against ``articles.id``. It is coerced with
            ``int()`` before being placed in the SQL text, so a non-numeric
            value can never reach psql.

    Returns:
        psql's stdout, decoded and stripped of surrounding whitespace.

    Raises:
        ValueError, TypeError: ``article_id`` cannot be converted to an int.
        subprocess.CalledProcessError: the docker/psql command exited non-zero.
    """
    safe_id = int(article_id)  # the SQL below is string-built; allow integers only
    cmd = [
        'docker', 'exec', '-i', 'cti_postgres',
        'psql', '-U', 'cti_user', '-d', 'cti_scraper',
        '-t', '-A', '-c', f"SELECT content FROM articles WHERE id={safe_id};"
    ]
    return subprocess.check_output(cmd).decode().strip()

def split_log_blob(text: str) -> str:
    """Replace every ';', '"' and ',' with a newline (existing newlines are kept)."""
    return text.replace(';', '\n').replace('"', '\n').replace(',', '\n')

def extract_with_custom_patterns(text: str, overrides: dict[str, str] | None = None):
    """Extract regex candidates using the default patterns plus optional overrides.

    Args:
        text: Text to scan.
        overrides: Pattern name -> regex source. Entries replace defaults of
            the same name and add new ones otherwise.

    Returns:
        ``(matches, patterns)``: the stripped, non-empty, de-duplicated match
        strings in first-seen order (pattern by pattern), and the merged
        pattern dict that was used. Patterns that fail to compile are
        reported with a printed message and skipped.
    """
    patterns = regex_windows.DEFAULT_PATTERNS.copy()
    patterns.update(overrides or {})  # caller-supplied patterns take precedence

    compiled = []
    for source in patterns.values():
        try:
            regex = re.compile(source, re.IGNORECASE | re.MULTILINE)
        except re.error as exc:
            print(f"Skipping invalid regex: {source} ({exc})")
        else:
            compiled.append(regex)

    stripped_hits = (
        match.group(0).strip()
        for regex in compiled
        for match in regex.finditer(text)
    )
    # dict.fromkeys keeps first-seen order while dropping duplicates.
    unique_hits = list(dict.fromkeys(hit for hit in stripped_hits if hit))
    return unique_hits, patterns

# --- tweak here ---
# ARTICLE_ID selects the article to scan; PATTERN_OVERRIDES maps a pattern
# name to a regex that replaces (or adds to) the defaults for this run.
ARTICLE_ID = 68
PATTERN_OVERRIDES = {
    # Example: loosen embedded exe matching
    # "embedded_exe": r'(?i)(?:[A-Za-z]:\\[^\s"]+|[A-Za-z0-9_\-]+\.exe)(?:\s+[^\r\n";,]+)+'

}
# ------------------

# Fetch, split on log separators, then extract with the merged pattern set.
content = fetch_article(ARTICLE_ID)
pre = split_log_blob(content)
candidates, used_patterns = extract_with_custom_patterns(pre, PATTERN_OVERRIDES)

print("Patterns in use:\n", used_patterns)
print(f"\nFound {len(candidates)} candidates:\n")
for c in candidates:
    print("-", c)


Patterns in use:
 {'exe_with_args': '(?:"?[A-Za-z]:\\\\+[^"\\s]+\\.\\w{3,4}"?(?:\\s+[^\\r\\n]+))', 'bare_exe_with_args': '(?:[A-Za-z0-9_\\-]+\\.exe)(?:\\s+[^\\r\\n]+)', 'powershell': '(?:powershell(?:\\.exe)?)\\s+[^\\r\\n]+', 'system32_utils': '(?:"?C:\\\\+Windows\\\\+System32\\\\+(?:net|ipconfig|setspn|quser)\\.exe"?\\s+[^\\r\\n]+)', 'quoted_with_spaces': '"[A-Za-z]:\\\\+[^"\\r\\n]+?\\.\\w{3,4}"(?:\\s+[^\\r\\n]+)'}

Found 13 candidates:

- C:\Windows\System32\WindowsPowerShell\v1.0\powershell.exe -ExecutionPolicy Unrestricted -encodedCommand cQB1AHMAZQByAA== Each of the observed PowerShell commands varied only in the encoded commands
- C:\Windows\system32\net.exe
 group
- C:\Windows\system32\quser.exe
 
C:\Windows\system32\setspn.exe
- C:\Windows\system32\ipconfig.exe
 /al Figure 2: EDR signals showing the threat actor installing an MSI package
- WSUSService.exe before running the Windows installer (as seen via a Service Control Manager/7036 message stating Windows Installer
- Velocir

In [50]:
from pathlib import Path
from src.extractors import regex_windows

# Resolve the YAML file relative to the regex_windows module and read that
# same path, so the file that is printed is the file whose location was
# reported, regardless of the notebook's working directory.
yaml_path = Path(regex_windows.__file__).resolve().parents[2] / "resources/regex/windows_cmd_patterns.yaml"
print("YAML path:", yaml_path)
if yaml_path.exists():
    print("YAML contents:\n", yaml_path.read_text())
else:
    print("YAML contents: <file not found>")

print("Defaults:", regex_windows.DEFAULT_PATTERNS)
print("Overrides:", regex_windows._load_external_patterns())


YAML path: /Users/starlord/CTIScraper/resources/regex/windows_cmd_patterns.yaml
YAML contents:
 patterns:
 exe_with_args: '"?[A-Za-z]:\\\\[^"\r\n]+\.\w{2,5}"?\s+[^\r\n]+'
 bare_exe_with_args: '(?i)(?:^|\s)([A-Za-z0-9_\-]+\.exe)\s+[^\r\n]+'
 powershell: '(?i)(?:"?[A-Za-z]:\\\\[^"\r\n]*\\(?:powershell|pwsh)(?:\.exe)?"?)\s+[^\r\n]+'
 system32_utils: '(?i)"?C:\\\\Windows\\\\System32\\\\(?:net|ipconfig|setspn|quser)\.exe"?\s+[^\r\n]+'
Defaults: {'exe_with_args': '(?:"?[A-Za-z]:\\\\+[^"\\s]+\\.\\w{3,4}"?(?:\\s+[^\\r\\n]+))', 'bare_exe_with_args': '(?:[A-Za-z0-9_\\-]+\\.exe)(?:\\s+[^\\r\\n]+)', 'powershell': '(?:powershell(?:\\.exe)?)\\s+[^\\r\\n]+', 'system32_utils': '(?:"?C:\\\\+Windows\\\\+System32\\\\+(?:net|ipconfig|setspn|quser)\\.exe"?\\s+[^\\r\\n]+)', 'quoted_with_spaces': '"[A-Za-z]:\\\\+[^"\\r\\n]+?\\.\\w{3,4}"(?:\\s+[^\\r\\n]+)'}
Overrides: {'exe_with_args': '"?[A-Za-z]:\\\\\\\\[^"\\r\\n]+\\.\\w{2,5}"?\\s+[^\\r\\n]+', 'bare_exe_with_args': '(?i)(?:^|\\s)([A-Za-z0-9_\\-]+\\.exe)\\s+