# Week 4 Exercise — Bug Benchmark Arena

**Building on the full pipeline** — Generate buggy code (Week 3) → Analyze with personas (Week 1) → Interactively debug (Week 2) — this exercise asks: **which models are actually best at injecting and detecting bugs?**

### What This Does

- **Benchmarks coding-focused LLMs** head-to-head on two tasks: **infesting** clean code with realistic bugs, and **detecting** those bugs in infested code
- Uses **OpenRouter** to access frontier and open-weight coding models (Qwen Coder, DeepSeek, GPT, Gemini, Claude, Llama) through a single API
- **Dual-panel Gradio UI**: left side for bug infesting, right side for independent validation — each with its own model selector
- Configurable **bug type checkboxes** (SyntaxError, NameError, IndentationError, TypeError, IndexError, LogicError)
- Structured **JSON output** from validation showing exactly which errors were found and whether they match what was injected
- Enables direct comparison: which model is the best "bug injector" and which is the best "bug detector"

In [None]:
# If you get "No module named pip", run: python -m ensurepip --upgrade
%pip install -q openai gradio python-dotenv

In [None]:
import json
import os
import re
import time
from collections import Counter

import gradio as gr
from dotenv import load_dotenv
from openai import OpenAI

In [None]:
# Read settings from a local .env file before looking up the API key.
# NOTE(review): override=True should make .env values win over variables already
# set in the process environment — confirm against the python-dotenv docs.
load_dotenv(override=True)

# The variable is named OPENAI_API_KEY, but the value is used as the credential
# for the OpenRouter endpoint configured below.
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise EnvironmentError("OPENAI_API_KEY not found in .env — add your OpenRouter key there.")

# A single OpenAI-SDK client pointed at OpenRouter; call_model sends every
# request through it.
OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1"
client = OpenAI(base_url=OPENROUTER_BASE_URL, api_key=api_key)

In [None]:
# Display name (shown in the model dropdowns) -> model identifier that is passed
# as `model=` to the chat-completions call.
MODELS = {
    "Qwen 2.5 Coder 32B": "qwen/qwen-2.5-coder-32b-instruct",
    "DeepSeek Chat V3": "deepseek/deepseek-chat",
    "GPT-4o Mini": "openai/gpt-4o-mini",
    "Gemini 2.5 Flash": "google/gemini-2.5-flash",
    "Claude 3.5 Haiku": "anthropic/claude-3.5-haiku",
    "Llama 3.1 70B": "meta-llama/llama-3.1-70b-instruct",
}

# Bug categories offered in the UI checkbox group; the selected ones are joined
# into the infest prompt. The first three entries are pre-selected in the UI.
BUG_TYPES = [
    "SyntaxError",
    "IndentationError",
    "NameError",
    "TypeError",
    "IndexError",
    "LogicError",
]

# Sample name (shown in the "Code Sample" dropdown) -> Python source text that is
# displayed in the preview and handed to the infest model as the starting code.
# Each value is built from adjacent string literals, one source line per literal.
CLEAN_SAMPLES = {
    "Binary Search": (
        "def binary_search(arr, target):\n"
        "    low, high = 0, len(arr) - 1\n"
        "    while low <= high:\n"
        "        mid = (low + high) // 2\n"
        "        if arr[mid] == target:\n"
        "            return mid\n"
        "        elif arr[mid] < target:\n"
        "            low = mid + 1\n"
        "        else:\n"
        "            high = mid - 1\n"
        "    return -1"
    ),
    "Bubble Sort": (
        "def bubble_sort(arr):\n"
        "    n = len(arr)\n"
        "    for i in range(n):\n"
        "        for j in range(n - i - 1):\n"
        "            if arr[j] > arr[j + 1]:\n"
        "                arr[j], arr[j + 1] = arr[j + 1], arr[j]\n"
        "    return arr"
    ),
    "Fibonacci": (
        "def fibonacci(n):\n"
        "    if n <= 0:\n"
        "        return 0\n"
        "    elif n == 1:\n"
        "        return 1\n"
        "    a, b = 0, 1\n"
        "    for _ in range(2, n + 1):\n"
        "        a, b = b, a + b\n"
        "    return b"
    ),
    "Matrix Multiply": (
        "def matrix_multiply(a, b):\n"
        "    rows_a, cols_a = len(a), len(a[0])\n"
        "    rows_b, cols_b = len(b), len(b[0])\n"
        "    if cols_a != rows_b:\n"
        "        raise ValueError('Incompatible dimensions')\n"
        "    result = [[0] * cols_b for _ in range(rows_a)]\n"
        "    for i in range(rows_a):\n"
        "        for j in range(cols_b):\n"
        "            for k in range(cols_a):\n"
        "                result[i][j] += a[i][k] * b[k][j]\n"
        "    return result"
    ),
    "LRU Cache": (
        "class LRUCache:\n"
        "    def __init__(self, capacity):\n"
        "        self.capacity = capacity\n"
        "        self.cache = {}\n"
        "        self.order = []\n"
        "\n"
        "    def get(self, key):\n"
        "        if key in self.cache:\n"
        "            self.order.remove(key)\n"
        "            self.order.append(key)\n"
        "            return self.cache[key]\n"
        "        return -1\n"
        "\n"
        "    def put(self, key, value):\n"
        "        if key in self.cache:\n"
        "            self.order.remove(key)\n"
        "        elif len(self.cache) >= self.capacity:\n"
        "            oldest = self.order.pop(0)\n"
        "            del self.cache[oldest]\n"
        "        self.cache[key] = value\n"
        "        self.order.append(key)"
    ),
}

In [None]:
# Prompt template for the bug-injection step. infest_code fills it with
# str.format using the fields {num_bugs}, {bug_types} and {code}; the doubled
# braces render as literal braces in the JSON skeleton shown to the model.
INFEST_PROMPT = """\
You are an expert Python instructor creating buggy code for students to debug.

Take the following CORRECT Python code and inject exactly {num_bugs} bug(s) into it.
You MUST choose bugs from these types: {bug_types}.

Rules:
- The code should still look like a real programmer's mistake — not obviously garbled.
- Typos in variable names count as NameError. Missing colons count as SyntaxError.
- Wrong indentation counts as IndentationError. Off-by-one or wrong operators count as LogicError.
- Do NOT add comments hinting at the bugs.

CORRECT CODE:
```python
{code}
```

Respond with ONLY a valid JSON object — no markdown fences, no extra text:
{{
  "buggy_code": "<the infested code with \\n for newlines>",
  "injected_bugs": [
    {{"type": "<error type>", "description": "<what you changed>"}}
  ]
}}
"""

# Prompt template for the detection step. validate_code fills only {code}, so the
# validating model gets no information about which bugs were injected. The
# doubled braces render as literal braces in the JSON skeleton.
VALIDATE_PROMPT = """\
Analyze the following Python code for errors. Find ALL bugs — syntax errors, \
logic errors, naming issues, indentation problems, type errors, index errors, \
or anything else wrong.

```python
{code}
```

Respond with ONLY a valid JSON object — no markdown fences, no extra text:
{{
  "bugs_found": [
    {{"type": "<error type e.g. SyntaxError, NameError, LogicError>", "description": "<what is wrong>", "location": "<line or area>"}}
  ],
  "total_bugs": <number>,
  "is_buggy": true or false
}}
"""

In [None]:
# Module-level debug-log state: the on/off switch and the buffer of collected
# messages. on_run rebinds both at the start of every benchmark run.
_verbose = False
_debug_log: list[str] = []


def _log(msg: str):
    """Record *msg* in the debug buffer; does nothing while verbose mode is off."""
    if not _verbose:
        return
    _debug_log.append(msg)


def parse_llm_json(text: str):
    """Extract a JSON value from LLM output, tolerating fences and stray text.

    Candidates are tried in order: the contents of the first markdown fence
    (if any), then the whole reply. For each candidate two strategies run:

    1. Parse the widest ``{...}`` span, then the widest ``[...]`` span.
    2. Decode the first complete JSON value starting at the first ``{`` (then
       ``[``) and ignore whatever follows it. This rescues replies whose
       trailing chatter happens to contain a closing brace or bracket.

    Args:
        text: Raw model reply. ``None`` is treated as an empty reply.

    Returns:
        The decoded JSON value (normally a dict or list).

    Raises:
        json.JSONDecodeError: If no JSON value can be recovered. This is a
            ``ValueError`` subclass, which is what the callers catch.
    """
    text = (text or "").strip()

    candidates = []
    fence = re.search(r"```(?:json)?\s*\n?([\s\S]*?)```", text)
    if fence:
        candidates.append(fence.group(1).strip())
    # The first fence may wrap something other than the JSON payload (for
    # example a ```python snippet), so the full reply is always a fallback.
    candidates.append(text)

    decoder = json.JSONDecoder()
    for candidate in candidates:
        # Strategy 1: widest delimiter-to-delimiter span.
        for start, end in [("{", "}"), ("[", "]")]:
            first = candidate.find(start)
            last = candidate.rfind(end)
            if first != -1 and last > first:
                try:
                    return json.loads(candidate[first : last + 1])
                except json.JSONDecodeError:
                    continue
        # Strategy 2: first complete value, trailing text ignored.
        for start in ("{", "["):
            first = candidate.find(start)
            if first == -1:
                continue
            try:
                value, _end = decoder.raw_decode(candidate[first:])
            except json.JSONDecodeError:
                continue
            return value

    # Nothing recoverable: parse the preferred candidate as-is so bare scalars
    # still decode and everything else raises JSONDecodeError for the caller.
    return json.loads(candidates[0])


def call_model(model_id: str, prompt: str, max_tokens: int = 1500) -> str:
    """Send *prompt* as a single user message to *model_id* and return the reply text.

    The request goes through the module-level ``client`` with a temperature of
    0.7. Any exception raised while making the request or reading the response
    is retried, for up to three attempts in total, sleeping 3 s and then 6 s
    between attempts.

    Args:
        model_id: Model identifier passed as ``model=``.
        prompt: Full prompt text.
        max_tokens: Completion token limit for the request.

    Returns:
        The message content of the first choice. A reply whose content is
        ``None`` is returned as ``""`` so callers always receive a string.

    Raises:
        Exception: Whatever the final attempt raised, re-raised unchanged.
    """
    max_attempts = 3
    _log(f"── REQUEST to {model_id} ({len(prompt)} chars) ──\n{prompt[:400]}{'...' if len(prompt) > 400 else ''}\n")
    for attempt in range(max_attempts):
        try:
            response = client.chat.completions.create(
                model=model_id,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=max_tokens,
                temperature=0.7,
            )
            # content can be None. Normalise it before logging: the log
            # f-string is evaluated even when verbose mode is off, and
            # len(None) used to raise inside this try block, which re-sent
            # the request and finally surfaced a misleading TypeError.
            result = response.choices[0].message.content or ""
            _log(f"── RESPONSE ({len(result)} chars) ──\n{result[:500]}{'...' if len(result) > 500 else ''}\n")
            return result
        except Exception as exc:
            if attempt == max_attempts - 1:
                raise
            wait = 3 * (2 ** attempt)
            _log(f"  RETRY {attempt+1} — {str(exc)[:120]}... waiting {wait}s")
            time.sleep(wait)
    # Not reached: the final attempt either returns or re-raises.
    return ""


def infest_code(model_id: str, clean_code: str, bug_types: list[str], num_bugs: int = 2) -> dict:
    """Ask a model to inject bugs into clean code.

    Args:
        model_id: Model identifier to query.
        clean_code: Source text to corrupt.
        bug_types: Bug categories the model may choose from.
        num_bugs: Number of bugs requested in the prompt.

    Returns:
        On success, the parsed reply: a dict containing ``"buggy_code"`` and
        an ``"injected_bugs"`` list. Otherwise
        ``{"error": ..., "raw": <first 500 chars of the reply>}``.
    """
    prompt = INFEST_PROMPT.format(
        code=clean_code,
        bug_types=", ".join(bug_types),
        num_bugs=num_bugs,
    )
    # Guard against a missing reply so parsing and slicing below stay safe.
    raw = call_model(model_id, prompt) or ""
    try:
        result = parse_llm_json(raw)
    except ValueError:  # includes json.JSONDecodeError
        result = None

    if isinstance(result, dict) and "buggy_code" in result:
        # setdefault alone would keep an explicit null (or any non-list), and
        # the scoring step iterates this value, so force it to a list.
        if not isinstance(result.get("injected_bugs"), list):
            result["injected_bugs"] = []
        return result
    return {"error": "Failed to parse infest response", "raw": raw[:500]}


def validate_code(model_id: str, code: str) -> dict:
    """Ask a model to find bugs in *code*, giving it no hint about what was injected.

    Args:
        model_id: Model identifier to query.
        code: Source text to analyse.

    Returns:
        On success, the parsed reply dict with ``"bugs_found"`` guaranteed to
        be a list and ``"total_bugs"`` defaulting to its length. Otherwise
        ``{"error": ..., "raw": <first 500 chars of the reply>}``.
    """
    prompt = VALIDATE_PROMPT.format(code=code)
    # Guard against a missing reply so parsing and slicing below stay safe.
    raw = call_model(model_id, prompt) or ""
    try:
        result = parse_llm_json(raw)
    except ValueError:  # includes json.JSONDecodeError
        result = None

    if isinstance(result, dict):
        # An explicit null or non-list would survive setdefault and make the
        # len() call below raise an uncaught TypeError.
        if not isinstance(result.get("bugs_found"), list):
            result["bugs_found"] = []
        result.setdefault("total_bugs", len(result["bugs_found"]))
        return result
    return {"error": "Failed to parse validate response", "raw": raw[:500]}

In [None]:
def _freq_map(types: list[str]) -> dict[str, int]:
    """Count occurrences of each label, keeping first-seen order."""
    return dict(Counter(types))


def _freq_str(freq: dict[str, int]) -> str:
    """Render a frequency map as ``"label xN, ..."``, most frequent first; ``"none"`` if empty."""
    if not freq:
        return "none"
    return ", ".join(f"{k} x{v}" for k, v in sorted(freq.items(), key=lambda x: -x[1]))


def _bug_labels(bugs) -> list[str]:
    """Pull the stripped ``type`` label out of each entry of an LLM-supplied bug list.

    The data comes straight from model output, so malformed shapes are
    tolerated: a non-list yields no labels, a bare (non-dict) entry is used as
    the label itself, and a missing or null type becomes ``""``.
    """
    if not isinstance(bugs, list):
        return []
    labels = []
    for bug in bugs:
        raw = bug.get("type") if isinstance(bug, dict) else bug
        labels.append("" if raw is None else str(raw).strip())
    return labels


def _match_key(label: str) -> str:
    """Reduce a label to case-folded alphanumerics so near-identical spellings compare equal."""
    return "".join(ch for ch in label.casefold() if ch.isalnum())


def compute_match_score(injected: list[dict], found: list[dict]) -> dict:
    """Compare injected bug types against detected bug types.

    Matching is a multiset intersection on normalised labels: each injected
    type can be matched by at most one detected bug of the same type. Labels
    are compared ignoring case, spaces and punctuation, so ``"logic error"``
    matches ``"LogicError"``. Entries without a type never match anything,
    but they still count towards the totals, i.e. as missed or as false
    positives.

    Args:
        injected: Bugs the infest model reported injecting (dicts with ``"type"``).
        found: Bugs the validate model reported (dicts with ``"type"``).

    Returns:
        Dict with ``injected_freq`` / ``found_freq`` (display strings built
        from the original labels), ``matched``, ``false_positives``,
        ``missed``, and ``precision`` / ``recall`` / ``f1`` rounded to two
        decimals (0 when undefined).
    """
    injected_types = _bug_labels(injected)
    found_types = _bug_labels(found)

    # Empty keys are dropped so two untyped entries cannot "match" each other.
    injected_keys = Counter(key for key in map(_match_key, injected_types) if key)
    found_keys = Counter(key for key in map(_match_key, found_types) if key)
    matched = sum((injected_keys & found_keys).values())

    precision = matched / len(found_types) if found_types else 0
    recall = matched / len(injected_types) if injected_types else 0
    f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0

    return {
        "injected_freq": _freq_str(_freq_map(injected_types)),
        "found_freq": _freq_str(_freq_map(found_types)),
        "matched": matched,
        "false_positives": len(found_types) - matched,
        "missed": len(injected_types) - matched,
        "precision": round(precision, 2),
        "recall": round(recall, 2),
        "f1": round(f1, 2),
    }


def run_benchmark(
    infest_model_id: str,
    validate_model_id: str,
    clean_code: str,
    bug_types: list[str],
    num_bugs: int,
    progress_cb=None,
) -> dict:
    """Run one infest -> validate cycle and score the outcome.

    Args:
        infest_model_id: Model that injects the bugs.
        validate_model_id: Model that looks for bugs in the infested code.
        clean_code: Starting source text.
        bug_types: Bug categories the infest model may use.
        num_bugs: Number of bugs to request.
        progress_cb: Optional ``callable(fraction, message)`` invoked before
            each stage and once more on completion.

    Returns:
        ``{"error": ...}`` if either stage reports an error; otherwise a dict
        with both model ids, the clean and buggy code, the injected and
        detected bug lists, and the match score.
    """

    def report(fraction, message):
        # Progress reporting is optional; stay silent without a callback.
        if progress_cb:
            progress_cb(fraction, message)

    report(0.1, "Infesting code...")
    infested = infest_code(infest_model_id, clean_code, bug_types, num_bugs)
    if "error" in infested:
        return {"error": f"Infest failed: {infested['error']}"}

    buggy_code = infested.get("buggy_code", "")
    injected_bugs = infested.get("injected_bugs", [])

    report(0.5, "Validating infested code...")
    validation = validate_code(validate_model_id, buggy_code)
    if "error" in validation:
        return {"error": f"Validate failed: {validation['error']}"}

    detected_bugs = validation.get("bugs_found", [])
    score = compute_match_score(injected_bugs, detected_bugs)
    report(1.0, "Done")

    return {
        "infest_model": infest_model_id,
        "validate_model": validate_model_id,
        "clean_code": clean_code,
        "buggy_code": buggy_code,
        "injected_bugs": injected_bugs,
        "detected_bugs": detected_bugs,
        "score": score,
    }

In [None]:
# Result dict of the most recent benchmark; on_run rebinds it after every run.
last_result: dict = {}

def format_infest_md(result: dict) -> str:
    """Render the infest half of a benchmark result as Markdown.

    Returns a one-line error message when *result* carries an ``"error"`` key;
    otherwise a fenced listing of the buggy code followed by one bullet per
    injected bug.
    """
    if "error" in result:
        return f"**Error:** {result['error']}"

    buggy_source = result.get("buggy_code", "")
    bullet_points = [
        f"- **{bug.get('type', '?')}**: {bug.get('description', '')}"
        for bug in result.get("injected_bugs", [])
    ]
    code_section = f"### Infested Code\n\n```python\n{buggy_source}\n```\n"
    return "\n".join([code_section, "### Injected Bugs\n", *bullet_points])


def format_validate_md(result: dict) -> str:
    """Render the validation half of a benchmark result as Markdown.

    Returns a one-line error message when *result* carries an ``"error"`` key;
    otherwise one bullet per detected bug followed by a metrics table built
    from ``result["score"]``, with defaults filled in for missing entries.
    """
    if "error" in result:
        return f"**Error:** {result['error']}"

    score = result.get("score", {})
    bullet_points = [
        f"- **{bug.get('type', '?')}** at `{bug.get('location', '?')}`: {bug.get('description', '')}"
        for bug in result.get("detected_bugs", [])
    ]

    # (row label, score key, fallback) for the plain rows of the table.
    plain_rows = (
        ("Injected", "injected_freq", "none"),
        ("Found", "found_freq", "none"),
        ("Matched", "matched", 0),
        ("Missed", "missed", 0),
        ("False Positives", "false_positives", 0),
    )
    # Headline metrics are rendered in bold and fall back to 0.
    bold_rows = (("Precision", "precision"), ("Recall", "recall"), ("F1 Score", "f1"))

    table = ["| Metric | Value |", "|--------|-------|"]
    table.extend(f"| {label} | {score.get(key, fallback)} |" for label, key, fallback in plain_rows)
    table.extend(f"| **{label}** | **{score.get(key, 0)}** |" for label, key in bold_rows)

    return "\n".join(["### Detected Bugs\n", *bullet_points, "\n### Match Score\n", *table])


def on_run(infest_model, validate_model, sample_name, bug_types, num_bugs, verbose, progress=gr.Progress()):
    """Handle the Run Benchmark button.

    Resets the debug-log state, validates the selections, runs one benchmark
    cycle and stores it in ``last_result``.

    Returns:
        A 4-tuple for the UI outputs: infest Markdown, validate Markdown, the
        result as indented JSON text, and the debug log text (empty unless
        *verbose* is set).
    """
    global last_result, _verbose, _debug_log
    _verbose, _debug_log = verbose, []

    if not bug_types:
        gr.Warning("Select at least one bug type.")
        return "Select bug types.", "Select bug types.", "{}", ""

    clean_code = CLEAN_SAMPLES.get(sample_name, "")
    if not clean_code:
        return "No sample selected.", "No sample selected.", "{}", ""

    # Translate the dropdown display names into model identifiers.
    infest_id, validate_id = MODELS[infest_model], MODELS[validate_model]
    bug_count = int(num_bugs)
    _log(f"CONFIG — infest={infest_id}, validate={validate_id}, bugs={bug_types}, n={bug_count}")

    result = run_benchmark(
        infest_id,
        validate_id,
        clean_code,
        bug_types,
        bug_count,
        lambda frac, msg: progress(frac, desc=msg),
    )
    last_result = result

    _log("\nBENCHMARK COMPLETE")
    log_text = "\n".join(_debug_log) if _verbose else ""
    raw_json = json.dumps(result, indent=2)

    if "error" in result:
        # The same error message is shown in both result panels.
        err = f"**Error:** {result['error']}"
        return err, err, raw_json, log_text

    return format_infest_md(result), format_validate_md(result), raw_json, log_text


def on_sample_select(sample_name):
    """Return the source text of the chosen sample, or "" when the name is unknown."""
    try:
        return CLEAN_SAMPLES[sample_name]
    except KeyError:
        return ""


# Display names offered in both model dropdowns.
model_names = list(MODELS.keys())

# Build the Gradio interface: inputs on top, two result columns, then two
# collapsed accordions for the raw JSON and the debug log.
with gr.Blocks(theme=gr.themes.Soft(), title="Bug Benchmark Arena") as app:
    gr.Markdown(
        "## Bug Benchmark Arena\n"
        "Pick an **Infest model** and a **Validate model**, select a clean code sample and bug types, "
        "then hit **Run Benchmark** to see how well each model performs."
    )

    # Sample picker and number of bugs to request.
    with gr.Row():
        sample_dd = gr.Dropdown(choices=list(CLEAN_SAMPLES.keys()), value="Binary Search", label="Code Sample")
        num_bugs_sl = gr.Slider(1, 5, value=2, step=1, label="Bugs to inject")

    # Read-only preview of the selected sample; the first three bug types start checked.
    code_preview = gr.Code(value=CLEAN_SAMPLES["Binary Search"], language="python", label="Clean Code", interactive=False)
    bug_cb = gr.CheckboxGroup(choices=BUG_TYPES, value=BUG_TYPES[:3], label="Bug Types")

    # Independent model choices; the defaults are the first two MODELS entries.
    with gr.Row():
        infest_dd = gr.Dropdown(choices=model_names, value=model_names[0], label="Infest Model")
        validate_dd = gr.Dropdown(choices=model_names, value=model_names[1], label="Validate Model")

    verbose_ck = gr.Checkbox(value=False, label="Show debug logs (prompts & responses)")

    run_btn = gr.Button("Run Benchmark", variant="primary")

    # Side-by-side result panels: infest output on the left, validation on the right.
    with gr.Row():
        with gr.Column():
            gr.Markdown("#### Infest Results")
            infest_output = gr.Markdown("*Run a benchmark to see infest results.*")
        with gr.Column():
            gr.Markdown("#### Validate Results")
            validate_output = gr.Markdown("*Run a benchmark to see validation results.*")

    with gr.Accordion("Raw JSON Output", open=False):
        json_output = gr.Textbox(lines=15, max_lines=15, interactive=False, show_label=False)

    with gr.Accordion("Debug Logs", open=False):
        debug_box = gr.Textbox(lines=12, max_lines=12, interactive=False, show_label=False, placeholder="Enable the debug toggle and run a benchmark to see logs.")

    # Refresh the code preview whenever a different sample is chosen.
    sample_dd.change(fn=on_sample_select, inputs=[sample_dd], outputs=[code_preview])

    # Three-step click chain: disable the button, run the benchmark, re-enable the button.
    # NOTE(review): this relies on .then() steps running even if on_run raises, so the
    # button is not left disabled — confirm against the Gradio event-chaining docs.
    run_btn.click(
        fn=lambda: gr.update(interactive=False),
        outputs=[run_btn],
    ).then(
        fn=on_run,
        inputs=[infest_dd, validate_dd, sample_dd, bug_cb, num_bugs_sl, verbose_ck],
        outputs=[infest_output, validate_output, json_output, debug_box],
    ).then(
        fn=lambda: gr.update(interactive=True),
        outputs=[run_btn],
    )

In [None]:
# Start serving the Blocks interface with Gradio's default launch settings.
app.launch()