# Week 3 Exercise — Synthetic Buggy Code Factory

**Building on the Bug Review Panel (Week 1) and the Bug Exterminator (Week 2)**, this exercise closes the loop by generating the very data those tools consume: realistic buggy Python code samples, produced entirely through LLM prompting.

**Pipeline:** Generate buggy code (Week 3) → Analyze it with personas (Week 1) → Interactively debug it (Week 2)

### What This Does

- Uses the **HuggingFace Inference API** to call multiple models (Qwen Coder, Llama, Mistral) for diverse output
- **Two-phase generation:** first expand a pool of algorithm descriptions, then infest each with realistic bugs
- Covers **easy / medium / hard** difficulty levels and multiple bug categories: the Python exceptions SyntaxError, NameError, IndentationError, TypeError, and IndexError, plus logic errors (labelled `LogicError`, which is not a built-in Python exception)
- Outputs structured JSON datasets ready for downstream training or evaluation
- Full **Gradio UI** for configuring models, bug types, sample counts, previewing results, and downloading data

In [None]:
# If you get "No module named pip", run: python -m ensurepip --upgrade
%pip install -q huggingface_hub gradio python-dotenv

In [None]:
import os
import re
import json
import time
import random
import tempfile

import gradio as gr
from huggingface_hub import InferenceClient
from dotenv import load_dotenv

In [None]:
# Read settings from a local .env file; override=True is passed so that
# python-dotenv replaces variables already set in the environment.
load_dotenv(override=True)

# API token handed to every InferenceClient created by get_client().
hf_token = os.environ.get("HF_TOKEN")
if hf_token is None or hf_token == "":
    raise EnvironmentError("HF_TOKEN not found in .env — add your HuggingFace token there.")

def get_client(model: str) -> InferenceClient:
    """Build an inference client for *model*, authenticated with ``hf_token``.

    The client is created with a 120-second timeout.
    """
    client_options = {"model": model, "token": hf_token, "timeout": 120}
    return InferenceClient(**client_options)

In [None]:
# UI display name -> HuggingFace model id. The keys populate the model
# dropdown; the value is what gets passed to get_client().
MODELS = {
    "Qwen Coder 2.5 7B": "Qwen/Qwen2.5-Coder-7B-Instruct",
    "Llama 3.1 8B": "meta-llama/Llama-3.1-8B-Instruct",
    "Mistral 7B v0.3": "mistralai/Mistral-7B-Instruct-v0.3",
}

# Difficulty levels, in the order generate_dataset() processes them.
LEVELS = ["easy", "medium", "hard"]

# Bug categories offered in the UI checkbox group. "LogicError" is a label
# for wrong-result bugs, not a built-in Python exception.
BUG_TYPES = [
    "IndentationError",
    "SyntaxError",
    "TypeError",
    "NameError",
    "IndexError",
    "LogicError",
]

# Starting pool of function descriptions per level. Each entry is phrased to
# complete the sentence "Write a single Python function that ...".
SEED_DESCRIPTIONS = {
    "easy": [
        "counts the number of vowels in a string",
        "counts the number of words in a string",
        "counts the number of characters in a string",
        "counts the number of sentences in a string",
        "counts the number of paragraphs in a string",
        "finds the maximum value in an array of numbers",
        "finds the minimum value in an array",
        "reverses a string",
        "checks if a string is a palindrome",
        "calculates the sum of all elements in a list",
    ],
    "medium": [
        "sorts an array of numbers using bubble sort algorithm",
        "sorts an array using merge sort algorithm",
        "sorts an array using quick sort algorithm",
        "searches for a target value in an array using linear search algorithm",
        "searches for a target value in a sorted array using binary search algorithm",
        "searches for a target value in an array using interpolation search algorithm",
        "implements a stack data structure with push, pop and peek",
        "implements a queue data structure with enqueue and dequeue",
        "calculates the nth Fibonacci number recursively",
        "removes duplicates from a sorted linked list",
    ],
    # NOTE(review): the first two "hard" seeds (factorial, primality) look like
    # "easy: basic math" per the guidelines in DESCRIPTION_PROMPT — confirm
    # whether they are intentionally filed under "hard".
    "hard": [
        "calculates the factorial of a number",
        "checks if a number is prime",
        "generates all possible permutations of a string",
        "finds the longest common subsequence of two strings",
        "solves the traveling salesman problem using a genetic algorithm",
        "implements Dijkstra's shortest path algorithm",
        "solves the knapsack problem using dynamic programming",
        "builds a trie data structure for prefix searching",
    ],
}

In [None]:
# Phase 1 prompt: asks the model for new function descriptions.
# Filled with str.format(count=..., level=..., existing=...), where `existing`
# is a JSON-encoded list of descriptions the model should not repeat.
DESCRIPTION_PROMPT = """\
You are a Python programming curriculum designer.

Generate exactly {count} unique, short descriptions of Python functions at the **{level}** difficulty level.
Each description must start with a lowercase verb and be one sentence (e.g., "sorts a list using insertion sort").

Difficulty guidelines:
- easy: simple loops, string operations, basic math, list traversals
- medium: classic sorting/searching algorithms, recursion, basic data structures
- hard: dynamic programming, graph algorithms, complex recursion, advanced data structures

Here are some existing descriptions to avoid duplicating:
{existing}

Respond with ONLY a JSON array of strings — no markdown fences, no explanation.
Example: ["reverses a linked list", "finds the mode of a list of numbers"]
"""

# Phase 2 prompt: asks the model to write a function and inject bugs into it.
# Filled with str.format(description=..., num_bugs=..., bug_types=..., level=...).
# The doubled braces ({{ and }}) are str.format escapes that render as literal
# braces, and "\\n" renders as the two characters backslash + n, so the model
# sees an example of a JSON-escaped newline.
BUGGY_CODE_PROMPT = """\
You are a Python instructor creating buggy code samples for students to debug.

Write a single Python function that {description}.
Then inject exactly {num_bugs} bug(s) into it — choose from these error types: {bug_types}.

Rules:
- The function should be 5-25 lines and look like a real programmer's mistake.
- Use realistic variable names; occasional typos in names count as NameError.
- Do NOT add any comments explaining or hinting at the bugs.
- Missing colons, wrong indentation, off-by-one errors, misspelled keywords — all fair game.
- The code must be broken enough to fail but not so garbled it's unreadable.

Respond with ONLY a valid JSON object — no markdown fences, no extra text:
{{
  "level": "{level}",
  "description": "{description}",
  "buggy_code": "<the buggy code as a single string with \\n for newlines>",
  "bug_types": ["<actual error types you injected>"],
  "num_bugs": {num_bugs}
}}
"""

In [None]:
def parse_llm_json(text: str):
    """Extract and parse JSON from LLM output, handling markdown fences and stray text.

    Steps:
      1. If the text contains a ``` fenced block (optionally tagged ``json``),
         only the fence content is considered.
      2. The outermost ``{...}`` and ``[...]`` spans are located and tried in
         order of where their opening bracket appears, so a top-level array
         of objects is returned as a list rather than as its first object.
      3. If neither span parses, the whole text is parsed as-is.

    Returns:
        The decoded JSON value (typically a dict or a list).

    Raises:
        json.JSONDecodeError: if no candidate could be parsed.
    """
    text = text.strip()
    fence = re.search(r"```(?:json)?\s*\n?([\s\S]*?)```", text)
    if fence:
        text = fence.group(1).strip()

    # Collect (start, end) for each bracket style that has a plausible span.
    spans = []
    for opener, closer in (("{", "}"), ("[", "]")):
        first = text.find(opener)
        last = text.rfind(closer)
        if first != -1 and last > first:
            spans.append((first, last))

    # Earliest opener first: the outermost JSON value starts there.
    for first, last in sorted(spans):
        try:
            return json.loads(text[first : last + 1])
        except json.JSONDecodeError:
            continue

    # Last resort — also the path that raises for unparseable output.
    return json.loads(text)


# Debug-log state: messages are kept only while _verbose is True.
_verbose = False
_debug_log: list[str] = []


def _log(msg: str):
    """Append *msg* to the debug log; does nothing when verbose mode is off."""
    if not _verbose:
        return
    _debug_log.append(msg)


def call_llm(client: InferenceClient, prompt: str, max_tokens: int = 1024) -> str:
    """Send *prompt* as a single user message and return the reply text.

    The call is attempted up to four times. After a failed attempt the
    function sleeps before retrying: 10s/20s/40s when the error message
    mentions a timeout (cold-start), 2s/4s/8s otherwise. The exception from
    the final attempt is re-raised to the caller.

    Args:
        client: Inference client to call ``chat_completion`` on.
        prompt: Full prompt text, sent as the only message.
        max_tokens: Generation limit forwarded to the API.

    Returns:
        The message content of the first choice; an empty string when the
        API returns no content (so callers' JSON parsing fails cleanly
        instead of this function crashing on ``None``).
    """
    max_attempts = 4
    _log(f"── PROMPT ({len(prompt)} chars) ──\n{prompt[:300]}{'...' if len(prompt) > 300 else ''}\n")
    for attempt in range(max_attempts):
        try:
            response = client.chat_completion(
                messages=[{"role": "user", "content": prompt}],
                max_tokens=max_tokens,
                temperature=0.8,
            )
            # content can be None; normalise so len()/slicing below cannot
            # raise and trigger pointless retries.
            result = response.choices[0].message.content or ""
            _log(f"── RESPONSE ({len(result)} chars) ──\n{result[:400]}{'...' if len(result) > 400 else ''}\n")
            return result
        except Exception as exc:
            if attempt == max_attempts - 1:
                raise
            reason = str(exc)
            lowered = reason.lower()
            is_timeout = "timeout" in lowered or "timed out" in lowered
            wait = (10 if is_timeout else 2) * (2 ** attempt)
            _log(f"  RETRY {attempt+1} — {reason[:120]}... waiting {wait}s")
            time.sleep(wait)
    # Not reached: the last attempt either returns or re-raises.
    return ""


def generate_descriptions(client: InferenceClient, level: str, count: int, existing: list[str]) -> list[str]:
    """Use the LLM to generate new algorithm descriptions for a given level.

    Args:
        client: Inference client used for the request.
        level: Difficulty level inserted into the prompt.
        count: Number of descriptions requested from the model.
        existing: Descriptions already in the pool. Only the first ten are
            shown to the model, but the whole list is used for filtering.

    Returns:
        Stripped, non-empty descriptions that are neither in *existing* nor
        repeated within the reply, in the order the model gave them. Returns
        an empty list when the reply is not a JSON array or cannot be parsed.
        The list is not truncated to *count*.
    """
    prompt = DESCRIPTION_PROMPT.format(
        count=count,
        level=level,
        existing=json.dumps(existing[:10]),
    )
    raw = call_llm(client, prompt)
    try:
        descriptions = parse_llm_json(raw)
    except (json.JSONDecodeError, ValueError) as e:
        _log(f"  PARSE FAILED (descriptions): {e}")
        return []
    if not isinstance(descriptions, list):
        return []

    # Track everything already accepted so duplicates inside the reply are
    # dropped too, not just collisions with the existing pool.
    seen = set(existing)
    result = []
    for item in descriptions:
        if not isinstance(item, str):
            continue
        candidate = item.strip()
        if not candidate or candidate in seen:
            continue
        seen.add(candidate)
        result.append(candidate)
    _log(f"  PARSED {len(result)} new descriptions")
    return result


def generate_buggy_sample(
    client: InferenceClient,
    model_id: str,
    level: str,
    description: str,
    bug_types: list[str],
    num_bugs: int = 2,
) -> dict | None:
    """Generate a single buggy code sample for the given description.

    Up to *num_bugs* distinct entries are drawn at random from *bug_types*
    and offered to the model as the allowed error types.

    Args:
        client: Inference client used for the request.
        model_id: Recorded on the sample under the ``"model"`` key.
        level: Difficulty level; used in the prompt and as a fallback value.
        description: What the function should do; also a fallback value.
        bug_types: Error types the model may choose from.
        num_bugs: Number of bugs the model is asked to inject.

    Returns:
        The parsed sample dict with ``"model"`` set and ``"level"`` /
        ``"description"`` filled in when the model omitted them, or ``None``
        when the reply is not parseable JSON, is not an object with a
        ``"buggy_code"`` key, or its code is empty / not text.
    """
    chosen_bugs = random.sample(bug_types, min(num_bugs, len(bug_types)))
    _log(f"  BUGS CHOSEN: {chosen_bugs}")
    prompt = BUGGY_CODE_PROMPT.format(
        description=description,
        num_bugs=num_bugs,
        bug_types=", ".join(chosen_bugs),
        level=level,
    )
    raw = call_llm(client, prompt)
    try:
        sample = parse_llm_json(raw)
    except (json.JSONDecodeError, ValueError) as e:
        _log(f"  PARSE FAILED (sample): {e}")
        return None

    if not isinstance(sample, dict) or "buggy_code" not in sample:
        _log("  PARSE WARNING — JSON valid but missing 'buggy_code' key")
        return None

    code = sample["buggy_code"]
    # Some replies give the code as a list of lines; accept that shape.
    if isinstance(code, list) and all(isinstance(line, str) for line in code):
        code = "\n".join(code)
    # Downstream code slices and strips this value, so it must be real text.
    if not isinstance(code, str) or not code.strip():
        _log("  PARSE WARNING — 'buggy_code' is empty or not a string")
        return None

    sample["buggy_code"] = code
    sample["model"] = model_id
    sample.setdefault("level", level)
    sample.setdefault("description", description)
    _log(f"  PARSED OK — {sample.get('num_bugs', '?')} bug(s), {len(code)} chars")
    return sample

In [None]:
def generate_dataset(
    model_id: str,
    bug_types: list[str],
    counts: dict[str, int],
    expand_descriptions: bool,
    progress_cb=None,
) -> tuple[list[dict], str]:
    """Orchestrate the full generation pipeline.

    For each level in LEVELS with a positive count: start from the seed
    descriptions, optionally ask the LLM for more when the seeds are fewer
    than requested, shuffle, then generate one buggy sample per description
    (cycling through the pool if it is still too small). Each sample gets a
    random bug count between 1 and 3.

    A failure while expanding descriptions or generating one sample is
    logged and skipped, so samples already produced are not lost.

    Args:
        model_id: HuggingFace model id used for every call.
        bug_types: Error types samples may contain.
        counts: Requested number of samples per level.
        expand_descriptions: Whether to grow the description pool via the LLM.
        progress_cb: Optional ``progress_cb(fraction, description)`` callback
            invoked on every status message.

    Returns:
        ``(samples_list, status_log)`` where the log is newline-joined status
        lines ending with a summary of samples per level.
    """
    client = get_client(model_id)
    samples = []
    log_lines = []
    total = sum(counts.values())
    done = 0

    def report(msg):
        log_lines.append(msg)
        if progress_cb:
            progress_cb(done / max(total, 1), msg)

    for level in LEVELS:
        target = counts.get(level, 0)
        if target <= 0:
            continue

        pool = list(SEED_DESCRIPTIONS.get(level, []))

        if expand_descriptions and target > len(pool):
            needed = target - len(pool)
            report(f"Expanding {level} descriptions (+{needed})...")
            try:
                extras = generate_descriptions(client, level, needed, pool)
            except Exception as exc:
                # Expansion is optional; fall back to what is already pooled.
                extras = []
                report(f"  Expansion failed ({exc}); using existing descriptions")
            pool.extend(extras)
            report(f"  Got {len(extras)} new descriptions")
            time.sleep(1)

        if not pool:
            # Nothing to cycle through (would otherwise divide by zero).
            done += target
            report(f"[{level}] SKIP — no descriptions available")
            continue

        random.shuffle(pool)
        # Take `target` descriptions, wrapping around when the pool is short.
        descs_to_use = [pool[i % len(pool)] for i in range(target)]

        for i, desc in enumerate(descs_to_use):
            report(f"[{level}] {i+1}/{target}: {desc[:60]}...")
            num_bugs = random.randint(1, 3)
            try:
                sample = generate_buggy_sample(client, model_id, level, desc, bug_types, num_bugs)
            except Exception as exc:
                sample = None
                failure = f"  SKIP — generation failed: {exc}"
            else:
                failure = "  SKIP — failed to parse LLM output"
            # Count the item before reporting so progress can reach 100%.
            done += 1
            if sample:
                samples.append(sample)
                report(f"  OK — {sample.get('num_bugs', '?')} bug(s)")
            else:
                report(failure)
            time.sleep(0.5)

    stats = {}
    for s in samples:
        lvl = s.get("level", "?")
        stats[lvl] = stats.get(lvl, 0) + 1
    summary = f"Done: {len(samples)}/{total} samples — " + ", ".join(f"{k}: {v}" for k, v in stats.items())
    log_lines.append(summary)

    return samples, "\n".join(log_lines)

In [None]:
# Samples from the most recent generation run; shared by the UI callbacks
# (row selection, validation, JSONL export).
generated_samples: list[dict] = []

# Markdown shown in the detail pane when no sample is displayed.
DETAIL_PLACEHOLDER = "*Click a row in the table above to inspect the full sample.*"


def format_sample_markdown(sample: dict) -> str:
    """Render one sample as Markdown: heading, metadata line, fenced code.

    Sample fields come from LLM output, so missing, ``None`` or non-string
    values are tolerated and converted to text instead of raising.

    Args:
        sample: Dict that may contain ``level``, ``description``,
            ``buggy_code``, ``bug_types``, ``num_bugs`` and ``model``.

    Returns:
        Markdown with an upper-cased level heading, a bug types / count /
        model line, and the code in a ``python`` fenced block.
    """
    level = str(sample.get("level") or "unknown")
    desc = str(sample.get("description") or "")
    code = str(sample.get("buggy_code") or "")
    bugs = sample.get("bug_types") or []
    if isinstance(bugs, (list, tuple)):
        bugs = ", ".join(str(bug) for bug in bugs)
    num = sample.get("num_bugs", "?")
    model = sample.get("model") or ""

    # Lengthen the fence until it cannot collide with backticks in the code.
    fence = "```"
    while fence in code:
        fence += "`"

    return (
        f"### {level.upper()} — {desc}\n\n"
        f"**Bug types:** {bugs} &nbsp;|&nbsp; **Count:** {num} &nbsp;|&nbsp; **Model:** `{model}`\n\n"
        f"{fence}python\n{code}\n{fence}"
    )


def on_generate(model_name, bug_types, easy_n, med_n, hard_n, expand, verbose, progress=gr.Progress()):
    """Gradio callback: run generation and refresh the table, file and panes.

    Args:
        model_name: Key into MODELS chosen in the dropdown.
        bug_types: Selected bug type names; must not be empty.
        easy_n, med_n, hard_n: Requested sample counts per level.
        expand: Whether to expand the description pool via the LLM.
        verbose: Whether to collect prompts/responses in the debug log.
        progress: Gradio progress tracker updated as samples are generated.

    Returns:
        A 5-tuple of updates for (preview table, JSON file, generate button,
        detail markdown, debug log text). The generate button is re-enabled
        on every path, including when generation raises.
    """
    global generated_samples, _verbose, _debug_log, _selected_idx
    _verbose = verbose
    _debug_log = []

    if not bug_types:
        gr.Warning("Select at least one bug type.")
        return gr.update(), None, gr.update(interactive=True), DETAIL_PLACEHOLDER, ""

    model_id = MODELS[model_name]
    counts = {"easy": int(easy_n), "medium": int(med_n), "hard": int(hard_n)}
    _log(f"CONFIG — model={model_id}, counts={counts}, expand={expand}, bugs={bug_types}")

    def progress_cb(frac, msg):
        progress(frac, desc=msg)

    try:
        samples, _ = generate_dataset(model_id, bug_types, counts, expand, progress_cb)
    except Exception as exc:
        # If this propagated, the button disabled by the click handler would
        # never be re-enabled. Keep the previous results and report instead.
        _log(f"\nFAILED — {exc}")
        gr.Warning(f"Generation failed: {exc}")
        log_text = "\n".join(_debug_log) if _verbose else ""
        return gr.update(), gr.update(), gr.update(interactive=True), gr.update(), log_text

    generated_samples = samples
    # The detail pane shows the first sample below, so make it the selected
    # one; otherwise Validate would act on a stale index from the last run.
    _selected_idx = 0 if samples else -1

    _log(f"\nFINISHED — {len(samples)} samples generated")
    log_text = "\n".join(_debug_log) if _verbose else ""

    if not samples:
        return gr.update(value=None), None, gr.update(interactive=True), DETAIL_PLACEHOLDER, log_text

    rows = []
    for s in samples:
        # Fields come from LLM output; coerce to text before slicing/joining.
        code_preview = str(s.get("buggy_code") or "")[:80].replace("\n", "↵") + "..."
        bugs = s.get("bug_types", "")
        bug_types_str = ", ".join(str(b) for b in bugs) if isinstance(bugs, list) else str(bugs)
        rows.append([
            str(s.get("level", "")),
            str(s.get("description", ""))[:60],
            code_preview,
            bug_types_str,
            str(s.get("num_bugs", "?")),
            str(s.get("model", "")).split("/")[-1],
        ])

    # delete=False keeps the file on disk after closing so Gradio can serve
    # it; the context manager closes the handle even if dumping fails.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".json", delete=False, prefix="buggy_dataset_", encoding="utf-8"
    ) as tmp:
        json.dump(samples, tmp, indent=2)

    first_detail = format_sample_markdown(samples[0])

    return (
        gr.update(value=rows),
        tmp.name,
        gr.update(interactive=True),
        first_detail,
        log_text,
    )


# Index into generated_samples of the row last selected in the preview table;
# -1 means no row has been selected.
_selected_idx = -1

# Prompt for the validation pass; filled with str.format(code=...).
# The trailing backslash on the first line joins it with the next line.
VALIDATE_PROMPT = """\
Analyze the following Python code. Identify ALL errors — syntax errors, \
logic errors, naming issues, indentation problems, or anything else wrong.

For each error found, state:
1. The line or area where it occurs
2. What type of error it is
3. A brief explanation

Then give a corrected version of the full code.

```python
{code}
```
"""

# Markdown shown in the validation pane before a validation has been run.
VALIDATE_PLACEHOLDER = "*Select a row and click **Validate** to independently check the code for errors.*"


def on_row_select(evt: gr.SelectData):
    """Gradio callback: remember the clicked row and show its sample.

    Returns a ``(detail markdown, validation markdown)`` pair. The validation
    pane is always reset to its placeholder; the detail pane falls back to
    its placeholder when the row has no matching sample.
    """
    global _selected_idx
    position = evt.index
    # A list/tuple index carries the row in its first element.
    if isinstance(position, (list, tuple)):
        position = position[0]
    _selected_idx = position

    in_range = 0 <= position < len(generated_samples)
    detail = format_sample_markdown(generated_samples[position]) if in_range else DETAIL_PLACEHOLDER
    return detail, VALIDATE_PLACEHOLDER


def on_validate(model_name):
    """Gradio callback: ask the chosen model to review the selected sample.

    Returns Markdown for the validation pane: the model's analysis, the
    placeholder when no row is selected, or a message when the sample has no
    code or the LLM call fails.
    """
    has_selection = 0 <= _selected_idx < len(generated_samples)
    if not has_selection:
        gr.Warning("Select a row first.")
        return VALIDATE_PLACEHOLDER

    code = generated_samples[_selected_idx].get("buggy_code", "")
    if code.strip() == "":
        return "No code to validate."

    client = get_client(MODELS[model_name])
    prompt = VALIDATE_PROMPT.format(code=code)
    try:
        verdict = call_llm(client, prompt, max_tokens=1500)
    except Exception as e:
        return f"**Validation failed:** {e}"
    return f"### Independent Validation\n\n{verdict}"


def on_download_jsonl():
    """Gradio callback: write the current samples to a temporary JSONL file.

    Returns:
        Path of the written file (one JSON object per line), or ``None``
        after showing a warning when no samples have been generated yet.
    """
    if not generated_samples:
        gr.Warning("Generate a dataset first.")
        return None

    # delete=False keeps the file on disk after closing so it can be served;
    # the context manager closes the handle even if serialisation fails.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".jsonl", delete=False, prefix="buggy_dataset_", encoding="utf-8"
    ) as tmp:
        tmp.writelines(json.dumps(s) + "\n" for s in generated_samples)
    return tmp.name


# Gradio UI: configuration controls, results table, detail/validation panes,
# debug log, and download widgets, followed by the event wiring.
with gr.Blocks(theme=gr.themes.Soft(), title="Buggy Code Factory") as app:
    gr.Markdown(
        "## Synthetic Buggy Code Factory\n"
        "Configure the generation parameters below, pick a model, select bug types, "
        "set how many samples per difficulty level, and hit **Generate**."
    )

    with gr.Row():
        # Left column: which model to use and what kinds of bugs to inject.
        with gr.Column(scale=1):
            model_dd = gr.Dropdown(
                choices=list(MODELS.keys()),
                value=list(MODELS.keys())[0],
                label="Model",
            )
            bug_cb = gr.CheckboxGroup(
                choices=BUG_TYPES,
                value=BUG_TYPES[:3],
                label="Bug Types to Inject",
            )
            expand_ck = gr.Checkbox(
                value=True,
                label="Expand description pool via LLM",
            )

        # Right column: sample counts per difficulty level (0-20 each).
        with gr.Column(scale=1):
            easy_sl = gr.Slider(0, 20, value=3, step=1, label="Easy samples")
            med_sl = gr.Slider(0, 20, value=3, step=1, label="Medium samples")
            hard_sl = gr.Slider(0, 20, value=2, step=1, label="Hard samples")
            verbose_ck = gr.Checkbox(value=False, label="Show debug logs (prompts & responses)")

    gen_btn = gr.Button("Generate Dataset", variant="primary")

    # Read-only results table; selecting a row drives the detail pane below.
    preview_df = gr.DataFrame(
        label="Click a row to inspect",
        headers=["Level", "Description", "Code Preview", "Bug Types", "Bugs", "Model"],
        interactive=False,
    )
    detail_md = gr.Markdown(DETAIL_PLACEHOLDER)
    validate_btn = gr.Button("Validate Selected Code", variant="secondary")
    validate_md = gr.Markdown(VALIDATE_PLACEHOLDER)

    with gr.Accordion("Debug Logs", open=False, visible=True):
        debug_box = gr.Textbox(lines=12, max_lines=12, interactive=False, show_label=False, placeholder="Enable the debug toggle and generate to see logs here.")

    with gr.Row():
        json_file = gr.File(label="Download JSON", interactive=False)
        jsonl_btn = gr.Button("Export as JSONL")
        jsonl_file = gr.File(label="Download JSONL", interactive=False)

    # Two-step click: first disable the button, then run generation.
    # on_generate's third output (gen_btn) is what re-enables the button.
    gen_btn.click(
        fn=lambda: gr.update(interactive=False),
        outputs=[gen_btn],
    ).then(
        fn=on_generate,
        inputs=[model_dd, bug_cb, easy_sl, med_sl, hard_sl, expand_ck, verbose_ck],
        outputs=[preview_df, json_file, gen_btn, detail_md, debug_box],
    )

    # Row selection updates the detail pane and resets the validation pane.
    preview_df.select(fn=on_row_select, outputs=[detail_md, validate_md])
    # Validation uses whichever model is currently chosen in the dropdown.
    validate_btn.click(fn=on_validate, inputs=[model_dd], outputs=[validate_md])
    jsonl_btn.click(fn=on_download_jsonl, outputs=[jsonl_file])

In [None]:
# Launch the Gradio app defined above.
app.launch()