# Week 4 Challenge

This notebook uses several open-source models to:
- Detect bugs and security vulnerabilities
- Suggest code improvements
- Generate unit tests
- Add docstrings and comments
- Compare results across multiple models

**Models used** (via OpenRouter): Qwen2.5-Coder 32B, Qwen3 Coder, DeepSeek R1, GPT-OSS-20B

## 1. Setup & Configuration

In [1]:
import os
import json
import re
import logging
from typing import Any
from dotenv import load_dotenv
from openai import OpenAI
import gradio as gr

# Emit INFO-and-above records using a compact "LEVEL: message" layout.
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
# Module-level logger used by the API helpers to report failed calls and unparseable replies.
logger = logging.getLogger(__name__)

In [None]:
# Read settings from a local .env file; override=True lets values from .env
# replace variables that are already set in the process environment.
load_dotenv(override=True)

openrouter_api_key = os.getenv("OPENROUTER_API_KEY")
openrouter_url = "https://openrouter.ai/api/v1"

# The client stays None when no key is configured; the model registry checks
# this before registering anything.
openrouter_client: OpenAI | None = None
if openrouter_api_key:
    openrouter_client = OpenAI(api_key=openrouter_api_key, base_url=openrouter_url)
    # Echo only a short prefix so the full secret never lands in notebook output.
    print(f"OpenRouter connected (key: {openrouter_api_key[:6]}...)")
else:
    print("OPENROUTER_API_KEY not set — add it to your .env file")

In [None]:
# ---------------------------------------------------------------------------
# Model registry — add or remove models here without touching any other code
# ---------------------------------------------------------------------------
# Maps a short registry key to that model's settings: display "name",
# OpenRouter "model" id, "description", and the "client" used to call it.
MODELS: dict[str, dict[str, Any]] = {}

if openrouter_client:
    _openrouter_models = [
        {
            "key": "qwen-2.5-coder-32b",
            "name": "Qwen2.5-Coder 32B",
            "model": "qwen/qwen-2.5-coder-32b-instruct",
            "description": "Alibaba's top open-source code model",
        },
        {
            "key": "qwen3-coder",
            "name": "Qwen3 Coder 480B A35B",
            "model": "qwen/qwen3-coder",
            "description": "Latest Qwen3 coding model",
        },
        {
            "key": "deepseek-r1",
            "name": "DeepSeek R1",
            "model": "deepseek/deepseek-r1",
            "description": "DeepSeek reasoning model",
        },
        {
            "key": "gpt-oss-20b",
            "name": "GPT-OSS 20B",
            "model": "openai/gpt-oss-20b",
            "description": "OpenAI open-source 20B model",
        },
    ]
    # Every OpenRouter model shares the same client instance.
    for m in _openrouter_models:
        MODELS[m["key"]] = {**m, "client": openrouter_client}

if MODELS:
    print(f"{len(MODELS)} model(s) ready:")
    for key, cfg in MODELS.items():
        print(f"   • {cfg['name']} ({key})")
else:
    print("No models configured. Ensure OPENROUTER_API_KEY is set.")

## 2. System Prompts

In [9]:
# System prompt used by detect_bugs(): requests a bare JSON array of issue
# objects (severity, line, issue, suggestion) so the reply can be parsed by
# _extract_json() without any surrounding prose.
BUG_DETECTION_SYSTEM_PROMPT = """You are an expert code reviewer specializing in bugs, security vulnerabilities, and logic errors.

Analyze the provided Python code and return ONLY a valid JSON array. No preamble, no explanation, no markdown fences.

Each element must follow this schema exactly:
[
  {
    "severity": "critical|high|medium|low",
    "line": <integer or null>,
    "issue": "concise description of the problem",
    "suggestion": "actionable fix"
  }
]

Severity definitions:
- critical: security vulnerabilities, data loss, crashes
- high: logic bugs that produce wrong results
- medium: edge-case failures, missing error handling
- low: code smell, minor style issues

Return [] if no issues are found."""


# System prompt used by add_docstrings(): asks for the same code back with
# Google-style docstrings and sparing inline comments, and nothing else.
DOCSTRING_SYSTEM_PROMPT = """You are a senior Python engineer specialising in clear, professional documentation.

Task: enhance the given Python code by adding Google-style docstrings and inline comments.

Rules:
- Add module, class, and function docstrings following PEP 257
- Use Google-style (Args / Returns / Raises sections)
- Write inline comments only for non-obvious logic — never for obvious lines
- Preserve ALL original logic unchanged
- Return ONLY the updated Python code — no markdown fences, no explanations"""


# System prompt used by suggest_improvements(): requests a bare JSON array of
# suggestion objects (category, line, current, improved, explanation).
IMPROVEMENTS_SYSTEM_PROMPT = """You are a senior software engineer focused on code quality.

Analyse the Python code and suggest improvements. Return ONLY a valid JSON array. No preamble, no markdown.

Schema:
[
  {
    "category": "readability|performance|style|error_handling|security",
    "line": <integer or null>,
    "current": "current code snippet (keep short)",
    "improved": "improved code snippet (keep short)",
    "explanation": "why this change is beneficial"
  }
]

Only include meaningful, actionable improvements. Return [] if the code is already well-written."""


# System prompt used by generate_tests(): asks for a pytest suite returned as
# plain Python source with no surrounding explanation.
TEST_GENERATION_SYSTEM_PROMPT = """You are an expert in writing comprehensive pytest test suites.

Generate pytest unit tests for the given Python code. Requirements:
- Cover happy-path, edge cases, and error conditions
- Write tests for any identified bugs
- Use descriptive test function names (test_<function>_<scenario>)
- Add a brief docstring to each test explaining what it checks
- Group tests in a class per function under test
- Include all required imports at the top

Return ONLY valid Python code. No explanations, no markdown fences."""

## 3. Core API Layer

In [10]:
def _call_model(
    model_key: str,
    system_prompt: str,
    user_prompt: str,
    temperature: float = 0.1,
    max_tokens: int = 4096,
) -> dict[str, Any]:
    """Send one system/user prompt pair to a registered model.

    Failures while talking to the API are logged and reported through the
    returned dict; they are not propagated to the caller.

    Args:
        model_key: Key into the MODELS registry.
        system_prompt: Instruction prompt for the model.
        user_prompt: The user-facing input.
        temperature: Sampling temperature (lower = more deterministic).
        max_tokens: Maximum tokens in the response.

    Returns:
        dict with keys ``success`` (bool), ``model`` (str),
        ``content`` (str | None), and ``error`` (str | None). ``model`` holds
        the registry display name, or the raw key when the key is unknown.
    """

    def _failure(label: str, message: str) -> dict[str, Any]:
        # Shared shape for both failure paths (unknown key, API error).
        return {"success": False, "model": label, "content": None, "error": message}

    cfg = MODELS.get(model_key)
    if cfg is None:
        return _failure(model_key, f"Unknown model key: '{model_key}'")

    chat_messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    try:
        completion = cfg["client"].chat.completions.create(
            model=cfg["model"],
            messages=chat_messages,
            temperature=temperature,
            max_tokens=max_tokens,
        )
        return {
            "success": True,
            "model": cfg["name"],
            "content": completion.choices[0].message.content,
            "error": None,
        }
    except Exception as exc:
        logger.error("Model call failed for '%s': %s", model_key, exc)
        return _failure(cfg["name"], str(exc))


def _extract_json(text: str) -> list[dict]:
    """Robustly extract a JSON array from model output.

    Handles optional markdown code fences and leading/trailing prose.

    Args:
        text: Raw model response text.

    Returns:
        Parsed list of dicts, or an empty list on failure.
    """
    # Strip markdown fences
    text = re.sub(r"```(?:json)?\n?", "", text).strip()

    # Try greedy JSON array match first, then full-text parse
    for candidate in (re.search(r"\[\s*\{.*\}\s*\]", text, re.DOTALL), None):
        snippet = candidate.group() if candidate else text
        try:
            parsed = json.loads(snippet)
            if isinstance(parsed, list):
                return parsed
        except json.JSONDecodeError:
            continue

    logger.warning("Could not parse JSON from model response; returning empty list.")
    return []

## 4. Analysis Functions

In [11]:
def detect_bugs(code: str, model_key: str) -> dict[str, Any]:
    """Detect bugs and security issues in Python code.

    Args:
        code: Python source code to analyse.
        model_key: Key of the model to use from MODELS.

    Returns:
        The ``_call_model`` result dict (``success``, ``model``, ``content``,
        ``error``) extended with ``issues``: the parsed list of issue dicts,
        empty when the call failed or returned no text.
    """
    result = _call_model(
        model_key,
        BUG_DETECTION_SYSTEM_PROMPT,
        f"Analyse this Python code for bugs and security issues:\n\n```python\n{code}\n```",
        temperature=0.1,
    )
    # A call can succeed yet carry no text (content is None); only parse real text.
    raw = result.get("content") if result.get("success") else None
    issues = _extract_json(raw) if raw else []
    return {**result, "issues": issues}


def add_docstrings(code: str, model_key: str) -> dict[str, Any]:
    """Ask a model to add Google-style docstrings and inline comments.

    Args:
        code: Python source code to document.
        model_key: Key of the model to use from MODELS.

    Returns:
        The ``_call_model`` result dict extended with ``documented_code``
        (str): the reply with markdown fences removed, or an empty string
        when there was no reply text.
    """
    user_prompt = f"Add proper docstrings and inline comments to this Python code:\n\n```python\n{code}\n```"
    response = _call_model(model_key, DOCSTRING_SYSTEM_PROMPT, user_prompt, temperature=0.1)

    # The prompt forbids fences, but models sometimes add them anyway.
    reply_text = response.get("content") or ""
    fence_free = re.sub(r"```(?:python)?\n?", "", reply_text)
    return {**response, "documented_code": fence_free.strip()}


def suggest_improvements(code: str, model_key: str) -> dict[str, Any]:
    """Suggest readability, performance, and best-practice improvements.

    Args:
        code: Python source code to improve.
        model_key: Key of the model to use from MODELS.

    Returns:
        The ``_call_model`` result dict (``success``, ``model``, ``content``,
        ``error``) extended with ``improvements``: the parsed list of
        suggestion dicts, empty when the call failed or returned no text.
    """
    result = _call_model(
        model_key,
        IMPROVEMENTS_SYSTEM_PROMPT,
        f"Suggest improvements for this Python code:\n\n```python\n{code}\n```",
        temperature=0.2,
    )
    # A call can succeed yet carry no text (content is None); only parse real text.
    raw = result.get("content") if result.get("success") else None
    improvements = _extract_json(raw) if raw else []
    return {**result, "improvements": improvements}


def generate_tests(code: str, bugs: list[dict], model_key: str) -> dict[str, Any]:
    """Generate a pytest test suite for the given code.

    Args:
        code: Python source code to test.
        bugs: List of bug dicts (from detect_bugs) to include regression tests for.
            Entries that are not dicts are ignored; missing or null fields are
            rendered as ``?`` / empty text instead of raising.
        model_key: Key of the model to use from MODELS.

    Returns:
        The ``_call_model`` result dict extended with ``test_code`` (str): the
        reply with markdown fences removed, or an empty string when there was
        no reply text.
    """
    # Bug dicts come from model-generated JSON, so any field may be absent or null.
    bug_entries = []
    for bug in bugs or []:
        if not isinstance(bug, dict):
            continue
        line = bug.get("line")
        where = "?" if line is None else line
        severity = str(bug.get("severity") or "?").upper()
        summary = bug.get("issue") or ""
        bug_entries.append(f"  - Line {where} [{severity}]: {summary}")

    bugs_section = ""
    if bug_entries:
        bug_lines = "\n".join(bug_entries)
        bugs_section = f"\n\nKnown bugs to cover with regression tests:\n{bug_lines}"

    result = _call_model(
        model_key,
        TEST_GENERATION_SYSTEM_PROMPT,
        f"Generate pytest tests for this Python code:{bugs_section}\n\n```python\n{code}\n```",
        temperature=0.3,
        max_tokens=6000,
    )
    # Strip any residual markdown fences the model may have added
    test_code = re.sub(r"```(?:python)?\n?", "", result.get("content") or "").strip()
    return {**result, "test_code": test_code}

## 5. Formatting Helpers

In [12]:
_SEVERITY_EMOJI = {"CRITICAL": "ðŸ”´", "HIGH": "ðŸŸ ", "MEDIUM": "ðŸŸ¡", "LOW": "ðŸ”µ"}
_SEVERITY_ORDER = {"critical": 0, "high": 1, "medium": 2, "low": 3}


def _error_msg(result: dict) -> str:
    """Return a formatted error string for a failed model call."""
    return f"**Error ({result.get('model', 'Unknown')}):** {result.get('error', 'Unknown error')}"


def format_bugs_output(result: dict) -> str:
    """Render bug-detection results as Markdown."""
    if not result.get("success"):
        return _error_msg(result)

    issues = result.get("issues", [])
    if not issues:
        return f"**{result['model']}**: No issues found â€” code looks clean!"

    sorted_issues = sorted(issues, key=lambda x: _SEVERITY_ORDER.get(x.get("severity", "low"), 3))
    lines = [f"### {result['model']} â€” {len(issues)} issue(s) found\n"]

    for issue in sorted_issues:
        sev = issue.get("severity", "unknown").upper()
        emoji = _SEVERITY_EMOJI.get(sev, "âšª")
        line_ref = f"Line {issue['line']}" if issue.get("line") else "General"
        lines.append(f"{emoji} **{sev}** ({line_ref}): {issue.get('issue', '')}")
        if fix := issue.get("suggestion"):
            lines.append(f"*Fix:* {fix}")
        lines.append("")

    return "\n".join(lines)


def format_docstring_output(result: dict) -> str:
    """Pick the text to show in the "Docstrings" code panel.

    Returns the documented code on success, a short placeholder when the
    model produced nothing, or the formatted error for a failed call.
    """
    if result.get("success"):
        documented = result.get("documented_code")
        if documented:
            return documented
        return f"**{result.get('model', '?')}**: No output generated."
    return _error_msg(result)


def format_improvements_output(result: dict) -> str:
    """Render improvement suggestions as Markdown.

    Args:
        result: Dict as returned by ``suggest_improvements`` (``success``,
            ``model``, ``improvements``, ``error``).

    Returns:
        An error line when the call failed, a "no changes needed" line when
        the list is empty, otherwise a heading followed by one section per
        suggestion. Before/after snippets are clipped to 80 characters.
        Entries that are not dicts are skipped and null fields fall back to
        their defaults instead of raising.
    """
    if not result.get("success"):
        return _error_msg(result)

    # Suggestions come from model-generated JSON, so be defensive about their shape.
    improvements = [
        item for item in (result.get("improvements") or []) if isinstance(item, dict)
    ]
    if not improvements:
        return f"**{result['model']}**: Code follows best practices — no major changes needed."

    def _clip(snippet: str, limit: int = 80) -> str:
        # Keep inline code short; mark truncation with an ellipsis.
        return snippet if len(snippet) <= limit else f"{snippet[:limit]}…"

    lines = [f"### {result['model']} — {len(improvements)} suggestion(s)\n"]
    for imp in improvements:
        category = str(imp.get("category") or "general").replace("_", " ").title()
        line_ref = f"Line {imp['line']}" if imp.get("line") else "General"
        lines.append(f"\n**{category}** ({line_ref})")
        current = str(imp.get("current") or "")
        improved = str(imp.get("improved") or "")
        # Show the pair only when both halves exist; one without the other is not useful.
        if current and improved:
            lines.append(f"  - Before: `{_clip(current)}`")
            lines.append(f"  - After:  `{_clip(improved)}`")
        if explanation := imp.get("explanation"):
            lines.append(f"{explanation}")

    return "\n".join(lines)


def format_tests_output(result: dict) -> str:
    """Return generated test code (plain string for gr.Code)."""
    if not result.get("success"):
        return _error_msg(result)
    return result.get("test_code") or f"**{result.get('model', '?')}**: No tests generated."

## 6. Orchestration

In [None]:
def review_code(
    code: str,
    model_key: str,
    include_tests: bool = True,
) -> tuple[str, str, str, str]:
    """Run a complete code review: bugs, improvements, docstrings, then tests.

    Args:
        code: Python source code to review. ``None`` or blank input is
            answered with a prompt to supply code; no model is called.
        model_key: Model registry key to use for all steps.
        include_tests: Whether to generate unit tests.

    Returns:
        4-tuple of (bugs_md, improvements_md, test_code, documented_code).
        ``test_code`` is an empty string when ``include_tests`` is false.
    """
    # UI components may hand over None for an empty editor; treat it like "".
    if not code or not code.strip():
        empty = "Please provide some Python code to review."
        return empty, empty, "", ""

    bugs_result = detect_bugs(code, model_key)
    improvements_result = suggest_improvements(code, model_key)
    docstring_result = add_docstrings(code, model_key)

    test_code = ""
    if include_tests:
        # Detected issues are passed along so the suite includes regression tests for them.
        tests_result = generate_tests(code, bugs_result.get("issues", []), model_key)
        test_code = format_tests_output(tests_result)

    return (
        format_bugs_output(bugs_result),
        format_improvements_output(improvements_result),
        test_code,
        format_docstring_output(docstring_result),
    )


def compare_models(code: str, model_keys: list[str] | None = None) -> str:
    """Run bug detection across several models and summarise results.

    Args:
        code: Python source code to analyse.
        model_keys: Keys of models to compare. Defaults to all configured models.

    Returns:
        Markdown-formatted comparison report.
    """
    if not code.strip():
        return "Please provide code to review."

    keys = model_keys or list(MODELS.keys())
    results = [detect_bugs(code, k) for k in keys]

    lines = ["##Model Comparison\n"]

    # Per-model summary
    for res in results:
        model_name = res["model"]
        if not res.get("success"):
            lines.append(f"**{model_name}**: {res.get('error', 'Unknown error')}")
            continue

        issues = res.get("issues", [])
        severity_counts: dict[str, int] = {}
        for issue in issues:
            sev = issue.get("severity", "low")
            severity_counts[sev] = severity_counts.get(sev, 0) + 1

        breakdown = ", ".join(f"{k}: {v}" for k, v in sorted(severity_counts.items()))
        summary = f"({breakdown})" if breakdown else "(no issues)"
        lines.append(f"**{model_name}**: {len(issues)} issue(s) {summary}")

    # Consensus issues â€” found by 2+ models on the same line
    if len(results) > 1:
        sig_to_models: dict[str, list[str]] = {}
        for res in results:
            if not res.get("success"):
                continue
            for issue in res.get("issues", []):
                sig = f"{issue.get('line')}-{issue.get('issue', '')[:50]}"
                sig_to_models.setdefault(sig, []).append(res["model"])

        consensus = [(sig, ms) for sig, ms in sig_to_models.items() if len(ms) > 1]
        if consensus:
            lines.append(f"\n###Consensus Issues ({len(consensus)} agreed by 2+ models)")
            for sig, ms in consensus:
                lines.append(f"  - `{sig}` â€” flagged by: {', '.join(ms)}")
        else:
            lines.append("\n*No consensus issues â€” models found different problems.*")

    return "\n".join(lines)

## 7. Gradio UI

In [None]:
# Default snippet pre-loaded into the code editor: four small functions with
# deliberate flaws (division by zero, eval, SQL built by string formatting).
EXAMPLE_CODE = '''\
def divide_numbers(a, b):
    return a / b  # ZeroDivisionError if b == 0


def process_user_data(user_input):
    # SECURITY: eval executes arbitrary code — never use with untrusted input
    result = eval(user_input)
    return result


def get_user_by_id(user_id):
    # SQL injection: user_id is inserted directly into the query string
    query = f"SELECT * FROM users WHERE id = {user_id}"
    return query


def calculate_average(numbers):
    total = sum(numbers)
    return total / len(numbers)  # ZeroDivisionError if numbers is empty
'''

# Extra example snippet offered in the UI: naive doubly-recursive Fibonacci.
FIBONACCI_EXAMPLE = '''\
def fibonacci(n):
    if n <= 1:
        return n
    return fibonacci(n - 1) + fibonacci(n - 2)
'''

# Extra example snippet offered in the UI: a config loader that eval()s file contents.
CONFIG_EXAMPLE = '''\
def parse_config(file_path):
    with open(file_path) as f:
        return eval(f.read())
'''


def create_ui() -> gr.Blocks:
    """Build and return the Gradio interface.

    The left column holds the code editor, model dropdown, test checkbox,
    action buttons and example snippets. The right column holds one tab per
    result type. "Review Code" runs ``review_code`` with the selected model;
    "Compare All Models" runs ``compare_models`` over every configured model.

    Returns:
        The assembled (not yet launched) ``gr.Blocks`` app.
    """
    # Dropdown entries are (label, value) pairs; the value is the registry key.
    model_choices = [(f"{cfg['name']} — {cfg['description']}", key) for key, cfg in MODELS.items()]
    default_model = next(iter(MODELS), None)

    with gr.Blocks(
        title="AI Code Review Assistant",
        theme=gr.themes.Soft(),
        css=".tab-nav { font-size: 1rem !important; }",
    ) as demo:
        gr.Markdown(
            """
# AI-Powered Code Review Assistant
Analyse Python code with open-source LLMs — detect bugs, suggest improvements, generate tests & docstrings.
"""
        )

        with gr.Row(equal_height=False):
            # ── Left panel: inputs ──────────────────────────────────────────
            with gr.Column(scale=2, min_width=360):
                code_input = gr.Code(
                    label="Python Code",
                    value=EXAMPLE_CODE,
                    language="python",
                    lines=22,
                )

                model_selector = gr.Dropdown(
                    choices=model_choices,
                    value=default_model,
                    label="Model",
                    info="Open-source model via OpenRouter",
                    interactive=bool(MODELS),
                )

                include_tests = gr.Checkbox(label="Generate unit tests", value=True)

                with gr.Row():
                    review_btn = gr.Button("Review Code", variant="primary", scale=2)
                    compare_btn = gr.Button("Compare All Models", variant="secondary", scale=1)

                gr.Examples(
                    examples=[[EXAMPLE_CODE], [FIBONACCI_EXAMPLE], [CONFIG_EXAMPLE]],
                    inputs=[code_input],
                    label="Example snippets",
                )

            # ── Right panel: results ────────────────────────────────────────
            with gr.Column(scale=3, min_width=480):
                with gr.Tabs():
                    with gr.Tab("Bugs"):
                        bugs_output = gr.Markdown(
                            value="*Select a model and click **Review Code** to begin.*"
                        )
                    with gr.Tab("Improvements"):
                        improvements_output = gr.Markdown(
                            value="*Improvement suggestions will appear here.*"
                        )
                    with gr.Tab("Tests"):
                        tests_output = gr.Code(
                            label="Generated pytest suite",
                            language="python",
                            lines=28,
                        )
                    with gr.Tab("Docstrings"):
                        docstring_output = gr.Code(
                            label="Documented code",
                            language="python",
                            lines=28,
                        )
                    with gr.Tab("Comparison"):
                        comparison_output = gr.Markdown(
                            value="*Click **Compare All Models** to see a side-by-side analysis.*"
                        )

        # Output order must match review_code's return tuple:
        # (bugs_md, improvements_md, test_code, documented_code).
        review_btn.click(
            fn=review_code,
            inputs=[code_input, model_selector, include_tests],
            outputs=[bugs_output, improvements_output, tests_output, docstring_output],
        )

        # The lambda pins compare_models to its default model_keys (all models).
        compare_btn.click(
            fn=lambda code: compare_models(code),
            inputs=[code_input],
            outputs=[comparison_output],
        )

    return demo


# Build the interface and start the Gradio server. inbrowser=True asks Gradio
# to open the app in a browser tab; share=False requests no public share link.
demo = create_ui()
demo.launch(inbrowser=True, share=False)

## 8. Quick Notebook Test

In [None]:
# Sanity-check: run bug detection on a trivial snippet with the first
# registered model and print the rendered Markdown.
if MODELS:
    _test_code = """\
def divide(a, b):
    return a / b
"""
    _result = detect_bugs(_test_code, next(iter(MODELS)))
    print(format_bugs_output(_result))
else:
    print("No models configured — skipping test.")