## Week 4 Day 5: LLM Coding Challenge Arena

**Workflow:**
1. Submit a coding problem
2. Two selected LLMs each generate a solution
3. Each LLM generates unit tests for its own solution
4. Each solution is executed against its tests and scored on pass rate (%), runtime, and Pylint quality
5. A winner banner is displayed based on the composite score

In [None]:
import gradio as gr
from helper_functions import (
    generate_code,
    generate_unit_tests,
    parse_tests,
    evaluate_solution,
    display_metrics,
    display_winner,
)

In [None]:
# Competitor catalogue. "name" is the label offered in the two dropdowns;
# "slug" is the identifier handed to generate_code / generate_unit_tests.
MODELS = [
    {"name": display_name, "slug": slug}
    for display_name, slug in (
        ("GPT-4.1 Mini", "openai/gpt-4.1-mini"),
        ("Claude 3.5 Sonnet", "anthropic/claude-3.5-sonnet"),
        ("GPT-4o Mini", "openai/gpt-4o-mini"),
        ("Gemini 2.5 Pro", "google/gemini-2.5-pro"),
        ("GPT-oss-20b", "openai/gpt-oss-20b"),
        ("Qwen3.5 Plus 2026-02-15", "qwen/qwen3.5-plus-02-15"),
    )
]

In [None]:
def blank_state():
    """Return the idle value for every component in ``challenge_outputs``.

    The ten entries are positional and must stay in the same order as
    ``challenge_outputs``: the status message, then for each of the two
    competitors an accordion update (hidden) followed by empty code, tests
    and metrics strings, and finally an empty winner banner.

    Returns:
        A fresh list on every call, so callers may overwrite entries in place.
    """
    return [
        # Status line (the dash was previously stored mis-encoded).
        "Ready — enter a problem and select two LLMs to compete.",
        # Competitor 1: accordion hidden, code / tests / metrics cleared.
        gr.update(visible=False),
        "",
        "",
        "",
        # Competitor 2: accordion hidden, code / tests / metrics cleared.
        gr.update(visible=False),
        "",
        "",
        "",
        # Winner banner.
        "",
    ]

In [None]:
def _lookup_slug(model_name):
    """Return the slug of the first MODELS entry named *model_name*, or None."""
    return next((m["slug"] for m in MODELS if m["name"] == model_name), None)


def _build_result(model_name, slug, code, test_display, evaluation):
    """Bundle one competitor's artefacts and its evaluate_solution() tuple into a dict."""
    pass_pct, runtime_ms, pylint, error = evaluation
    return {
        "model_name": model_name, "slug": slug,
        "code": code, "tests": test_display,
        "pass_pct": pass_pct, "runtime_ms": runtime_ms,
        # An infinite runtime is shown as "N/A" rather than as a number.
        "runtime_ms_str": f"{runtime_ms:.1f}" if runtime_ms != float("inf") else "N/A",
        "pylint": pylint, "error": error,
    }


def run_challenge(problem: str, model1_name: str, model2_name: str):
    """Run both competitors through generate -> test -> evaluate, streaming progress.

    This is a generator: each yielded list has ten entries in the order of
    ``challenge_outputs`` (status, then accordion update / code / tests /
    metrics for competitor 1 and for competitor 2, then the winner banner).
    The banner stays empty on every frame except the final one.

    Competitor 1's tests are requested with ``focus="edge"`` and competitor
    2's with ``focus="typical"``. If either competitor produces no usable
    tests the run stops after reporting that failure.

    Args:
        problem: The coding problem text entered by the user.
        model1_name: Display name of the first competitor (a MODELS "name").
        model2_name: Display name of the second competitor (a MODELS "name").

    Yields:
        Lists of ten output values, one per UI update.
    """
    # Guard against both an empty string and a missing (None) value.
    if not problem or not problem.strip():
        out = blank_state()
        out[0] = "⚠️ Please enter a coding problem first."
        yield out
        return

    # A bare next() would raise StopIteration for an unknown name, which a
    # generator turns into an opaque RuntimeError; report it to the user instead.
    model1_slug = _lookup_slug(model1_name)
    model2_slug = _lookup_slug(model2_name)
    missing = [
        name
        for name, slug in ((model1_name, model1_slug), (model2_name, model2_slug))
        if slug is None
    ]
    if missing:
        out = blank_state()
        out[0] = "⚠️ Unknown model selection: " + ", ".join(repr(name) for name in missing)
        yield out
        return

    results = []

    def mid(msg, *, r1=False, c1="", t1="", m1="", r2=False, c2="", t2="", m2=""):
        """Intermediate frame — keeps the banner empty until the final frame."""
        return [msg, gr.update(visible=r1), c1, t1, m1, gr.update(visible=r2), c2, t2, m2, ""]

    # ---- Competitor 1 ----
    yield mid(f"⏳ {model1_name} is generating solution...")
    code1 = generate_code(model1_slug, problem)

    yield mid(f"⏳ {model1_name} is generating unit tests...")
    tests1, test_display1 = parse_tests(
        generate_unit_tests(problem, code1, model1_slug, focus="edge")
    )

    if not tests1:
        yield mid(f"❌ {model1_name} failed to generate valid tests.",
                   r1=True, c1=code1, t1=test_display1, m1="Failed to generate valid tests")
        return

    yield mid(f"⏳ Evaluating {model1_name} solution...")
    result1 = _build_result(
        model1_name, model1_slug, code1, test_display1, evaluate_solution(code1, tests1)
    )
    results.append(result1)
    metrics1 = display_metrics(result1)

    # ---- Competitor 2 (competitor 1's panel stays visible from here on) ----
    yield mid(f"⏳ {model2_name} is generating solution...",
               r1=True, c1=code1, t1=test_display1, m1=metrics1)
    code2 = generate_code(model2_slug, problem)

    yield mid(f"⏳ {model2_name} is generating unit tests...",
               r1=True, c1=code1, t1=test_display1, m1=metrics1)
    tests2, test_display2 = parse_tests(
        generate_unit_tests(problem, code2, model2_slug, focus="typical")
    )

    if not tests2:
        yield mid(f"❌ {model2_name} failed to generate valid tests.",
                   r1=True, c1=code1, t1=test_display1, m1=metrics1,
                   r2=True, c2=code2, t2=test_display2, m2="Failed to generate valid tests")
        return

    yield mid(f"⏳ Evaluating {model2_name} solution...",
               r1=True, c1=code1, t1=test_display1, m1=metrics1)
    result2 = _build_result(
        model2_name, model2_slug, code2, test_display2, evaluate_solution(code2, tests2)
    )
    results.append(result2)
    metrics2 = display_metrics(result2)

    # ---- Final frame: both panels plus the winner banner ----
    n_pass = sum(1 for r in results if r["pass_pct"] == 100)
    yield [
        f"✅ Challenge complete! {n_pass}/2 solutions passed all tests.",
        gr.update(visible=True), code1, test_display1, metrics1,
        gr.update(visible=True), code2, test_display2, metrics2,
        display_winner(results),
    ]


def clear_challenge():
    """Reset every challenge output and also empty the problem textbox.

    The Clear button writes to ``challenge_outputs + [problem_in]``, so the
    idle state is extended by one trailing empty string for the textbox.
    """
    cleared = list(blank_state())
    cleared.append("")
    return cleared

In [None]:
# Gradio UI
# Read the stylesheet inside a context manager so the file handle is closed
# promptly, and decode it explicitly rather than with the locale default.
with open("styles.css", encoding="utf-8") as css_file:
    CSS = css_file.read()

with gr.Blocks(title="LLM Coding Challenge Arena", css=CSS) as demo:

    gr.HTML("""
    <div id="arena-header">
      <h1>🏆 LLM Coding Challenge Arena</h1>
      <p style="color:#64748b">Two LLMs compete · Each generates solution + unit tests</p>
    </div>
    """)

    # Inputs: the problem statement (with clickable examples) and the two competitors.
    with gr.Row():
        with gr.Column(scale=3):
            problem_in = gr.Textbox(
                label="Coding Problem",
                placeholder="e.g. Write a function to return the nth Fibonacci number",
                lines=3,
            )
            gr.Examples(
                examples=[
                    ["Given an integer x, return true if x is a palindrome, and false otherwise."],
                    ["Given an integer array nums, return all the triplets [nums[i], nums[j], nums[k]] such that i != j, i != k, and j != k, and nums[i] + nums[j] + nums[k] == 0.\n\nNotice that the solution set must not contain duplicate triplets."],
                    ["You are given two non-empty linked lists representing two non-negative integers. The digits are stored in reverse order, and each of their nodes contains a single digit. Add the two numbers and return the sum as a linked list.\n\nYou may assume the two numbers do not contain any leading zero, except the number 0 itself"],
                ],
                inputs=problem_in,
                label="Examples (Click to load)"
            )
        with gr.Column(scale=2):
            model1_sel = gr.Dropdown(
                choices=[m["name"] for m in MODELS], value=MODELS[0]["name"], label="LLM Competitor 1"
            )
            model2_sel = gr.Dropdown(
                choices=[m["name"] for m in MODELS], value=MODELS[1]["name"], label="LLM Competitor 2"
            )

    with gr.Row():
        gen_btn   = gr.Button("🚀 Start Challenge", variant="primary",   size="lg")
        clear_btn = gr.Button("🗑️ Clear",           variant="secondary", size="lg")

    status_out        = gr.Markdown("Ready — enter a problem and select two LLMs to compete.")
    winner_banner_out = gr.HTML("")

    gr.Markdown("---\n## Competitor Solutions")

    # Both competitor panels start hidden; run_challenge reveals them as results arrive.
    model1_accordion = gr.Accordion("🤖 Competitor 1", open=True, visible=False)
    with model1_accordion:
        model1_code_out    = gr.Code(language="python", label="Solution")
        model1_test_out    = gr.Code(language="python", label="Unit Tests  〔focus: edge cases & boundaries〕")
        model1_metrics_out = gr.Markdown()

    model2_accordion = gr.Accordion("🤖 Competitor 2", open=True, visible=False)
    with model2_accordion:
        model2_code_out    = gr.Code(language="python", label="Solution")
        model2_test_out    = gr.Code(language="python", label="Unit Tests  〔focus: typical use cases & breadth〕")
        model2_metrics_out = gr.Markdown()

    # Order matters: run_challenge and blank_state return values positionally
    # in exactly this sequence.
    challenge_outputs = [
        status_out,
        model1_accordion, model1_code_out, model1_test_out, model1_metrics_out,
        model2_accordion, model2_code_out, model2_test_out, model2_metrics_out,
        winner_banner_out,
    ]

    gen_btn.click(fn=run_challenge, inputs=[problem_in, model1_sel, model2_sel], outputs=challenge_outputs)
    # Clear also resets the problem textbox, hence the extra output.
    clear_btn.click(fn=clear_challenge, inputs=[], outputs=challenge_outputs + [problem_in])

    demo.launch(inbrowser=True)