# Week 4: Reasoning & Code Benchmark System

Benchmark **3 models** on difficult, LLM-generated questions in either **Reasoning** or **Code** mode. Select the models with checkboxes, run the benchmark to evaluate every selected model, then view the combined results and rankings. Runs both locally and on Colab.

In [None]:
!pip install -q gradio python-dotenv openai pandas

In [None]:
import os, time, re
from dotenv import load_dotenv
from openai import OpenAI
import gradio as gr
import pandas as pd

In [None]:
# --- Runtime detection, API clients and model catalogue ---------------------
# Colab is detected only by whether the Colab-only module can be imported.
# The key lookup is a separate step: if it raises, IN_COLAB stays True (so the
# Colab model list, which has no local Ollama entry, is still used) and the
# key is looked up in the environment instead.
try:
    from google.colab import userdata
    IN_COLAB = True
except Exception:  # not on Colab, or the Colab package is unusable here
    IN_COLAB = False

api_key = None
if IN_COLAB:
    try:
        api_key = userdata.get("OPENROUTER_API_KEY")
    except Exception:
        # Lookup failed; fall through to the .env / environment lookup below.
        api_key = None
if not api_key:
    load_dotenv(override=True)
    api_key = os.getenv("OPENROUTER_API_KEY")

# `openrouter` is None when no key was found; callers must check before use.
openrouter = OpenAI(api_key=api_key, base_url="https://openrouter.ai/api/v1") if api_key else None
# Local Ollama server through its OpenAI-compatible endpoint; the key is a placeholder.
ollama = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama")
# Model used to write the benchmark questions.
QUESTION_GEN_MODEL = "openai/gpt-4o"

# Each entry is (UI label, model id sent to the backend, backend name).
if IN_COLAB:
    MODEL_CHOICES = [
        ("gpt-4o-mini", "openai/gpt-4o-mini", "openrouter"),
        ("Gemma 2 9B", "google/gemma-2-9b-it:free", "openrouter"),
        ("Llama 3.1 8B", "meta-llama/llama-3.1-8b-instruct:free", "openrouter"),
    ]
else:
    MODEL_CHOICES = [
        # Provider-prefixed id, matching every other OpenRouter id in this file
        # (this entry previously used the bare "gpt-4o-mini").
        ("gpt-4o-mini (OpenRouter)", "openai/gpt-4o-mini", "openrouter"),
        ("llama3.2 (Ollama)", "llama3.2", "ollama"),
        ("Gemma 2 9B (OpenRouter)", "google/gemma-2-9b-it:free", "openrouter"),
    ]
def get_client(b):
    """Return the API client for backend name ``b``.

    ``"openrouter"`` selects the OpenRouter client, which is ``None`` when no
    API key was found; any other value selects the local Ollama client.
    """
    if b == "openrouter":
        return openrouter
    return ollama

In [None]:
# Benchmark categories the UI lets the user choose between.
BENCHMARK_MODES = ["reasoning", "code"]
# Questions generated per round; also the cap applied when parsing pasted questions.
NUM_QUESTIONS = 3
# One result record is appended here for every model that gets benchmarked.
benchmark_store = []
# System prompt that keeps the question generator to a single bare question.
Q_SYSTEM = "Output only one question. No examples, no preamble, no explanation. Just the question, ending with ?"

def _extract_one_question(raw: str) -> str:
    raw = raw.strip()
    if not raw or len(raw) < 10:
        return ""
    idx = raw.find("?")
    if idx != -1:
        return raw[: idx + 1].strip()
    return raw[:400].strip()

def _gen_one_question(mode: str) -> str:
    """Ask the question-generator model for one question in the given mode.

    ``"reasoning"`` requests a logic/maths question; any other mode requests a
    programming question. Returns an empty string when no OpenRouter client is
    configured or when the request fails for any reason.
    """
    if not openrouter:
        return ""
    reasoning_prompt = "One difficult logical or mathematical reasoning question only. No code."
    code_prompt = "One difficult programming or algorithm question only. No general logic puzzles."
    prompt = reasoning_prompt if mode == "reasoning" else code_prompt
    conversation = [
        {"role": "system", "content": Q_SYSTEM},
        {"role": "user", "content": prompt},
    ]
    try:
        response = openrouter.chat.completions.create(
            model=QUESTION_GEN_MODEL,
            messages=conversation,
            max_tokens=256,
        )
        content = response.choices[0].message.content or ""
        return _extract_one_question(content.strip())
    except Exception:
        # Best effort: the caller treats "" as "no question produced".
        return ""

def generate_synthetic_questions(mode, n=NUM_QUESTIONS):
    """Collect up to ``n`` distinct generated questions for ``mode``.

    At most ``2 * n`` generation attempts are made; empty results and
    duplicates are discarded. If nothing usable was produced, a single
    placeholder message starting with "No question" is returned instead.
    """
    collected = []
    for _ in range(2 * n):
        if len(collected) >= n:
            break
        candidate = _gen_one_question(mode)
        if not candidate or candidate in collected:
            continue
        collected.append(candidate)
    if collected:
        return collected
    return [f"No question for {mode}. Set OPENROUTER_API_KEY and try again."]
def get_mid_back(lbl):
    """Map a UI label to its ``(model_id, backend)`` pair.

    Unknown labels fall back to the first entry of ``MODEL_CHOICES``.
    """
    by_label = {}
    for label, model_id, backend in MODEL_CHOICES:
        by_label[label] = (model_id, backend)
    fallback = MODEL_CHOICES[0][1:]
    if lbl in by_label:
        return by_label[lbl]
    return fallback
def run_model(client, mid, qs, mode):
    """Send every question in ``qs`` to model ``mid`` and time the whole batch.

    Args:
        client: OpenAI-compatible client exposing ``chat.completions.create``.
        mid: Model identifier passed to the backend.
        qs: Iterable of question strings.
        mode: Accepted for signature compatibility; not used here.

    Returns:
        ``(answers, elapsed_seconds)``. A failed request contributes the
        string ``"[Error: <exception>]"`` in place of an answer.
    """
    system_prompt = "Answer concisely. Reasoning: clear steps. Code: working code + brief explanation."

    def _ask(question):
        # One request per question; failures are recorded, not raised.
        try:
            reply = client.chat.completions.create(
                model=mid,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": question},
                ],
                max_tokens=1024,
            )
            return (reply.choices[0].message.content or "").strip()
        except Exception as exc:
            return f"[Error: {exc}]"

    started = time.perf_counter()
    answers = [_ask(question) for question in qs]
    return answers, time.perf_counter() - started
def _parse_score(raw):
    """Return the first number found in ``raw`` clamped to 1-10, or 5.0 if there is none."""
    # Take only the FIRST number: gluing all digits together turned "8/10"
    # into 810 (clamped to 10) and made "7.5/10." unparseable.
    match = re.search(r"\d+(?:\.\d+)?|\.\d+", raw)
    if match is None:
        return 5.0
    return max(1.0, min(10.0, float(match.group())))


def score_answers(qs, ans, mode, client, mid):
    """Ask a judge model to grade each answer on a 1-10 scale.

    Args:
        qs: Question strings.
        ans: Answer strings, paired with ``qs`` by position.
        mode: Accepted for signature compatibility; not used here.
        client: OpenAI-compatible client exposing ``chat.completions.create``.
        mid: Identifier of the model used as judge.

    Returns:
        A list of float scores in the range 1-10, one per (question, answer)
        pair. A neutral 5.0 is used when the request fails or the reply
        contains no number.
    """
    rub = "Score correctness/clarity 1-10. Reply with one number."
    scores = []
    for q, a in zip(qs, ans):
        try:
            r = client.chat.completions.create(
                model=mid,
                messages=[
                    {"role": "system", "content": rub},
                    {"role": "user", "content": f"Q: {q}\nA: {a}"},
                ],
                max_tokens=10,
            )
            scores.append(_parse_score(r.choices[0].message.content or ""))
        except Exception:
            # Best effort: one failed grading call must not abort the benchmark.
            scores.append(5.0)
    return scores

In [None]:
def _clip(text, limit):
    """Return ``text`` cut to ``limit`` characters, adding "..." only when something was cut."""
    if len(text) <= limit:
        return text
    return text[:limit] + "..."


def run_benchmark_single(mode, model_label, questions):
    """Benchmark one model on ``questions`` and record the result.

    Args:
        mode: Benchmark mode label stored with the result.
        model_label: UI label identifying the model to run.
        questions: List of question strings.

    Returns:
        ``(None, None)`` when there are no usable questions,
        ``(message, None)`` when no client exists for the model's backend,
        otherwise ``(markdown, metrics)`` where ``metrics`` holds
        ``avg_score``, ``accuracy_7`` (percentage of scores >= 7),
        ``time_sec`` and ``n``. On success a record is also appended to
        ``benchmark_store``.
    """
    if not questions or questions[0].startswith("No question"):
        return None, None
    mid, back = get_mid_back(model_label)
    client = get_client(back)
    if not client:
        return f"[{model_label}] No client (set OPENROUTER_API_KEY or Ollama).", None
    ans, elapsed = run_model(client, mid, questions, mode)
    # NOTE(review): the model being benchmarked also grades its own answers
    # (same client and model id) - consider an independent judge model.
    scores = score_answers(questions, ans, mode, client, mid)
    avg = sum(scores) / len(scores) if scores else 0
    acc = sum(1 for s in scores if s >= 7) / len(scores) * 100 if scores else 0
    m = {"avg_score": round(avg, 2), "accuracy_7": round(acc, 1), "time_sec": round(elapsed, 2), "n": len(questions)}
    # `**m` comes last, so the rounded time_sec replaces the raw value.
    benchmark_store.append({"mode": mode, "model_name": model_label, "questions": questions, "answers": ans, "scores": scores, "time_sec": elapsed, **m})
    # Previews are clipped for display; the ellipsis appears only on truncated text.
    md = "\n\n---\n\n".join(
        f"**Q{i+1}:** {_clip(q, 80)}\n**A:** {_clip(a, 200)}\n*Score: {s}*"
        for i, (q, a, s) in enumerate(zip(questions, ans, scores))
    )
    return md, m

def run_benchmark_all(mode, model_labels, questions):
    """Benchmark every selected model and merge the per-model reports.

    Returns ``(markdown, status)``. Missing questions or an empty model
    selection produce an explanatory message pair without running anything.
    """
    if not questions or questions[0].startswith("No question"):
        return "Generate questions first.", "No questions"
    if not model_labels:
        return "Select at least one model (checkboxes).", "No models selected"

    sections, statuses = [], []
    for label in model_labels:
        body, metrics = run_benchmark_single(mode, label, questions)
        if metrics is not None:
            # Full result: metric header followed by the per-question details.
            header = (
                f"**Avg score:** {metrics['avg_score']} | "
                f"**Accuracy (≥7):** {metrics['accuracy_7']}% | "
                f"**Time:** {metrics['time_sec']}s"
            )
            sections.append(f"### {label}\n{header}\n\n{body}")
            statuses.append(f"{label}: {metrics['avg_score']}")
        elif body is not None:
            # The model could not be run; surface the explanation as its section.
            sections.append(f"### {label}\n{body}")
            statuses.append(f"{label}: skipped")
        # Neither body nor metrics: nothing to report for this model.
    return "\n\n---\n\n".join(sections), "Done. " + ", ".join(statuses)
def get_table():
    """Build the ranking table from everything in ``benchmark_store``.

    Rows are ordered by average score, highest first, and numbered from 1 in
    a leading "Rank" column. An empty store yields an empty frame that has
    the same columns.
    """
    if not benchmark_store:
        empty_columns = ["Rank", "Model", "Mode", "Avg Score", "Accuracy (≥7)", "Time (s)", "N"]
        return pd.DataFrame(columns=empty_columns)
    rows = []
    for rec in benchmark_store:
        rows.append(
            {
                "Model": rec["model_name"],
                "Mode": rec["mode"],
                "Avg Score": rec["avg_score"],
                "Accuracy (≥7)": f"{rec['accuracy_7']}%",
                "Time (s)": rec["time_sec"],
                "N": rec["n"],
            }
        )
    ranked = pd.DataFrame(rows).sort_values("Avg Score", ascending=False).reset_index(drop=True)
    ranked.insert(0, "Rank", range(1, len(ranked) + 1))
    return ranked
def get_chart_df():
    """Return a ``model`` / ``avg_score`` frame for the bar chart, one row per stored run."""
    if not benchmark_store:
        # Keep both columns present so the plot still has its x/y fields.
        return pd.DataFrame({"model": [], "avg_score": []})
    points = [{"model": rec["model_name"], "avg_score": rec["avg_score"]} for rec in benchmark_store]
    return pd.DataFrame(points)

In [None]:
def ui_gen(mode):
    """Generate questions for ``mode`` and format them as a numbered Markdown list."""
    questions = generate_synthetic_questions(mode, NUM_QUESTIONS)
    numbered = []
    for position, question in enumerate(questions, 1):
        numbered.append(f"**{position}.** " + question)
    # Blank lines between entries; parse_q relies on this layout.
    return "\n\n".join(numbered)

def initial_reasoning_questions():
    """Generate reasoning questions once and return the same text twice.

    The pair feeds two UI outputs: the question display and the question
    input box.
    """
    generated = ui_gen("reasoning")
    return (generated, generated)
def parse_q(text):
    """Split the questions text back into a list of question strings.

    When the text carries the ``**n.**`` markers produced by ``ui_gen``, it is
    split at those markers, so a question that itself contains a blank line
    stays in one piece. Text without markers is split on blank lines. At most
    ``NUM_QUESTIONS`` questions are returned; empty or whitespace-only input
    yields ``[]``.
    """
    if not (text and text.strip()):
        return []
    marker = r"\*\*\d+\.\*\*\s*"
    if re.search(marker, text):
        pieces = re.split(marker, text)
    else:
        # The former per-line fallback was unreachable: if no blank-line block
        # has content, no single line has content either.
        pieces = text.split("\n\n")
    questions = [piece.strip() for piece in pieces if piece.strip()]
    return questions[:NUM_QUESTIONS]
def ui_run(mode, selected_models, text):
    """Handle the run button: parse the questions, then benchmark the selected models.

    Returns ``(answers_markdown, status)``. When no question can be parsed,
    a prompt to supply questions is returned with an empty status.
    """
    questions = parse_q(text)
    if questions:
        # A missing selection is passed on as an empty list.
        return run_benchmark_all(mode, selected_models or [], questions)
    return "Paste/generate questions first.", ""
def ui_refresh():
    """Rebuild the ranking table and the chart data from the stored results."""
    ranking = get_table()
    chart = get_chart_df()
    return ranking, chart

In [None]:
# Gradio UI: three tabs (generate questions, run the benchmark, view rankings).
with gr.Blocks(title="Reasoning & Code Benchmark", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Reasoning & Code Benchmark (contest: 3 models)")
    gr.Markdown("Generate questions → select models (checkboxes) → run benchmark for all selected → view answers and Performance.")
    with gr.Tabs():
        # Tab 1: generate questions for the chosen mode and show them as Markdown.
        with gr.TabItem("1. Contest"):
            mode_c = gr.Radio(choices=BENCHMARK_MODES, value="reasoning", label="Mode")
            q_out = gr.Markdown(label="Questions")
            gr.Button("Generate questions").click(fn=ui_gen, inputs=[mode_c], outputs=[q_out])
        # Tab 2: pick models, supply questions, run the benchmark.
        with gr.TabItem("2. Run & Answers"):
            # This mode radio is independent of the one in tab 1; its value is
            # what gets passed to ui_run and stored with the results.
            mode_r = gr.Radio(choices=BENCHMARK_MODES, value="reasoning", label="Mode")
            model_labels = [l for l, _, _ in MODEL_CHOICES]
            # All models are pre-selected.
            model_checkboxes = gr.CheckboxGroup(choices=model_labels, value=model_labels, label="Models to benchmark (select 1–3)")
            # "Generate questions" only updates q_out in tab 1, so questions
            # generated there have to be pasted into this box.
            q_in = gr.Textbox(placeholder="Paste questions from Contest (or use auto-loaded reasoning questions)", lines=10, label="Questions")
            btn_run = gr.Button("Run benchmark (all selected models)")
            ans_out, status_out = gr.Markdown(label="Answers (all models)"), gr.Textbox(label="Status", interactive=False)
            btn_run.click(fn=ui_run, inputs=[mode_r, model_checkboxes, q_in], outputs=[ans_out, status_out])
        # Tab 3: ranking table and bar chart built from benchmark_store.
        with gr.TabItem("3. Performance"):
            gr.Markdown("Ranking & metrics for **all models** from every benchmark run.")
            # The table has no initial value; it is filled when "Refresh" is clicked.
            perf_t = gr.Dataframe(label="Ranking", interactive=False)
            # The chart's initial data is computed once, when the UI is built.
            perf_c = gr.BarPlot(get_chart_df(), x="model", y="avg_score", title="Avg score by model", vertical=False)
            gr.Button("Refresh").click(fn=ui_refresh, inputs=[], outputs=[perf_t, perf_c])
    # Load event: generate reasoning questions and place the same text in both
    # the tab-1 display and the tab-2 input box.
    # NOTE(review): this calls the question-generator model each time the load
    # event fires - confirm that cost is acceptable.
    demo.load(fn=initial_reasoning_questions, inputs=[], outputs=[q_out, q_in])
demo.launch()