In [None]:
import os
import json
import re
import time
import traceback
from pathlib import Path

from dotenv import load_dotenv
import gradio as gr
import ollama
from openai import OpenAI


In [None]:
load_dotenv()

# Credentials are read once, right after loading .env.
# HF_TOKEN may be empty at this point; call_hf_free() checks it before making a request.
HF_TOKEN = os.getenv("HF_TOKEN", "")

# The OpenAI key is needed by both the judge and the nano contender, so a missing key
# is reported here with a clear message. Do not default it to an empty string in
# os.environ: that can hide the problem until a request is actually made.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError("OPENAI_API_KEY is not set in your environment.")

# The judge should be stable and reasonably strong; this is the one that decides the winner.
OPENAI_JUDGE_MODEL = "gpt-4o-mini"

# Nano is a third contender generator; it competes against local Llama and HF free.
OPENAI_NANO_MODEL = "gpt-4.1-nano"

# Local generator model served by Ollama; keep the tag explicit to avoid accidental model drift.
LLAMA_MODEL = "llama3.1:8b"

# Free remote generator via HF router; this is server-side inference with quota/rate limits.
HF_FREE_CHAT_MODEL = "HuggingFaceTB/SmolLM3-3B:hf-inference"
HF_ROUTER_BASE_URL = "https://router.huggingface.co/v1"

# Two clients: OpenAI for judge + nano (key taken from the environment),
# and an OpenAI-compatible client pointed at the HF router for the free contender.
openai = OpenAI()
hf_client = OpenAI(base_url=HF_ROUTER_BASE_URL, api_key=HF_TOKEN)


In [None]:
def notebook_code_extractor(path: str) -> str:
    """Return the source of every code cell in a notebook as a single string.

    Args:
        path: Filesystem path of a ``.ipynb`` file (read as UTF-8 JSON).

    Returns:
        The code cells' sources in notebook order, separated by a blank line,
        with leading/trailing whitespace stripped. Non-code cells are skipped.
    """
    notebook = json.loads(Path(path).read_text(encoding="utf-8"))

    # Each cell's "source" is joined into one string; a cell without a
    # "source" key contributes an empty string.
    code_cells = [
        "".join(cell.get("source", []))
        for cell in notebook.get("cells", [])
        if cell.get("cell_type") == "code"
    ]

    # Blank line between cells keeps function boundaries readable in the prompt.
    return "\n\n".join(code_cells).strip()


In [None]:
# Keep prompts stable; otherwise the judge is comparing outputs produced under different instructions.

# System prompt for the documentation task: asks for docstrings and short comments,
# and for a code-only reply.
system_message_comments = (
    "You are a senior developer. Improve the code documentation by adding docstrings and short, useful comments. "
    "Keep it natural and practical. Do not over-comment obvious lines. "
    "Reply with code only."
)

# System prompt for the summary task: asks for a plain-text description
# (no code, no Markdown) of what the code does.
system_message_summary = (
    "You are a senior developer. Summarize the code clearly: what it does, overall flow, inputs/outputs, and key points. "
    "Do not show the code. Do not use Markdown. Reply with plain text only."
)

def user_prompt_for(code: str) -> str:
    """Build the user message for the documentation task.

    A fixed instruction is placed before the code, separated by a blank line,
    so every request is phrased identically.
    """
    instruction = "Add docstrings and helpful comments. Reply with code only."
    return "\n\n".join((instruction, code))

def user_prompt_for_summary(code: str) -> str:
    """Build the user message for the summary task.

    Kept separate from the documentation prompt so the summary request
    carries no instruction to reply with code.
    """
    header = "Summarize this code.\n\n"
    return "".join((header, code))

def messages_for(code: str):
    """Assemble the chat messages for the documentation task.

    Returns a two-element list: the system prompt followed by the user
    prompt built from ``code``.
    """
    system_turn = {"role": "system", "content": system_message_comments}
    user_turn = {"role": "user", "content": user_prompt_for(code)}
    return [system_turn, user_turn]

def messages_for_summary(code: str):
    """Assemble the chat messages for the summary task.

    Returns a two-element list: the summary system prompt followed by the
    user prompt built from ``code``.
    """
    turns = (
        ("system", system_message_summary),
        ("user", user_prompt_for_summary(code)),
    )
    return [{"role": role, "content": content} for role, content in turns]


In [None]:
def call_llama_local(code: str):
    """Run both tasks (documentation, then summary) on the local Ollama model.

    Returns:
        A ``(commented_code, summary)`` tuple of the two replies' message content.
    """
    # Both requests are issued before either reply is unpacked.
    replies = [
        ollama.chat(model=LLAMA_MODEL, messages=build_messages(code))
        for build_messages in (messages_for, messages_for_summary)
    ]

    # Reduce each reply to its message text so callers get the same shape from every backend.
    documented_reply, summary_reply = replies
    return documented_reply["message"]["content"], summary_reply["message"]["content"]

def call_hf_free(code: str):
    """Run both tasks (documentation, then summary) through the HF router client.

    Returns:
        A ``(commented_code, summary)`` tuple of the two completions' message content.

    Raises:
        RuntimeError: If ``HF_TOKEN`` is empty.
    """
    # Remote call: refuse early when no token is configured.
    if not HF_TOKEN:
        raise RuntimeError("HF_TOKEN is not set in your environment.")

    # Output is capped at 1000 tokens per request to bound latency on large notebooks.
    completions = [
        hf_client.chat.completions.create(
            model=HF_FREE_CHAT_MODEL,
            messages=build_messages(code),
            max_tokens=1000,
        )
        for build_messages in (messages_for, messages_for_summary)
    ]

    documented, summarized = completions
    return documented.choices[0].message.content, summarized.choices[0].message.content

def call_gpt_nano(code: str):
    """Run both tasks (documentation, then summary) on the OpenAI nano model.

    Returns:
        A ``(commented_code, summary)`` tuple of the two completions' message content.
    """
    # Nano is a contender only; judging is done by a separate model.
    create = openai.chat.completions.create
    documented = create(model=OPENAI_NANO_MODEL, messages=messages_for(code))
    summarized = create(model=OPENAI_NANO_MODEL, messages=messages_for_summary(code))
    return (
        documented.choices[0].message.content,
        summarized.choices[0].message.content,
    )


In [None]:
def run_model_once(model_name: str, code: str):
    """Run one contender on ``code`` and time it.

    The backend is chosen from the (case-insensitive, stripped) name prefix:
    ``"llama"`` -> local Ollama, ``"hf"`` -> HF router, anything else
    (including an empty or ``None`` name) -> OpenAI nano.

    Returns:
        ``(commented_code, summary, seconds)`` where ``seconds`` is the
        ``time.perf_counter`` wall-clock duration covering both requests.
    """
    started = time.perf_counter()

    key = (model_name or "").strip().lower()
    if key.startswith("llama"):
        backend = call_llama_local
    elif key.startswith("hf"):
        backend = call_hf_free
    else:
        backend = call_gpt_nano

    commented, summary = backend(code)

    # Elapsed time is the denominator of the score-per-second value used for ranking.
    elapsed = time.perf_counter() - started
    return commented, summary, elapsed


In [None]:
def _extract_score(text: str) -> float:
    # Parsing is intentionally forgiving; we only need the first "score: X" pattern.
    m = re.search(r"\bscore\s*[:=]\s*([0-9]+(?:\.[0-9]+)?)", text, flags=re.IGNORECASE)
    return float(m.group(1)) if m else 0.0

def judge_quality_llm(code: str, commented: str, summary: str) -> str:
    """Ask the judge model to score one contender's output.

    Args:
        code: The original code extracted from the notebook.
        commented: The contender's documented version of the code.
        summary: The contender's plain-text summary.

    Returns:
        The judge's verdict text, requested in the form
        ``score: <number>`` / ``notes: <one paragraph>``. Always a string: if the
        judge reply carries no content, an explicit zero-score verdict is
        returned so callers can still parse and display it.
    """
    # Constrain the judge output tightly so the benchmark remains machine-parsable.
    rubric = (
        "You are a strict code reviewer. Evaluate the assistant output for the given original code.\n"
        "Return a short verdict with a single numeric score from 0 to 10.\n"
        "Criteria (equal weight):\n"
        "1) Correctness: comments/docstrings match what code does (no hallucinations).\n"
        "2) Usefulness: captures intent, assumptions, edge cases, and non-obvious behavior.\n"
        "3) Clarity: readable, consistent, avoids redundant commentary.\n"
        "4) Naturalness: reads like a human developer wrote it.\n"
        "Output format (exact):\n"
        "score: <number>\n"
        "notes: <one paragraph>\n"
    )

    # Feed the judge the original and both outputs; this is the minimum context to score quality.
    payload = (
        "ORIGINAL CODE:\n"
        f"{code}\n\n"
        "COMMENTED CODE:\n"
        f"{commented}\n\n"
        "SUMMARY:\n"
        f"{summary}\n"
    )

    messages = [
        {"role": "system", "content": rubric},
        {"role": "user", "content": payload},
    ]

    # Judge is fixed to one model to avoid a moving target; only contenders vary.
    c = openai.chat.completions.create(model=OPENAI_JUDGE_MODEL, messages=messages, max_tokens=400)

    # message.content is optional in the API response; a None here would break the
    # caller's regex parsing and .strip(), aborting the whole benchmark run.
    verdict = c.choices[0].message.content
    if verdict is None:
        return "score: 0\nnotes: (judge returned no content)"
    return verdict


In [None]:
def benchmark_and_pick_winner(file_obj):
    """Gradio callback: run every contender on the uploaded notebook and pick a winner.

    Args:
        file_obj: The uploaded file as passed by Gradio; its ``.name`` attribute
            is used as the path on disk. ``None`` when nothing was uploaded.

    Returns:
        A 4-tuple ``(report, extracted_code, winner_commented, winner_summary)``.
        On any validation failure or exception the report starts with ``ERROR``
        and the other three elements are empty strings.
    """
    try:
        if file_obj is None:
            return "ERROR: Please upload a .ipynb file.", "", "", ""

        notebook_path = file_obj.name
        if not notebook_path.lower().endswith(".ipynb"):
            return "ERROR: The uploaded file is not a .ipynb notebook.", "", "", ""

        # Extract once so every contender is given exactly the same input.
        code = notebook_code_extractor(notebook_path)
        if not code.strip():
            return "ERROR: No code cells found in the notebook.", "", "", ""

        # Free remote (HF), local (Llama), paid/cheap (Nano), evaluated in this order.
        contenders = ("HF (free)", "Llama (local)", "GPT (nano)")
        outcomes = {}

        for contender in contenders:
            commented, summary, secs = run_model_once(contender, code)

            # Each output is judged on its own, then reduced to a numeric score.
            verdict = judge_quality_llm(code, commented, summary)
            score = _extract_score(verdict)

            outcomes[contender] = {
                "commented": commented,
                "summary": summary,
                "secs": secs,
                "verdict": verdict,
                "score": score,
                # Score per second rewards models that are both good and fast.
                "value": (score / secs) if secs > 0 else 0.0,
            }

        # Rank by value, then by raw score; on a full tie the earliest contender wins.
        winner = max(
            outcomes,
            key=lambda name: (outcomes[name]["value"], outcomes[name]["score"]),
        )
        best = outcomes[winner]

        # One line per contender, then the winner and the judge's verdict for it.
        report_lines = [
            f"{name}: score={outcomes[name]['score']:.2f}, "
            f"time={outcomes[name]['secs']:.3f}s, "
            f"score/time={outcomes[name]['value']:.3f}"
            for name in contenders
        ]
        report_lines += [
            "",
            f"WINNER: {winner}",
            "",
            "Judge verdict (winner):",
            best["verdict"].strip(),
        ]

        return "\n".join(report_lines), code, best["commented"], best["summary"]

    except Exception:
        # The traceback goes to the UI so failures can be debugged without the callback raising.
        return "ERROR:\n" + traceback.format_exc(), "", "", ""


In [None]:
# Background colours for the two winner output boxes; the class names are
# attached to the textboxes via elem_classes below.
css = """
.comments {background-color: #00599C;}
.summary {background-color: #008B8B;}
"""

# Page layout, top to bottom: intro text, upload, run button, benchmark report,
# extracted code, then the winner's documented code and summary side by side.
with gr.Blocks(css=css) as ui:
    gr.Markdown(
        "### Notebook Documentation Tool\n"
        "Upload a notebook, generate docs with three models, and rank them with a GPT-4o-mini judge."
    )

    with gr.Row():
        # Upload is restricted to .ipynb files in the picker.
        nb_file = gr.File(label="Upload .ipynb", file_types=[".ipynb"])

    with gr.Row():
        run_bench = gr.Button("Generate and pick winner")

    with gr.Row():
        report = gr.Textbox(label="Benchmark report", lines=10)

    with gr.Row():
        source_code = gr.Textbox(label="Extracted notebook code (read-only)", lines=14, interactive=False)

    with gr.Row():
        commented_code = gr.Textbox(label="Winner: documented code", lines=14, elem_classes=["comments"])
        code_summary = gr.Textbox(label="Winner: summary", lines=14, elem_classes=["summary"])

    # The four outputs receive, in order, the 4-tuple returned by
    # benchmark_and_pick_winner: report, extracted code, winner code, winner summary.
    run_bench.click(
        benchmark_and_pick_winner,
        inputs=[nb_file],
        outputs=[report, source_code, commented_code, code_summary],
    )

# Start the app; inbrowser=True asks Gradio to open it in a browser tab.
ui.launch(inbrowser=True)
