In [1]:
!pip install -U bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl (60.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 MB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.48.1


In [None]:
import json
import os
import time
from typing import Callable, Dict, List, Optional

import requests

# Prefer the OPENROUTER_API_KEY environment variable so the secret never has to
# be pasted into (and saved with) the notebook. The placeholder is kept only as
# a fallback so the cell still runs when the variable is unset.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY", "{PUT_IN_KEY_HERE}")
# Chat-completions URL that the judge requests are POSTed to.
ENDPOINT = "https://openrouter.ai/api/v1/chat/completions"

# System-prompt template for the judge. `{topic}` is filled in with str.format,
# so the literal braces of the JSON example are doubled. The rubric is worded
# without reference to any particular subject so it stays valid for whatever
# topic is substituted.
JUDGE_SYSTEM = """You are an impartial LLM judge.
Score the USER prompt's **relevance to the topic**: "{topic}".
Return a JSON object: {{"score": <integer 0-100>, "rationale": "<one short sentence>"}}.
Scoring rubric:
- 90–100: Directly about the topic.
- 70–89: Mostly about the topic; minor drift.
- 40–69: Tangential; in a related area but with only a limited link to the topic.
- 10–39: Barely related.
- 0–9: Unrelated or off-topic.
Output ONLY valid JSON; no extra text."""

def _parse_judge_json(text):
    """Decode the judge's reply, tolerating Markdown fences or stray text around the JSON object."""
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        # Fall back to the outermost {...} span, which covers replies such as
        # "```json\n{...}\n```" or "Here is the result: {...}".
        start, end = text.find("{"), text.rfind("}")
        if start == -1 or end <= start:
            raise
        return json.loads(text[start:end + 1])


def _one_call(prompt: str,
              topic: str = "baking a cake",
              model: str = "gpt-3.5-turbo",
              timeout_s: int = 60) -> Dict:
    """Ask the judge model to rate one prompt's relevance to ``topic``.

    POSTs a single chat-completion request to ``ENDPOINT`` with the
    ``JUDGE_SYSTEM`` rubric (formatted with ``topic``) as the system message
    and ``prompt`` as the user message, prints the raw reply text, and returns
    the reply decoded as JSON.

    Args:
        prompt: Text to be judged; sent verbatim as the user message.
        topic: Topic substituted into the ``JUDGE_SYSTEM`` template.
        model: Model identifier placed in the request body.
        timeout_s: Timeout in seconds passed to ``requests.post``.

    Returns:
        The decoded JSON value from the reply. The system prompt asks the
        model for an object with ``"score"`` and ``"rationale"`` keys, but the
        keys are not validated here.

    Raises:
        requests.HTTPError: If the HTTP status indicates an error
            (via ``raise_for_status``).
        KeyError, IndexError: If the response body lacks the
            ``choices[0].message.content`` path.
        json.JSONDecodeError: If no JSON object can be decoded from the reply.
    """
    headers = {
        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
        "Content-Type": "application/json",
        # Optional app headers shown in docs:
        # "HTTP-Referer": "https://your.app", "X-Title": "LLM Judge"
    }
    body = {
        "model": model,
        "response_format": {"type": "json_object"},
        "temperature": 0,            # minimise sampling variance between votes
        "max_tokens": 128,
        "messages": [
            {"role": "system", "content": JUDGE_SYSTEM.format(topic=topic)},
            {"role": "user", "content": prompt}
        ],
    }
    # `json=` lets requests serialise the body itself.
    r = requests.post(ENDPOINT, headers=headers, json=body, timeout=timeout_s)
    r.raise_for_status()
    msg = r.json()["choices"][0]["message"]["content"]
    print(msg) # Print the raw response
    return _parse_judge_json(msg)

def score_prompts(prompts: List[str],
                  topic: str = "baking a cake",
                  model: str = "gpt-3.5-turbo",
                  per_prompt_votes: int = 1,
                  sleep_between_s: float = 0.0,
                  judge: Optional[Callable[..., Dict]] = None) -> Dict[str, Dict[str, Optional[float]]]:
    """Score each prompt's relevance to ``topic`` by polling a judge several times.

    Args:
        prompts: Prompts to judge. Results are keyed by the prompt text, so a
            prompt that appears more than once keeps only its last result.
        topic: Topic forwarded to the judge.
        model: Model identifier forwarded to the judge.
        per_prompt_votes: Number of judge calls made per prompt.
        sleep_between_s: Seconds to sleep after every judge call; a falsy
            value disables sleeping.
        judge: Callable invoked as ``judge(prompt, topic=..., model=...)`` and
            returning a dict with optional ``"score"`` and ``"rationale"``
            keys. Defaults to ``_one_call``.

    Returns:
        A dict mapping each prompt to a dict with keys ``"scores"`` (list of
        int votes), ``"avg"``, ``"min"``, ``"max"`` and ``"rationales"`` (list
        of str). ``"avg"``, ``"min"`` and ``"max"`` are ``None`` when no votes
        were collected (``per_prompt_votes <= 0``).

    Raises:
        TypeError, ValueError: If a returned ``"score"`` cannot be converted
            with ``int``. Exceptions raised by ``judge`` propagate unchanged.
    """
    if judge is None:
        judge = _one_call
    out = {}
    for p in prompts:
        votes = []
        reasons = []
        for _ in range(per_prompt_votes):
            res = judge(p, topic=topic, model=model)
            # A reply without a "score" key is counted as 0.
            votes.append(int(res.get("score", 0)))
            reasons.append(res.get("rationale", ""))
            if sleep_between_s:
                time.sleep(sleep_between_s)
        # With zero votes there is nothing to aggregate; report None instead
        # of dividing by zero or calling min()/max() on an empty list.
        out[p] = {
            "scores": votes,
            "avg": sum(votes) / len(votes) if votes else None,
            "min": min(votes) if votes else None,
            "max": max(votes) if votes else None,
            "rationales": reasons,
        }
    return out

# Demo run: judge four sample prompts against the cake-baking topic, three
# votes each, and print the average, the raw votes and the first rationale.
if __name__ == "__main__":
    prompts = [
        "How do I frost a layer cake so it’s perfectly smooth?",
        "What’s the best way to sear a steak in cast iron?",
        "Step-by-step recipe for a moist chocolate sponge cake.",
        "Explain the Roman Empire’s fall in one paragraph."
    ]
    results = score_prompts(
        prompts,
        topic="baking a cake",
        model="gpt-3.5-turbo",
        per_prompt_votes=3,
        sleep_between_s=0.2,
    )
    for prompt_text, stats in results.items():
        first_rationale = stats["rationales"][0]
        print(prompt_text, "=>", stats["avg"], stats["scores"], first_rationale)

{"score": 100, "rationale": "Directly addresses the topic of baking a cake."}
{"score": 100, "rationale": "Directly addresses the topic of baking a cake."}
{"score": 95, "rationale": "Directly addresses the topic of baking a cake and the specific step of frosting"}
{"score": 10, "rationale": "Barely related."}
{"score": 10, "rationale": "Barely related."}
{"score": 10, "rationale": "Barely related."}
{"score": 100, "rationale": "Directly provides a recipe for baking a cake."}
{"score": 100, "rationale": "Directly provides a recipe for baking a cake."}
{"score": 100, "rationale": "Directly provides a recipe for baking a cake."}
{"score": 0, "rationale": "Unrelated to baking a cake."}
{"score": 0, "rationale": "Unrelated to baking a cake."}
{"score": 0, "rationale": "Unrelated to baking a cake."}
How do I frost a layer cake so it’s perfectly smooth? => 98.33333333333333 [100, 100, 95] Directly addresses the topic of baking a cake.
What’s the best way to sear a steak in cast iron? => 10.0