# Comparing Prompts on MMLU with Statistical Testing

A quick end-to-end experiment: two prompts, 30 MMLU questions, and proper significance testing to see if the difference actually matters.

## 1. Dependencies


In [20]:
!pip install datasets openai numpy scipy -q

## 2. API Key

In [21]:
import os
from getpass import getpass

# Prompt for the API key without echoing it to the screen, then publish it
# through the OPENAI_API_KEY environment variable for the rest of the session.
# NOTE(review): presumably this is what the argument-less `OpenAI()` client
# created later in the notebook picks up — confirm against the SDK docs.
api_key = getpass("Enter your OpenAI API key: ")
os.environ["OPENAI_API_KEY"] = api_key


Enter your OpenAI API key: ··········


## 3. Statistical Functions

These are pulled from the `rigor` package — defined inline here so nothing extra to install.

In [22]:
import numpy as np
from scipy import stats
from dataclasses import dataclass
from typing import List, Literal, Union

# ---------------------------------------------------------------------------
# RIGOR: Statistical functions for LLM experiments
# ---------------------------------------------------------------------------

@dataclass
class ComparisonResult:
    """Result of comparing two sets of scores.

    The custom ``__repr__`` renders a multi-line, human-readable report, so
    ``print(result)`` shows that report instead of the default field dump.
    """
    test_name: str                    # label shown as the report heading
    statistic: float
    p_value: float
    significant: bool                 # shown as "significant" / "not significant"
    effect_size: float                # reported as Cohen's d
    effect_size_interpretation: str
    ci_low: float                     # lower bound of the interval on the difference
    ci_high: float                    # upper bound of the interval on the difference
    mean_a: float
    mean_b: float
    mean_difference: float
    n_samples: int
    alpha: float                      # significance level; the interval is labelled 1 - alpha

    def __repr__(self):
        sig_str = "significant" if self.significant else "not significant"
        # Derive the label from alpha so that it always matches the interval
        # stored in ci_low/ci_high (e.g. alpha=0.1 -> "90%").
        ci_level = f"{100 * (1 - self.alpha):g}%"
        return (
            f"\n{self.test_name}\n"
            f"{'-'*50}\n"
            f"p-value: {self.p_value:.4f} ({sig_str} at alpha={self.alpha})\n"
            f"\n"
            f"Prompt A: {self.mean_a:.1%}\n"
            f"Prompt B: {self.mean_b:.1%}\n"
            f"Difference: {self.mean_difference:.1%} [{ci_level} CI: {self.ci_low:.1%}, {self.ci_high:.1%}]\n"
            f"\n"
            f"Effect size (Cohen's d): {self.effect_size:.3f} ({self.effect_size_interpretation})\n"
            f"n = {self.n_samples}"
        )

@dataclass
class ConfidenceInterval:
    """A point estimate together with its lower and upper confidence bounds."""
    estimate: float
    ci_low: float
    ci_high: float
    confidence_level: float
    method: str
    n_samples: int

    @property
    def margin_of_error(self):
        """Half of the interval's total width."""
        width = self.ci_high - self.ci_low
        return width / 2

    def __repr__(self):
        # Rendered as "<estimate> [<low>, <high>] (n=<n>)", all as percentages.
        center = f"{self.estimate:.1%}"
        bounds = f"[{self.ci_low:.1%}, {self.ci_high:.1%}]"
        return f"{center} {bounds} (n={self.n_samples})"


def bootstrap_ci(data, n_bootstrap=10000, alpha=0.05, random_state=42):
    """Percentile-bootstrap interval for the mean of ``data``.

    Draws ``n_bootstrap`` resamples (with replacement, same size as ``data``)
    from a generator seeded with ``random_state`` and returns the
    ``alpha / 2`` and ``1 - alpha / 2`` percentiles of the resampled means
    as a ``(low, high)`` pair.
    """
    sample = np.asarray(data)
    sample_size = len(sample)
    generator = np.random.default_rng(random_state)

    resampled_means = []
    for _ in range(n_bootstrap):
        draw = generator.choice(sample, size=sample_size, replace=True)
        resampled_means.append(np.mean(draw))
    resampled_means = np.array(resampled_means)

    lower_pct = 100 * alpha / 2
    upper_pct = 100 * (1 - alpha / 2)
    return np.percentile(resampled_means, lower_pct), np.percentile(resampled_means, upper_pct)


def compute_ci(data, method="wilson", alpha=0.05, n_bootstrap=10000):
    """Return a ``(low, high)`` confidence interval for the mean of ``data``.

    Args:
        data: Sequence of scores. The Wilson formula treats the mean as a
            proportion, so it is meant for 0/1 scores.
        method: ``"wilson"`` for the Wilson score interval, or ``"bootstrap"``
            to delegate to ``bootstrap_ci``.
        alpha: Two-sided significance level (0.05 gives a 95% interval).
        n_bootstrap: Number of resamples; only used by the bootstrap method.

    Raises:
        ValueError: If ``data`` is empty or ``method`` is not recognised.
            Previously an unknown method fell through and returned ``None``,
            and empty data divided by zero.
    """
    data = np.asarray(data)
    n = len(data)
    if n == 0:
        raise ValueError("data must contain at least one observation")

    if method == "wilson":
        p_hat = np.mean(data)
        z = stats.norm.ppf(1 - alpha / 2)
        denom = 1 + z**2 / n
        center = (p_hat + z**2 / (2 * n)) / denom
        margin = (z / denom) * np.sqrt(p_hat * (1 - p_hat) / n + z**2 / (4 * n**2))
        return center - margin, center + margin
    if method == "bootstrap":
        return bootstrap_ci(data, n_bootstrap=n_bootstrap, alpha=alpha)

    raise ValueError(f"Unknown method {method!r}; expected 'wilson' or 'bootstrap'")


def cohens_d(scores_a, scores_b):
    """Standardized mean of the paired differences ``scores_a - scores_b``.

    Computed as ``mean(diff) / std(diff, ddof=1)``. Returns ``0.0`` when the
    differences have no spread, or when there are fewer than two of them and
    the sample standard deviation is undefined.

    Inputs are converted to float arrays, so plain lists and boolean arrays
    are accepted (both used to raise ``TypeError`` on the subtraction).
    """
    diff = np.asarray(scores_a, dtype=float) - np.asarray(scores_b, dtype=float)
    if diff.size < 2:
        # ddof=1 needs at least two observations; skip the NaN and its warning.
        return 0.0
    std = np.std(diff, ddof=1)
    return np.mean(diff) / std if std > 0 else 0.0


def interpret_effect_size(d):
    """Map the magnitude of an effect size to a label via 0.2 / 0.5 / 0.8 cut-offs."""
    magnitude = abs(d)
    thresholds = ((0.2, "negligible"), (0.5, "small"), (0.8, "medium"))
    for upper_bound, label in thresholds:
        if magnitude < upper_bound:
            return label
    # Everything at or above the last cut-off.
    return "large"


def compare_prompts(
    scores_a: Union[List[float], np.ndarray],
    scores_b: Union[List[float], np.ndarray],
    test: Literal["paired_t", "wilcoxon", "bootstrap", "mcnemar", "auto"] = "auto",
    alpha: float = 0.05,
    n_bootstrap: int = 10000,
) -> ComparisonResult:
    """
    Compare two paired sets of LLM evaluation scores.

    ``scores_a[i]`` and ``scores_b[i]`` must be scores for the same item.

    With ``test="auto"`` the test is chosen from the data: binary (0/1)
    scores use McNemar's test; otherwise fewer than 20 pairs use the
    bootstrap test and 20 or more use the Wilcoxon signed-rank test.
    McNemar's test only counts pairs scored exactly 1/0 or 0/1, so forcing
    it on non-binary scores is not meaningful.

    Args:
        scores_a: Scores for prompt A.
        scores_b: Scores for prompt B, aligned with ``scores_a``.
        test: Name of the test to run, or ``"auto"``.
        alpha: Significance level; also sets the width of the interval.
        n_bootstrap: Number of bootstrap resamples.

    Returns:
        A ``ComparisonResult``. The interval is for the mean difference
        A - B and the effect size comes from ``cohens_d``.

    Raises:
        ValueError: If ``test`` is not a supported name, the inputs differ in
            shape, or the inputs are empty.
    """
    valid_tests = ("paired_t", "wilcoxon", "bootstrap", "mcnemar", "auto")
    if test not in valid_tests:
        # Without this, an unknown name fell through every branch and
        # surfaced later as an UnboundLocalError.
        raise ValueError(f"Unknown test {test!r}; expected one of {valid_tests}")

    # Float conversion lets boolean inputs survive the subtractions below.
    scores_a = np.asarray(scores_a, dtype=float)
    scores_b = np.asarray(scores_b, dtype=float)
    if scores_a.shape != scores_b.shape:
        # A length-1 input would otherwise broadcast silently.
        raise ValueError(
            f"scores_a and scores_b must be paired: shapes {scores_a.shape} and {scores_b.shape} differ"
        )
    n = len(scores_a)
    if n == 0:
        raise ValueError("scores_a and scores_b must not be empty")

    # Check if binary
    is_binary = set(np.unique(scores_a)).issubset({0, 1}) and set(np.unique(scores_b)).issubset({0, 1})

    if test == "auto":
        if is_binary:
            test = "mcnemar"
        elif n < 20:
            test = "bootstrap"
        else:
            test = "wilcoxon"

    diff = scores_a - scores_b

    if test == "mcnemar":
        # McNemar's test for paired binary data: only discordant pairs matter.
        b = np.sum((scores_a == 1) & (scores_b == 0))  # A correct, B wrong
        c = np.sum((scores_a == 0) & (scores_b == 1))  # A wrong, B correct

        if b + c == 0:
            p_value = 1.0
            stat = 0.0
        else:
            # McNemar with continuity correction
            stat = (abs(b - c) - 1)**2 / (b + c)
            # sf is the upper tail (1 - cdf) without the subtraction's
            # precision loss for very small p-values.
            p_value = stats.chi2.sf(stat, df=1)

        test_name = "McNemar's test"
        ci_low, ci_high = bootstrap_ci(diff, n_bootstrap=n_bootstrap, alpha=alpha)

    elif test == "wilcoxon":
        if np.all(diff == 0):
            # With no non-zero differences there is nothing to rank.
            stat, p_value = 0.0, 1.0
        else:
            stat, p_value = stats.wilcoxon(scores_a, scores_b, zero_method='wilcox')
        test_name = "Wilcoxon signed-rank"
        ci_low, ci_high = bootstrap_ci(diff, n_bootstrap=n_bootstrap, alpha=alpha)

    elif test == "paired_t":
        stat, p_value = stats.ttest_rel(scores_a, scores_b)
        test_name = "Paired t-test"
        se = stats.sem(diff)
        t_crit = stats.t.ppf(1 - alpha / 2, df=n - 1)
        ci_low, ci_high = np.mean(diff) - t_crit * se, np.mean(diff) + t_crit * se

    else:  # test == "bootstrap"
        observed_diff = np.mean(diff)
        # Centre the differences so that resampling simulates "no difference".
        centered_diff = diff - observed_diff
        rng = np.random.default_rng(42)
        boot_diffs = np.array([np.mean(rng.choice(centered_diff, size=n, replace=True)) for _ in range(n_bootstrap)])
        p_value = np.mean(np.abs(boot_diffs) >= np.abs(observed_diff))
        stat = observed_diff
        test_name = "Bootstrap"
        ci_low, ci_high = bootstrap_ci(diff, n_bootstrap=n_bootstrap, alpha=alpha)

    effect = cohens_d(scores_a, scores_b)
    effect_interp = interpret_effect_size(effect)

    return ComparisonResult(
        test_name=test_name,
        statistic=float(stat),
        p_value=float(p_value),
        significant=bool(p_value < alpha),
        effect_size=float(effect),
        effect_size_interpretation=effect_interp,
        ci_low=float(ci_low),
        ci_high=float(ci_high),
        mean_a=float(np.mean(scores_a)),
        mean_b=float(np.mean(scores_b)),
        mean_difference=float(np.mean(scores_a) - np.mean(scores_b)),
        n_samples=n,
        alpha=alpha,
    )


def metric_ci(scores, confidence=0.95):
    """Wilson confidence interval for the mean of ``scores``, packaged with its estimate."""
    values = np.asarray(scores)
    bounds = compute_ci(values, method="wilson", alpha=1 - confidence)
    lower, upper = bounds
    point_estimate = np.mean(values)
    return ConfidenceInterval(
        estimate=float(point_estimate),
        ci_low=float(lower),
        ci_high=float(upper),
        confidence_level=confidence,
        method="wilson",
        n_samples=len(values),
    )

## 4. Load MMLU

Pulling 30 questions across 8 subjects — a mix of high school and college level to keep it representative.

In [23]:
import math
from collections import Counter

from datasets import load_dataset

# Choose subjects to sample from
SUBJECTS = [
    "high_school_mathematics",
    "high_school_physics",
    "high_school_chemistry",
    "high_school_biology",
    "high_school_computer_science",
    "college_mathematics",
    "college_physics",
    "machine_learning",
]

# Load questions from multiple subjects
N_EXAMPLES = 30
# Ceiling division: fetch just enough per subject to reach N_EXAMPLES.
# The previous `// ... + 1` over-fetched whenever N_EXAMPLES was a multiple of
# the subject count, so the truncation below starved the last subjects
# (N_EXAMPLES=40 left the eighth subject with no questions at all).
examples_per_subject = math.ceil(N_EXAMPLES / len(SUBJECTS))

all_examples = []
for subject in SUBJECTS:
    try:
        ds = load_dataset("cais/mmlu", subject, split=f"test[:{examples_per_subject}]")
    except Exception as e:
        # Best effort: report the subject that failed and carry on with the rest.
        print(f"Could not load {subject}: {e}")
        continue
    for ex in ds:
        # Tag each row with its subject for the per-subject breakdown later on.
        ex['subject'] = subject
        all_examples.append(ex)

# Take exactly N_EXAMPLES (fewer if some subjects failed to load)
dataset = all_examples[:N_EXAMPLES]

print(f"{len(dataset)} questions loaded")
print(f"\nSubjects included:")
subject_counts = Counter(ex['subject'] for ex in dataset)
for subj, count in subject_counts.items():
    print(f"  - {subj}: {count}")

30 questions loaded

Subjects included:
  - high_school_mathematics: 4
  - high_school_physics: 4
  - high_school_chemistry: 4
  - high_school_biology: 4
  - high_school_computer_science: 4
  - college_mathematics: 4
  - college_physics: 4
  - machine_learning: 2


In [24]:
# Sanity check — print the first question with lettered choices and its key
ex = dataset[0]
print(f"Subject: {ex['subject']}")
print(f"Question: {ex['question']}")
# Start the enumeration at the code point of "A" so each choice gets A, B, C, ...
for code_point, choice in enumerate(ex['choices'], start=ord('A')):
    print(f"  {chr(code_point)}. {choice}")
print(f"Correct answer: {chr(ord('A') + ex['answer'])}")

Subject: high_school_mathematics
Question: If a pentagon P with vertices at (– 2, – 4), (– 4, 1), (–1, 4), (2, 4), and (3, 0) is reflected across the line y = x to get a new pentagon, P’, then one of the vertices of P’ is
  A. (0, – 3)
  B. (4, 1)
  C. (2, 2)
  D. (– 4, –2)
Correct answer: D


## 5. The Two Prompts

Prompt A asks the model to reason before answering. Prompt B goes straight to the answer. Simple comparison - does chain-of-thought actually help here?


In [25]:
# The two prompts we're comparing.
# Both are str.format templates; format_prompt() fills the {question} and
# {choice_a}..{choice_d} placeholders for each example.

# Prompt A: asks for step-by-step reasoning before the final letter.
PROMPT_A = """Answer this multiple choice question. Think through it step by step, then give your final answer as a single letter (A, B, C, or D).

Question: {question}

A. {choice_a}
B. {choice_b}
C. {choice_c}
D. {choice_d}

Think step by step, then answer:"""

# Prompt B: asks for the letter only.
PROMPT_B = """Answer this multiple choice question with just the letter (A, B, C, or D).

Question: {question}

A. {choice_a}
B. {choice_b}
C. {choice_c}
D. {choice_d}

Answer:"""

# Preview only the first 150 characters of each (still unfilled) template.
print("Prompt A (with reasoning):")
print(PROMPT_A[:150] + "...")
print("\nPrompt B (direct answer):")
print(PROMPT_B[:150] + "...")

Prompt A (with reasoning):
Answer this multiple choice question. Think through it step by step, then give your final answer as a single letter (A, B, C, or D).

Question: {quest...

Prompt B (direct answer):
Answer this multiple choice question with just the letter (A, B, C, or D).

Question: {question}

A. {choice_a}
B. {choice_b}
C. {choice_c}
D. {choice...


## 6. Helper Functions


In [26]:
import re
from openai import OpenAI

client = OpenAI()

def format_prompt(template, example):
    """Fill ``template`` with the example's question and its first four choices."""
    question = example['question']
    options = example['choices']
    # Placeholders are named choice_a .. choice_d, in the order of the choices list.
    choice_fields = {f"choice_{suffix}": options[pos] for pos, suffix in enumerate("abcd")}
    return template.format(question=question, **choice_fields)


# The whole reply is just a letter, optionally wrapped: "B", "b", "(B)", "**B**", "B."
_BARE_LETTER_RE = re.compile(r'[\(\[\*\s]*([A-Da-d])[\)\]\*\.\:\s]*')
# An explicit statement: "Answer: B", "final answer is (C)", "the answer is option D".
# Only the keywords are case-insensitive; the letter must be an upper-case,
# standalone token so that "the answer can be..." is not read as "C".
_EXPLICIT_ANSWER_RE = re.compile(
    r'(?i:\banswer\b(?:\s+is)?\s*[:\-]?\s*(?:(?:option|choice)\s+)?)'
    r'[\*\(\[]*([ABCD])\b'
)
# The reply opens with a labelled choice: "B. 42", "B) 42", "B: 42".
_LEADING_LETTER_RE = re.compile(r'([ABCD])[\.\)\:](?:\s|$)')
_PAREN_LETTER_RE = re.compile(r'\(([ABCD])\)')
_STANDALONE_LETTER_RE = re.compile(r'\b([ABCD])\b')


def extract_answer(text):
    """
    Extract the answer letter (A, B, C, or D) from a model response.

    Strategies are tried from most to least reliable:

    1. the whole response is a single (optionally wrapped) letter;
    2. an explicit "answer ..." statement; the last one wins, because a
       reasoning response states its conclusion at the end;
    3. the response starts with a labelled choice such as "B." or "B)";
    4. the last parenthesized letter, e.g. "(C)";
    5. the last standalone upper-case letter in the final 50 characters.

    Letters are matched as whole tokens on the original casing, so ordinary
    words and the article "a" are never mistaken for an answer.

    Returns:
        The upper-case letter, or ``None`` if ``text`` is empty or no answer
        could be found.
    """
    if not text:
        return None

    text = text.strip()

    bare = _BARE_LETTER_RE.fullmatch(text)
    if bare:
        return bare.group(1).upper()

    explicit = _EXPLICIT_ANSWER_RE.findall(text)
    if explicit:
        return explicit[-1]

    leading = _LEADING_LETTER_RE.match(text)
    if leading:
        return leading.group(1)

    parenthesized = _PAREN_LETTER_RE.findall(text)
    if parenthesized:
        return parenthesized[-1]

    # Last resort: a standalone letter near the end of the response.
    trailing = _STANDALONE_LETTER_RE.findall(text[-50:])
    if trailing:
        return trailing[-1]

    return None


def run_prompt(prompt_text, model="gpt-3.5-turbo"):
    """
    Send ``prompt_text`` as a single user message and return the reply text.

    Any exception raised by the request is printed and swallowed; ``None`` is
    returned in that case.
    """
    request = {
        "model": model,
        "messages": [{"role": "user", "content": prompt_text}],
        "max_tokens": 300,
        "temperature": 0,  # Keep sampling randomness to a minimum for reproducibility
    }
    try:
        completion = client.chat.completions.create(**request)
        return completion.choices[0].message.content
    except Exception as err:
        print(f"Error: {err}")
        return None


def get_correct_letter(example):
    """Translate the example's zero-based answer index into its letter (0 -> A, 1 -> B, ...)."""
    answer_index = example['answer']
    return chr(ord('A') + answer_index)


def score_response(response, correct_letter):
    """
    Return 1 when the letter extracted from ``response`` equals
    ``correct_letter``, and 0 otherwise (including when nothing is extracted).
    """
    predicted = extract_answer(response)
    is_correct = predicted is not None and predicted == correct_letter
    return int(is_correct)

## 7. Run the Experiment

Same question, both prompts. Paired design — this matters for the test selection later.

Should take 2–3 minutes.

In [27]:
import time

# Store results.
# scores_a and scores_b are index-aligned: position i in both lists belongs to
# the same question, which is what makes the later comparison a paired one.
scores_a = []  # Reasoning prompt
scores_b = []  # Direct prompt
results_log = []  # Detailed log (one dict per question)

print(f"Running {len(dataset)} questions...\n")

for i, example in enumerate(dataset):
    correct = get_correct_letter(example)

    # Format and run both prompts
    prompt_a_text = format_prompt(PROMPT_A, example)
    prompt_b_text = format_prompt(PROMPT_B, example)

    # NOTE(review): run_prompt returns None when the API call fails; that None
    # is then scored 0 below, so a failed request is indistinguishable from a
    # wrong answer in the statistics.
    response_a = run_prompt(prompt_a_text)
    response_b = run_prompt(prompt_b_text)

    # Extract answers (kept only for the log and the progress line;
    # score_response runs the extraction again internally)
    answer_a = extract_answer(response_a)
    answer_b = extract_answer(response_b)

    # Score responses
    score_a = score_response(response_a, correct)
    score_b = score_response(response_b, correct)

    scores_a.append(score_a)
    scores_b.append(score_b)

    # Log details (question text truncated to 60 characters for display)
    results_log.append({
        "subject": example['subject'],
        "question": example['question'][:60] + "...",
        "correct": correct,
        "answer_a": answer_a,
        "answer_b": answer_b,
        "score_a": score_a,
        "score_b": score_b,
    })

    # Progress indicator ('?' marks a response with no extractable answer)
    subj_short = example['subject'][:15]
    print(f"[{i+1:2d}/{len(dataset)}] {subj_short:15} | A:{answer_a or '?'} B:{answer_b or '?'} (correct: {correct})")

    # Small delay to avoid rate limits
    time.sleep(0.5)

# Overall accuracy per prompt
print(f"\nPrompt A (reasoning): {sum(scores_a)}/{len(scores_a)} correct ({sum(scores_a)/len(scores_a):.1%})")
print(f"Prompt B (direct):    {sum(scores_b)}/{len(scores_b)} correct ({sum(scores_b)/len(scores_b):.1%})")

Running 30 questions...

[ 1/30] high_school_mat | A:C B:C (correct: D)
[ 2/30] high_school_mat | A:C B:A (correct: C)
[ 3/30] high_school_mat | A:A B:B (correct: A)
[ 4/30] high_school_mat | A:C B:B (correct: B)
[ 5/30] high_school_phy | A:C B:B (correct: B)
[ 6/30] high_school_phy | A:B B:C (correct: A)
[ 7/30] high_school_phy | A:D B:D (correct: D)
[ 8/30] high_school_phy | A:B B:A (correct: C)
[ 9/30] high_school_che | A:A B:A (correct: A)
[10/30] high_school_che | A:A B:B (correct: A)
[11/30] high_school_che | A:C B:C (correct: C)
[12/30] high_school_che | A:B B:A (correct: A)
[13/30] high_school_bio | A:A B:A (correct: A)
[14/30] high_school_bio | A:C B:B (correct: A)
[15/30] high_school_bio | A:C B:C (correct: C)
[16/30] high_school_bio | A:A B:A (correct: A)
[17/30] high_school_com | A:C B:C (correct: C)
[18/30] high_school_com | A:A B:A (correct: A)
[19/30] high_school_com | A:A B:A (correct: A)
[20/30] high_school_com | A:B B:B (correct: C)
[21/30] college_mathema | A:B B:A (

## 8. Statistical Analysis

Binary scores (correct/incorrect) on paired data — McNemar's test is the right call here. `compare_prompts` picks it automatically for 0/1 scores when `test="auto"` (the default).

In [28]:
# THE KEY QUESTION: Is Prompt A actually better than Prompt B?
# test is left at its default ("auto"), so the test is chosen from the data.
# `result` is reused by the interpretation cell further down.
result = compare_prompts(scores_a, scores_b)
print(result)


McNemar's test
--------------------------------------------------
p-value: 0.7237 (not significant at alpha=0.05)

Prompt A: 60.0%
Prompt B: 53.3%
Difference: 6.7% [95% CI: -13.3%, 23.3%]

Effect size (Cohen's d): 0.128 (negligible)
n = 30


## 9. Confidence Intervals

In [29]:
# Per-prompt accuracy with its own interval (metric_ci defaults to
# confidence=0.95). These describe each prompt separately; the interval on
# the A-B difference is part of the comparison result above.
ci_a = metric_ci(scores_a)
ci_b = metric_ci(scores_b)

print("Individual Accuracy with 95% Confidence Intervals:")
print("="*55)
print(f"Prompt A (reasoning): {ci_a}")
print(f"Prompt B (direct):    {ci_b}")
print("="*55)

Individual Accuracy with 95% Confidence Intervals:
Prompt A (reasoning): 60.0% [42.3%, 75.4%] (n=30)
Prompt B (direct):    53.3% [36.1%, 69.8%] (n=30)


## 10. By Subject

In [30]:
from collections import defaultdict

# Per-subject accuracy table for both prompts
print("\nAccuracy by Subject:")
print("="*70)
print(f"{'Subject':<35} | {'Prompt A':>10} | {'Prompt B':>10} | {'Diff':>8}")
print("-"*70)

# subject -> {'a': [scores for prompt A], 'b': [scores for prompt B]}
subject_scores = defaultdict(lambda: {'a': [], 'b': []})

for log in results_log:
    bucket = subject_scores[log['subject']]
    bucket['a'].append(log['score_a'])
    bucket['b'].append(log['score_b'])

# One row per subject, in alphabetical order
for subject in sorted(subject_scores):
    scores = subject_scores[subject]
    acc_a = sum(scores['a']) / len(scores['a'])
    acc_b = sum(scores['b']) / len(scores['b'])
    diff = acc_a - acc_b
    # Only strictly positive differences get an explicit "+" sign
    diff_str = f"{diff:.0%}"
    if diff > 0:
        diff_str = "+" + diff_str
    print(f"{subject:<35} | {acc_a:>10.0%} | {acc_b:>10.0%} | {diff_str:>8}")

print("="*70)


Accuracy by Subject:
Subject                             |   Prompt A |   Prompt B |     Diff
----------------------------------------------------------------------
college_mathematics                 |        50% |        25% |     +25%
college_physics                     |        75% |        50% |     +25%
high_school_biology                 |        75% |        75% |       0%
high_school_chemistry               |        75% |        75% |       0%
high_school_computer_science        |        75% |        75% |       0%
high_school_mathematics             |        50% |        25% |     +25%
high_school_physics                 |        25% |        50% |     -25%
machine_learning                    |        50% |        50% |       0%


## 11. Detailed Disagreements

In [31]:
# List every question on which the two prompts were scored differently
print("\nCases where prompts disagreed:")
print("="*70)

disagreements = a_wins = b_wins = 0

for i, log in enumerate(results_log):
    # Same score for both prompts (both right or both wrong): nothing to report
    if log['score_a'] == log['score_b']:
        continue

    disagreements += 1
    if log['score_a'] > log['score_b']:
        a_wins += 1
        winner = "A wins"
    else:
        b_wins += 1
        winner = "B wins"

    print(f"Q{i+1} [{log['subject'][:20]}]")
    print(f"   {log['question']}")
    print(f"   Correct: {log['correct']} | A: {log['answer_a']} | B: {log['answer_b']} → {winner}")
    print()

ties = len(results_log) - disagreements
print(f"Summary:")
print(f"  - Prompt A wins: {a_wins}")
print(f"  - Prompt B wins: {b_wins}")
print(f"  - Ties (both right or both wrong): {ties}")


Cases where prompts disagreed:
Q2 [high_school_mathemat]
   The length of a rectangle is twice its width. Given the leng...
   Correct: C | A: C | B: A → A wins

Q3 [high_school_mathemat]
   A positive integer n is called “powerful” if, for every prim...
   Correct: A | A: A | B: B → A wins

Q4 [high_school_mathemat]
   At breakfast, lunch, and dinner, Joe randomly chooses with e...
   Correct: B | A: C | B: B → B wins

Q5 [high_school_physics]
   The plates of a capacitor are charged to a potential differe...
   Correct: B | A: C | B: B → B wins

Q10 [high_school_chemistr]
   Carbon has an atomic radius of 77 pm and a first ionization ...
   Correct: A | A: A | B: B → A wins

Q12 [high_school_chemistr]
   The net ionic equation expected when solutions of NH4Br and ...
   Correct: A | A: B | B: A → B wins

Q21 [college_mathematics]
   Let k be the number of real solutions of the equation e^x + ...
   Correct: B | A: B | B: A → A wins

Q25 [college_physics]
   The quantum efficiency of

## 12. Interpretation

In [32]:
# Turn the test result into a plain-language recommendation
acc_a, acc_b = np.mean(scores_a), np.mean(scores_b)
diff = acc_a - acc_b

print(f"\nRaw accuracy difference: {diff:+.1%}")
print(f"  Prompt A (reasoning): {acc_a:.1%}")
print(f"  Prompt B (direct):    {acc_b:.1%}")

if not result.significant:
    # No detectable difference: fall back to the cheaper prompt
    print(f"\nThe difference is not statistically significant (p = {result.p_value:.4f})")
    print(f"Can't confidently say one prompt is better. Could be noise.")
    print(f"With n={len(scores_a)}, might need more data to detect a real difference.")
    print(f"\nDecision: Default to the simpler/faster/cheaper prompt (B) unless you can collect more data.")
else:
    # The sign of the raw difference decides which prompt is named the winner
    if diff > 0:
        better = "A (reasoning)"
    else:
        better = "B (direct)"
    print(f"\nThe difference is statistically significant (p = {result.p_value:.4f})")
    print(f"Prompt {better} appears genuinely better — unlikely to be random chance.")
    print(f"Decision: Deploy prompt {better}.")

print(f"\nEffect size: {result.effect_size:.3f} ({result.effect_size_interpretation})")


Raw accuracy difference: +6.7%
  Prompt A (reasoning): 60.0%
  Prompt B (direct):    53.3%

The difference is not statistically significant (p = 0.7237)
Can't confidently say one prompt is better. Could be noise.
With n=30, might need more data to detect a real difference.

Decision: Default to the simpler/faster/cheaper prompt (B) unless you can collect more data.

Effect size: 0.128 (negligible)


## Takeaway

The raw accuracy gap might look meaningful at first glance, but the statistics tell you whether it actually is.

With p ≈ 0.72 here (well above 0.05), the 6.7-percentage-point difference could easily be noise on 30 questions. You'd need a larger sample to say anything definitive.

**Why n=30?** This is deliberately small to demonstrate the point: eyeballing a 6.7% difference feels meaningful, but the test reveals we can't distinguish it from chance. In production, you'd run 100-300 samples — this notebook is meant to show the methodology, not be a definitive MMLU evaluation.

Want to tighten this up? Bump `N_EXAMPLES` to 100+ and rerun. Or use `sample_size()` from the rigor package to figure out exactly how many you need upfront.

**Note:** MMLU has clear right/wrong answers, which makes it easy to demonstrate the methodology. Real-world evaluation (RAG, open-ended generation, subjective quality) requires the same statistical rigor but with more complex scoring — that's where this approach becomes even more critical.
