# GPT-5 Vision Baseline
Evaluate GPT-5's vision capabilities on handwritten-math-to-LaTeX conversion.

**Setup on Kaggle:** Add your OpenAI API key as a Kaggle secret named `OPENAI_API_KEY`.

In [None]:
!pip install -q openai datasets

In [None]:
import os
import json
import re
import time
import base64
import random
from io import BytesIO

from openai import OpenAI
from datasets import load_dataset

# Resolve the OpenAI API key.
# On Kaggle, store it as a secret: Add-ons > Secrets > add OPENAI_API_KEY.
# If that lookup fails for any reason, the OPENAI_API_KEY environment variable
# is used instead.
try:
    from kaggle_secrets import UserSecretsClient
    api_key = UserSecretsClient().get_secret("OPENAI_API_KEY")
except Exception:
    # Deliberately broad: any failure of the Kaggle path (module unavailable,
    # secret lookup error, ...) falls back to the environment variable.
    api_key = os.getenv("OPENAI_API_KEY")

# Explicit raise instead of `assert`, which is skipped when Python runs with -O.
if not api_key:
    raise RuntimeError(
        "OPENAI_API_KEY not found. Add it as a Kaggle secret or env var."
    )
client = OpenAI(api_key=api_key)
print("OpenAI client ready.")

In [None]:
# config
# Number of test examples randomly drawn from the dataset for evaluation.
NUM_SAMPLES = 50
# Model identifier passed as `model` to the OpenAI Responses API call.
MODEL = "gpt-5"
# Pause between consecutive API requests; passed to time.sleep (seconds).
DELAY_BETWEEN_CALLS = 0.5
# Seed for Python's `random` module so the sampled subset is reproducible.
SEED = 42

## 1. Load dataset

In [None]:
# Load the test split of the MathWriting dataset.
ds = load_dataset("deepcopy/MathWriting-human", split="test")
print(f"Test set size: {len(ds)}")

# Draw a reproducible random subset: seed the RNG, pick row positions, then
# fetch the selected rows in the order they were drawn.
random.seed(SEED)
indices = random.sample(range(len(ds)), NUM_SAMPLES)
samples = [ds[row] for row in indices]
print(f"Sampled {NUM_SAMPLES} test images (seed={SEED})")

In [None]:
# Preview up to five sampled images, each titled with its ground-truth LaTeX.
import matplotlib.pyplot as plt

preview = samples[:5]
if preview:
    # Size the grid to the number of previews so no empty framed axes are left
    # when fewer than five samples exist. squeeze=False keeps `axes` as a 2-D
    # array even when there is only one column.
    fig, axes = plt.subplots(
        1, len(preview), figsize=(4 * len(preview), 4), squeeze=False
    )
    for ax, s in zip(axes[0], preview):
        ax.imshow(s["image"], cmap="gray")
        # Titles show only the first 30 characters of the label.
        ax.set_title(s["latex"][:30], fontsize=9)
        ax.axis("off")
    plt.tight_layout()
    plt.show()

## 2. Helpers

In [None]:
# Instruction sent as the text part of every request in predict_latex. It asks
# for bare LaTeX with no wrappers, since predictions are compared directly
# against the ground-truth string.
PROMPT = (
    "This image contains a handwritten mathematical equation or expression. "
    "Return ONLY the LaTeX markup that represents this equation. "
    "Do not include dollar signs, \\begin{equation}, or any other wrapper. "
    "Return just the raw LaTeX string, nothing else."
)


def encode_image_to_base64(pil_image):
    """Serialize an image to PNG in memory and return it as base64 text.

    Args:
        pil_image: Object exposing ``save(fileobj, format=...)`` (expected to
            be a PIL image).

    Returns:
        The PNG bytes encoded as a base64 ``str``.
    """
    with BytesIO() as png_buffer:
        pil_image.save(png_buffer, format="PNG")
        png_bytes = png_buffer.getvalue()
    encoded = base64.b64encode(png_bytes)
    return encoded.decode("utf-8")


def strip_code_fences(text):
    """Remove Markdown code-fence markers from a model response.

    Handles an opening fence with any info string on the fence line (for
    example ``latex``, ``tex`` or ``math``), not just ``latex``, so the
    language tag never leaks into the returned LaTeX. Surrounding whitespace
    is trimmed before matching so an indented or padded fence is still
    recognised.

    Args:
        text: Raw text returned by the model.

    Returns:
        ``text`` without fence markers, stripped of leading/trailing whitespace.
    """
    fence_pattern = (
        # Opening fence: a word-like info string is consumed only when it is
        # followed by a newline, so inline content such as "```x^2```" is
        # kept. A bare "latex" tag is still removed unconditionally, as before.
        r"^```(?:[\w+-]+[ \t]*(?=\n)|latex)?\n?"
        # Closing fence at the end of a line.
        r"|\n?```$"
    )
    return re.sub(fence_pattern, "", text.strip(), flags=re.MULTILINE).strip()


def predict_latex(pil_image):
    """Ask the configured model to transcribe an image into LaTeX.

    The image is PNG-encoded, embedded in the request as a base64 data URL,
    and sent together with ``PROMPT`` in a single user message.

    Args:
        pil_image: Image to transcribe (expected to be a PIL image).

    Returns:
        The model's text output with surrounding whitespace and Markdown code
        fences removed.
    """
    encoded = encode_image_to_base64(pil_image)
    text_part = {"type": "input_text", "text": PROMPT}
    image_part = {
        "type": "input_image",
        "image_url": f"data:image/png;base64,{encoded}",
    }
    user_message = {"role": "user", "content": [text_part, image_part]}

    response = client.responses.create(model=MODEL, input=[user_message])
    raw_text = response.output_text.strip()
    return strip_code_fences(raw_text)


def normalized_edit_distance(pred, target):
    """Return the Levenshtein distance between two sequences, scaled to [0, 1].

    The raw edit distance (insertions, deletions, substitutions, each costing
    1) is divided by the length of the longer input.

    Args:
        pred: Predicted sequence (typically a string).
        target: Reference sequence (typically a string).

    Returns:
        0.0 when both inputs are empty, otherwise distance / max(len) as a float.
    """
    longest = max(len(pred), len(target))
    if longest == 0:
        return 0.0

    # Row 0 of the DP table: turning an empty prefix into target[:j] costs j.
    previous_row = list(range(len(target) + 1))
    for row_idx, pred_item in enumerate(pred, start=1):
        # Column 0: turning pred[:row_idx] into an empty prefix costs row_idx.
        current_row = [row_idx]
        for col_idx, target_item in enumerate(target, start=1):
            diagonal = previous_row[col_idx - 1]
            if pred_item == target_item:
                cost = diagonal
            else:
                above = previous_row[col_idx]
                left = current_row[col_idx - 1]
                cost = 1 + min(above, left, diagonal)
            current_row.append(cost)
        previous_row = current_row

    return previous_row[-1] / longest

## 3. Evaluation

In [None]:
# Run the model over every sampled image and score each prediction against the
# ground truth by exact string match and normalized edit distance.
results = []
exact_matches = 0
total_edit_dist = 0.0
# Use the real number of sampled items so progress output and pacing stay
# correct even if NUM_SAMPLES was edited after the sampling cell ran.
total = len(samples)

for i, (dataset_index, sample) in enumerate(zip(indices, samples)):
    ground_truth = sample["latex"]
    image = sample["image"]

    error = None
    try:
        prediction = predict_latex(image)
    except Exception as e:
        # Best-effort: a failed request is scored as an empty prediction, but
        # the failure is recorded so it can be told apart from a genuinely
        # empty model answer when inspecting the saved results.
        print(f"  [{i+1}/{total}] ERROR: {e}")
        prediction = ""
        error = f"{type(e).__name__}: {e}"

    is_exact = prediction == ground_truth
    edit_dist = normalized_edit_distance(prediction, ground_truth)

    if is_exact:
        exact_matches += 1
    total_edit_dist += edit_dist

    results.append({
        "index": dataset_index,
        "ground_truth": ground_truth,
        "prediction": prediction,
        "exact_match": is_exact,
        "normalized_edit_distance": round(edit_dist, 4),
        "error": error,
    })

    status = "EXACT" if is_exact else f"edit_dist={edit_dist:.4f}"
    print(f"  [{i+1}/{total}] {status}")
    print(f"    GT:   {ground_truth[:80]}")
    print(f"    Pred: {prediction[:80]}")

    # Pace requests; no need to wait after the final one.
    if i < total - 1:
        time.sleep(DELAY_BETWEEN_CALLS)

In [None]:
# summary
# Average over the predictions actually recorded rather than the NUM_SAMPLES
# setting, so the metrics stay correct if the config changed after sampling or
# the evaluation cell stopped early. An empty run reports 0.0 instead of
# raising ZeroDivisionError.
num_evaluated = len(results)
accuracy = exact_matches / num_evaluated if num_evaluated else 0.0
avg_edit_dist = total_edit_dist / num_evaluated if num_evaluated else 0.0

print("=" * 60)
print("SUMMARY")
print("=" * 60)
print(f"Model:                    {MODEL}")
print(f"Samples:                  {num_evaluated}")
print(f"Exact match accuracy:     {accuracy:.2%} ({exact_matches}/{num_evaluated})")
print(f"Avg normalized edit dist: {avg_edit_dist:.4f}")

# save results
output = {
    "summary": {
        "model": MODEL,
        "num_samples": num_evaluated,
        "seed": SEED,
        "exact_match_accuracy": round(accuracy, 4),
        "avg_normalized_edit_distance": round(avg_edit_dist, 4),
    },
    "results": results,
}

# Single source of truth for the output location (used for writing and in the
# confirmation message below).
output_path = "gpt5_baseline_results.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(output, f, indent=2)

print(f"\nResults saved to {output_path}")