In [2]:
import torch
import numpy as np
import sys
import pickle
from pathlib import Path
from datasets import load_dataset

sys.path.insert(0, "models")

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
ARTIFACTS = Path("artifacts")

def normalized_edit_distance(s1, s2):
    if len(s1) == 0 and len(s2) == 0: return 0.0
    if len(s1) == 0 or  len(s2) == 0: return 1.0
    d = [[0] * (len(s2) + 1) for _ in range(len(s1) + 1)]
    for i in range(len(s1) + 1): d[i][0] = i
    for j in range(len(s2) + 1): d[0][j] = j
    for i in range(1, len(s1) + 1):
        for j in range(1, len(s2) + 1):
            cost = 0 if s1[i-1] == s2[j-1] else 1
            d[i][j] = min(d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1]+cost)
    return d[len(s1)][len(s2)] / max(len(s1), len(s2))

print(f"Device: {DEVICE}")

Device: cpu


In [3]:
N = 100
ds = load_dataset("deepcopy/MathWriting-human")
ds_test = ds["val"].select(range(N))
gt_latex = [s["latex"] for s in ds_test]
print(f"Loaded {N} samples from validation split")
print(f"Available splits: {list(ds.keys())}")

Loaded 100 samples from validation split
Available splits: ['train', 'test', 'val']


In [4]:
from PIL import Image

def resize_pad_grayscale(img_pil: Image.Image, target=256, pad_value=255):
    """
    Resize keeping aspect ratio so longest side == target,
    then pad to (target, target) with white background.
    """
    img = img_pil.convert("L")
    w, h = img.size

    scale = target / max(w, h)
    new_w = max(1, int(round(w * scale)))
    new_h = max(1, int(round(h * scale)))

    img_rs = img.resize((new_w, new_h), resample=Image.BICUBIC)

    canvas = Image.new("L", (target, target), color=pad_value)
    left = (target - new_w) // 2
    top  = (target - new_h) // 2
    canvas.paste(img_rs, (left, top))

    return canvas

## Eval 1 — DINOv2 with LoRa + LSTM (Bahdanau Attention)

In [None]:
from vit_lora_lstm_attn import ViTLatexModelLoRA as LSTMModel
from PIL import Image

# Tokenizer + special token indices
with open(ARTIFACTS / "lstm_tokenizer.pkl", "rb") as f:
    lstm_tok = pickle.load(f)
VOCAB_SIZE = 66



LSTM_START = VOCAB_SIZE - 2
LSTM_END = VOCAB_SIZE - 1
lstm_inv_vocab = {v: k for k, v in lstm_tok.word_index.items()}
print(f"vocab_size={VOCAB_SIZE}, START={LSTM_START}, END={LSTM_END}")

def lstm_decode(seq):
    return "".join(lstm_inv_vocab.get(t, "") for t in seq
                   if t not in (0, LSTM_START, LSTM_END))

# Load model
lstm_model = LSTMModel(vocab_size=VOCAB_SIZE, lora_r=16).to(DEVICE)
ckpt = torch.load(ARTIFACTS / "lstm(2).pt", map_location=DEVICE, weights_only=False)
lstm_model.load_state_dict(ckpt["model"])
lstm_model.eval()
print("LSTM model loaded.")

# Preprocess: RGB 224x224, /255, ImageNet norm (matches training)
MEAN = torch.tensor([0.485, 0.456, 0.406], device=DEVICE).view(1, 3, 1, 1)
STD  = torch.tensor([0.229, 0.224, 0.225], device=DEVICE).view(1, 3, 1, 1)

lstm_images = []
for s in ds_test:
    # Match training: grayscale and 256x256
    img = resize_pad_grayscale(img, target=256)
    img = np.array(img, dtype=np.float32) / 255.0 # normalize
    
    lstm_images.append(img)

# (N,256,256) -> torch uint8 (N,1,256,256)
lstm_images = np.array(lstm_images)
lstm_images = lstm_images[..., np.newaxis]
lstm_images = torch.tensor(lstm_images, dtype=torch.float32).permute(0, 3, 1, 2)

#lstm_images = torch.tensor(np.array(lstm_images), dtype=torch.float32).unsqueeze(1)
# repeat channels: (N,1,256,256) -> (N,3,256,256)
print(f"Preprocessed {N} images.")

vocab_size=66, START=64, END=65
LSTM model loaded.
Preprocessed 100 images.


In [6]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

latex_list = [s["latex"] for s in ds_test]

seqs = lstm_tok.texts_to_sequences(latex_list)
seqs = [[LSTM_START] + s + [LSTM_END] for s in seqs]

tokens_val = torch.tensor(
    pad_sequences(seqs, padding="post"),
    dtype=torch.long
)

In [None]:
exact = 0
total_ed = 0.0
print(f"Evaluating LSTM on {N} test samples...")
print("-" * 60)

print("START/END from tokenizer:", lstm_tok.word_index.get("<START>"), lstm_tok.word_index.get("<END>"))
print("START/END you're using:", LSTM_START, LSTM_END)

for i in range(N):
    img = lstm_images[i:i+1]
    img = img.repeat(1, 3, 1, 1).to(DEVICE)
    # img = (img - MEAN) / STD
    gt_token = tokens_val[i]
    pred_tokens  = lstm_model.generate_beam(img, max_len = 150, sos_idx=LSTM_START, eos_idx=LSTM_END, beam_size = 5)
    #pred_tokens = lstm_model.generate(img, max_len = 150, sos_idx=LSTM_START, eos_idx=LSTM_END)
    prediction   = lstm_decode(pred_tokens)
    ground_truth = lstm_decode(gt_token.tolist())

    is_exact  = prediction == ground_truth
    edit_dist = normalized_edit_distance(prediction, ground_truth)
    if is_exact: exact += 1
    total_ed += edit_dist

    status = "EXACT" if is_exact else f"edit_dist={edit_dist:.4f}"
    print(f"  [{i+1:2d}/{N}] {status}")
    print(f"    GT:   {ground_truth[:80]}")
    print(f"    PRED: {prediction[:80]}")
    print("-" * 40)

print("=" * 60)
print("LSTM results")
print(f"Exact match: {exact/N:.2%} ({exact}/{N})")
print(f"Avg normalized edit: {total_ed/N:.4f}")

Evaluating LSTM on 100 test samples...
------------------------------------------------------------
START/END from tokenizer: 64 65
START/END you're using: 64 65
  [ 1/100] EXACT
    GT:   \frac{\partial\psi}{\partial t}=p\psi
    PRED: \frac{\partial\psi}{\partial t}=p\psi
----------------------------------------
  [ 2/100] edit_dist=0.1765
    GT:   c_{0}=d/v_{blood}
    PRED: c_{0}=d/v_{6log}
----------------------------------------
  [ 3/100] EXACT
    GT:   e^{2}/\hbar c\approx1/137
    PRED: e^{2}/\hbar c\approx1/137
----------------------------------------
  [ 4/100] edit_dist=0.5000
    GT:   nassoc(s,\overline{s})
    PRED: nassoc(s,s)
----------------------------------------
  [ 5/100] edit_dist=0.4516
    GT:   eq.1\frac{dv}{dx}=w
    PRED: e_{q}\cdot1\frac{dv}{dx}=\omega
----------------------------------------
  [ 6/100] EXACT
    GT:   \underline{r}
    PRED: \underline{r}
----------------------------------------
  [ 7/100] EXACT
    GT:   \hat{k}_{g}
    PRED: \hat{k}_{g

## Eval 2 — DINOv2 + Transfomer

In [58]:
from vit_transformer_v2 import ViTLatexModelLoRA
import numpy as np

ARTIFACTS = Path("artifacts")
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 1. Load Vocab and Tokenizer
VOCAB_SIZE = 66

with open(ARTIFACTS / "lstm_tokenizer.pkl", "rb") as f:
    tokenizer = pickle.load(f)

inv_vocab = {v: k for k, v in tokenizer.word_index.items()}
START_TOKEN = VOCAB_SIZE - 2
END_TOKEN = VOCAB_SIZE - 1

def decode(seq):
    filtered = [t for t in seq if t not in (0, START_TOKEN, END_TOKEN)]
    return "".join(inv_vocab.get(t, "") for t in filtered)

# 2. Initialize and Load Model
model = ViTLatexModelLoRA(vocab_size=VOCAB_SIZE).to(DEVICE)
checkpoint = torch.load(ARTIFACTS / "transformer.pt", map_location=DEVICE, weights_only=False)
model.load_state_dict(checkpoint["model"])
model.eval()

print(f"Model loaded and running on {DEVICE}")

# 3. Evaluation Loop
exact_matches = 0
total_edit_dist = 0.0
N = len(ds_test)

print(f"Evaluating Transformer on {N} samples...")
print("-" * 60)

# Preprocessing constants (ImageNet Norm)
MEAN = torch.tensor([0.485, 0.456, 0.406], device=DEVICE).view(1, 3, 1, 1)
STD  = torch.tensor([0.229, 0.224, 0.225], device=DEVICE).view(1, 3, 1, 1)

for i in range(N):
    img = lstm_images[i:i+1]
    img = img.repeat(1, 3, 1, 1).to(DEVICE)
    
    with torch.no_grad():
        pred_tokens = model.generate(img, max_len=150, sos_idx=START_TOKEN, eos_idx=END_TOKEN)
    
    prediction = decode(pred_tokens)
    ground_truth = decode(tokens_val[i].tolist())
    
    is_exact = prediction == ground_truth
    edit_dist = normalized_edit_distance(prediction, ground_truth)
    
    if is_exact:
        exact_matches += 1
    total_edit_dist += edit_dist
    
    status = "EXACT" if is_exact else f"ED={edit_dist:.4f}"
    print(f"  [{i+1:2d}/{N}] {status}")
    print(f"    GT:   {ground_truth[:80]}")
    print(f"    PRED: {prediction[:80]}")
    print("-" * 40)

print("\n" + "=" * 60)
print("Transformer results")
print(f"Exact match: {exact_matches/N:.2%} ({exact_matches}/{N})")
print(f"Avg normalized edit: {total_edit_dist/N:.4f}")

Model loaded and running on cuda
Evaluating Transformer on 500 samples...
------------------------------------------------------------
  [ 1/500] EXACT
    GT:   \frac{\partial\psi}{\partial t}=p\psi
    PRED: \frac{\partial\psi}{\partial t}=p\psi
----------------------------------------
  [ 2/500] ED=0.3333
    GT:   c_{0}=d/v_{blood}
    PRED: c_{0}=d/v_{6106x})
----------------------------------------
  [ 3/500] EXACT
    GT:   e^{2}/\hbar c\approx1/137
    PRED: e^{2}/\hbar c\approx1/137
----------------------------------------
  [ 4/500] ED=0.5000
    GT:   nassoc(s,\overline{s})
    PRED: nassoc(s,s)
----------------------------------------
  [ 5/500] ED=0.4516
    GT:   eq.1\frac{dv}{dx}=w
    PRED: e_{q}\cdot1\frac{dv}{dx}=\omega
----------------------------------------
  [ 6/500] EXACT
    GT:   \underline{r}
    PRED: \underline{r}
----------------------------------------
  [ 7/500] EXACT
    GT:   \hat{k}_{g}
    PRED: \hat{k}_{g}
----------------------------------------
  [

## Eval 3 — GPT-5.2 Vision Baseline (Few-Shot)

In [59]:
import os, json, re, time, base64
from io import BytesIO
from PIL import Image, ImageOps, ImageFilter
from openai import OpenAI
from dotenv import load_dotenv

load_dotenv("../.env")
api_key = os.getenv("OPENAI_API_KEY")
assert api_key, "Set OPENAI_API_KEY in ../.env before running."
client = OpenAI(api_key=api_key)

MODEL = "gpt-5.2"
NUM_FEW_SHOT = 8
GPT_N = 500

# --- LaTeX normalization (applied to BOTH GT and prediction) ---
def normalize_latex(s):
    """Normalize LaTeX for fairer comparison."""
    s = s.strip().lower()
    s = re.sub(r'^\$+|\$+$', '', s)
    s = re.sub(r'\\begin\{equation\}|\\end\{equation\}', '', s)
    s = re.sub(r'\s+', ' ', s).strip()
    s = re.sub(r'\s*([{}()^_])\s*', r'\1', s)
    s = re.sub(r'(\\[a-zA-Z]+)\s+\{', r'\1{', s)
    s = s.replace('\\left', '').replace('\\right', '')
    s = re.sub(r'\\[,;:!]', '', s)
    return s

def preprocess_for_gpt(pil_image, target_size=512):
    """Clean up handwriting image for better GPT vision recognition."""
    img = pil_image.convert("L")
    arr = np.array(img)
    threshold = 200
    arr = np.where(arr < threshold, 0, 255).astype(np.uint8)
    img = Image.fromarray(arr)
    inv = ImageOps.invert(img)
    bbox = inv.getbbox()
    if bbox:
        padding = 20
        x0 = max(0, bbox[0] - padding)
        y0 = max(0, bbox[1] - padding)
        x1 = min(img.width, bbox[2] + padding)
        y1 = min(img.height, bbox[3] + padding)
        img = img.crop((x0, y0, x1, y1))
    w, h = img.size
    side = max(w, h)
    padded = Image.new("L", (side, side), 255)
    padded.paste(img, ((side - w) // 2, (side - h) // 2))
    padded = padded.resize((target_size, target_size), Image.LANCZOS)
    return padded.convert("RGB")

def encode_image(pil_image):
    buf = BytesIO()
    pil_image.save(buf, format="PNG")
    return base64.b64encode(buf.getvalue()).decode("utf-8")

# --- Pick diverse few-shot examples (spread across train set) ---
train_len = len(ds["train"])
few_shot_indices = [int(i * train_len / NUM_FEW_SHOT) for i in range(NUM_FEW_SHOT)]
few_shot_samples = [ds["train"][i] for i in few_shot_indices]
few_shot_b64 = [
    (encode_image(preprocess_for_gpt(s["image"])), normalize_latex(s["latex"]))
    for s in few_shot_samples
]

print("Few-shot examples:")
for idx, (_, latex) in zip(few_shot_indices, few_shot_b64):
    print(f"  [{idx}] {latex[:80]}")
print()

SYSTEM_PROMPT = (
    "You are an expert at reading handwritten mathematical equations and converting them to LaTeX. "
    "Return ONLY the raw LaTeX string, nothing else — no dollar signs, no code fences, no explanation. "
    "IMPORTANT RULES: "
    "1. Output everything in lowercase. "
    "2. Do NOT use \\left or \\right — just use bare ( ) [ ] delimiters. "
    "3. Do NOT use \\mathbb, \\operatorname, \\mathrm, \\text, or \\lvert/\\rvert. "
    "4. Do NOT add spacing commands like \\, \\; \\: \\! or \\quad. "
    "5. Use simple notation: x_1 not x_{1} when subscript is a single character. "
    "6. Use \\frac{a}{b} for fractions, not a/b or \\dfrac. "
    "7. Pay very careful attention to each character — distinguish similar-looking symbols "
    "(e.g., w vs \\omega, v vs \\nu, p vs \\rho, x vs \\times). "
    "8. If you see a plain letter, output the plain letter (w, not \\omega). "
    "9. Match the EXACT style of the examples provided."
)

def strip_code_fences(text):
    return re.sub(r"^```(?:latex)?\n?|\n?```$", "", text, flags=re.MULTILINE).strip()

def predict_latex(pil_image):
    processed = preprocess_for_gpt(pil_image)
    messages = []
    for b64, latex in few_shot_b64:
        messages.append({
            "role": "user",
            "content": [
                {"type": "input_text", "text": "Convert this handwritten equation to LaTeX:"},
                {"type": "input_image", "image_url": f"data:image/png;base64,{b64}"},
            ],
        })
        messages.append({"role": "assistant", "content": latex})
    b64_test = encode_image(processed)
    messages.append({
        "role": "user",
        "content": [
            {"type": "input_text", "text": "Convert this handwritten equation to LaTeX:"},
            {"type": "input_image", "image_url": f"data:image/png;base64,{b64_test}"},
        ],
    })
    response = client.responses.create(
        model=MODEL,
        instructions=SYSTEM_PROMPT,
        input=messages,
        temperature=0,
    )
    return strip_code_fences(response.output_text.strip())

# --- Use tokenizer round-trip for GT (same as LSTM/Transformer evals) ---
gpt_gt_normalized = []
for i in range(GPT_N):
    gt_roundtrip = lstm_decode(tokens_val[i].tolist())
    gpt_gt_normalized.append(normalize_latex(gt_roundtrip))

gpt_exact = 0
gpt_edit = 0.0

print(f"Evaluating {MODEL} ({NUM_FEW_SHOT}-shot) on {GPT_N} val samples...")
print(f"Using LaTeX normalization + tokenizer GT round-trip")
print("-" * 60)

for i in range(GPT_N):
    ground_truth = gpt_gt_normalized[i]
    try:
        raw_pred = predict_latex(ds_test[i]["image"])
        prediction = normalize_latex(raw_pred)
    except Exception as e:
        print(f"  [{i+1}/{GPT_N}] ERROR: {e}")
        prediction = ""

    is_exact = prediction == ground_truth
    edit_dist = normalized_edit_distance(prediction, ground_truth)
    if is_exact: gpt_exact += 1
    gpt_edit += edit_dist

    status = "EXACT" if is_exact else f"edit_dist={edit_dist:.4f}"
    print(f"  [{i+1:2d}/{GPT_N}] {status}")
    print(f"    GT:   {ground_truth[:80]}")
    print(f"    PRED: {prediction[:80]}")
    print("-" * 40)

    if i < GPT_N - 1:
        time.sleep(0.5)

print("=" * 60)
print(f"{MODEL} results ({NUM_FEW_SHOT}-shot, normalized)")
print(f"Exact match: {gpt_exact/GPT_N:.2%} ({gpt_exact}/{GPT_N})")
print(f"Avg normalized edit: {gpt_edit/GPT_N:.4f}")

Few-shot examples:
  [0] v(\tilde{\beta})
  [28733] \int vds
  [57466] d=\frac{a\pm\sqrt{a^{2}-b^{2}c}}{2}
  [86199] q=\int_{v}j^{0}dv
  [114932] \frac{dy_{1}}{dt}=-ay_{1}
  [143665] \frac{\partial s^{*}(p)}{\partial p}
  [172398] \frac{v_{e}}{c}=0,00009935
  [201131] \kappa(a,z):=\overline{f_{a}(z)}

Evaluating gpt-5.2 (8-shot) on 500 val samples...
Using LaTeX normalization + tokenizer GT round-trip
------------------------------------------------------------
  [ 1/500] EXACT
    GT:   \frac{\partial\psi}{\partial t}=p\psi
    PRED: \frac{\partial\psi}{\partial t}=p\psi
----------------------------------------
  [ 2/500] edit_dist=0.4706
    GT:   c_{0}=d/v_{blood}
    PRED: c_{0}=d/\sqrt{6}
----------------------------------------
  [ 3/500] edit_dist=0.4390
    GT:   e^{2}/\hbar c\approx1/137
    PRED: \frac{e^{2}}{\hbar c}\approx\frac{1}{137}
----------------------------------------
  [ 4/500] edit_dist=0.5455
    GT:   nassoc(s,\overline{s})
    PRED: massoc(s,s)
----------------