In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import sys
import pickle
from pathlib import Path
from datasets import load_dataset
from transformers import AutoModel, ViTModel
from peft import LoraConfig, get_peft_model

# Make the project-local `models/` directory importable. The guard keeps the
# cell idempotent: re-running it no longer stacks duplicate sys.path entries.
if "models" not in sys.path:
    sys.path.insert(0, "models")

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE    = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# All paths below are relative, so they resolve against the notebook's
# working directory.
ARTIFACTS = Path("artifacts")             # model checkpoints are loaded from here
CACHE     = Path("../cache")              # not referenced by the visible cells
BACKEND   = Path("../backend/artifacts")  # tokenizer pickle and vocab_size.txt

def normalized_edit_distance(s1, s2):
    """Return the Levenshtein distance between ``s1`` and ``s2`` scaled to [0, 1].

    Insertions, deletions and substitutions each cost 1. The raw distance is
    divided by the length of the longer input.

    Args:
        s1: First sequence (anything supporting ``len`` and indexing).
        s2: Second sequence of the same kind.

    Returns:
        ``0.0`` when both inputs are empty, ``1.0`` when exactly one is empty,
        otherwise ``distance / max(len(s1), len(s2))`` as a float.
    """
    len_a, len_b = len(s1), len(s2)
    if len_a == 0 and len_b == 0:
        return 0.0
    if len_a == 0 or len_b == 0:
        return 1.0

    # Rolling-row DP: `previous[j]` is the distance between the first i-1
    # items of s1 and the first j items of s2.
    previous = list(range(len_b + 1))
    for i in range(1, len_a + 1):
        current = [i]
        left_item = s1[i - 1]
        for j in range(1, len_b + 1):
            substitution = 0 if left_item == s2[j - 1] else 1
            current.append(
                min(
                    previous[j] + 1,                 # delete from s1
                    current[j - 1] + 1,              # insert into s1
                    previous[j - 1] + substitution,  # match / substitute
                )
            )
        previous = current
    return previous[len_b] / max(len_a, len_b)

# Record the selected device in the cell output so the run is self-describing.
print(f"Device: {DEVICE}")

Device: cuda


In [93]:
# Number of samples to evaluate. NOTE: the subset is named `ds_test` but is
# drawn from the "val" split.
N = 100
ds = load_dataset("deepcopy/MathWriting-human")
# `select` rejects out-of-range indices, so never request more rows than the
# split actually holds.
N = min(N, len(ds["val"]))
ds_test = ds["val"].select(range(N))
# Column access reads only the "latex" field; iterating row by row would also
# decode every sample's image just to throw it away.
gt_latex = list(ds_test["latex"])
print(f"Loaded {N} samples from validation split")
print(f"Available splits: {list(ds.keys())}")

Loaded 100 samples from validation split
Available splits: ['train', 'test', 'val']


## Eval 1 — DINOv2 with LoRA + LSTM (Bahdanau Attention)

In [94]:
from vit_lora_lstm_attn import ViTLatexModelLoRA as LSTMModel

# Tokenizer and special-token ids (meant to mirror backend/app.py).
# pickle.load runs whatever code the file encodes — only load trusted artifacts.
with open(BACKEND / "latex_tokenizer256.pkl", "rb") as tok_fh:
    lstm_tok = pickle.load(tok_fh)
with open(BACKEND / "vocab_size.txt") as size_fh:
    VOCAB_SIZE = int(size_fh.read().strip())

# The two highest ids are used as the end / start-of-sequence markers.
LSTM_END   = VOCAB_SIZE - 1
LSTM_START = VOCAB_SIZE - 2
# Reverse lookup (token id -> token string) used when decoding predictions.
lstm_inv_vocab = {idx: tok for tok, idx in lstm_tok.word_index.items()}
print(f"vocab_size={VOCAB_SIZE}, START={LSTM_START}, END={LSTM_END}")

def lstm_decode(seq):
    """Convert an iterable of token ids into the decoded string.

    Ids equal to 0, ``LSTM_START`` or ``LSTM_END`` are skipped (0 is
    presumably the padding id — confirm against the tokenizer). Ids missing
    from ``lstm_inv_vocab`` contribute an empty string.
    """
    skipped_ids = (0, LSTM_START, LSTM_END)
    pieces = []
    for token_id in seq:
        if token_id in skipped_ids:
            continue
        pieces.append(lstm_inv_vocab.get(token_id, ""))
    return "".join(pieces)

# Load model: build the architecture on DEVICE, then restore the weights
# stored under the checkpoint's "model" key.
# NOTE(review): lora_r=16 presumably has to match the rank used in training — confirm.
lstm_model = LSTMModel(vocab_size=VOCAB_SIZE, lora_r=16).to(DEVICE)
# SECURITY: weights_only=False lets torch.load unpickle arbitrary objects, which
# can execute code. Only do this with checkpoints you trust.
ckpt = torch.load(ARTIFACTS / "dinov2_attn_lora256.pt", map_location=DEVICE, weights_only=False)
lstm_model.load_state_dict(ckpt["model"])
# Switch to evaluation mode before running inference.
lstm_model.eval()
print("LSTM model loaded.")

# Preprocessing: RGB, 224x224, scaled by 1/255 and ImageNet-normalised
# (the scaling and normalisation happen later, per image, in the eval loop).
# The statistics are shaped (1, 3, 1, 1) so they broadcast over NCHW batches.
MEAN = torch.tensor([0.485, 0.456, 0.406], device=DEVICE).view(1, 3, 1, 1)
STD  = torch.tensor([0.229, 0.224, 0.225], device=DEVICE).view(1, 3, 1, 1)

# One uint8 HxWxC array per sample.
lstm_images = [
    np.array(sample["image"].convert("RGB").resize((224, 224)), dtype=np.uint8)
    for sample in ds_test
]
# Stack to (N, 224, 224, 3), then move channels first: uint8 (N, 3, 224, 224).
lstm_images = torch.from_numpy(np.array(lstm_images)).permute(0, 3, 1, 2)
print(f"Preprocessed {N} images.")

vocab_size=66, START=64, END=65
LSTM model loaded.
Preprocessed 100 images.


In [95]:
# Score the LSTM model on the first N samples: exact string match plus
# length-normalised edit distance against the reference LaTeX.
# NOTE(review): the banner says "test" but ds_test is selected from the "val" split.
# NOTE(review): in the recorded run every prediction is lower-case while the
# references are mixed-case, so case alone defeats exact match — confirm
# whether the tokenizer lower-cases its input.
exact = 0
total_ed = 0.0
print(f"Evaluating LSTM on {N} test samples...")
print("-" * 60)

# Inference only: without no_grad() autograd may record every decoding step,
# which costs memory for no benefit.
with torch.no_grad():
    for i in range(N):
        # uint8 [0, 255] -> float [0, 1] -> ImageNet-normalised; batch of one image.
        img = lstm_images[i:i+1].to(DEVICE, dtype=torch.float32) / 255.0
        img = (img - MEAN) / STD
        pred_tokens  = lstm_model.generate(img, max_len=150, sos_idx=LSTM_START, eos_idx=LSTM_END)
        prediction   = lstm_decode(pred_tokens)
        ground_truth = gt_latex[i]

        is_exact  = prediction == ground_truth
        edit_dist = normalized_edit_distance(prediction, ground_truth)
        if is_exact: exact += 1
        total_ed += edit_dist

        status = "EXACT" if is_exact else f"edit_dist={edit_dist:.4f}"
        print(f"  [{i+1:2d}/{N}] {status}")
        # Only the first 80 characters are shown to keep the log readable.
        print(f"    GT:   {ground_truth[:80]}")
        print(f"    PRED: {prediction[:80]}")
        print("-" * 40)

print("=" * 60)
# Guard the averages so an empty evaluation (N == 0) prints zeros instead of
# raising ZeroDivisionError.
denom = max(N, 1)
print(f"[LSTM] Exact match:         {exact/denom:.2%} ({exact}/{N})")
print(f"[LSTM] Avg normalized edit: {total_ed/denom:.4f}")

Evaluating LSTM on 100 test samples...
------------------------------------------------------------
  [ 1/100] edit_dist=0.0270
    GT:   \frac{\partial\psi}{\partial t}=P\psi
    PRED: \frac{\partial\psi}{\partial t}=p\psi
----------------------------------------
  [ 2/100] edit_dist=0.4706
    GT:   C_{0}=D/V_{blood}
    PRED: c_{0}=d/v_{66662}
----------------------------------------
  [ 3/100] edit_dist=0.5333
    GT:   e^{2}/\hbar c\approx1/137
    PRED: e^{2}/\hbar \tilde{\sigma}|/33
----------------------------------------
  [ 4/100] edit_dist=0.7727
    GT:   nassoc(S,\overline{S})
    PRED: na9500c(s,s)
----------------------------------------
  [ 5/100] edit_dist=0.5161
    GT:   Eq.1\frac{dV}{dx}=w
    PRED: e_{q}\cdot1\frac{dv}{dx}=\omega
----------------------------------------
  [ 6/100] EXACT
    GT:   \underline{r}
    PRED: \underline{r}
----------------------------------------
  [ 7/100] edit_dist=0.1818
    GT:   \hat{K}_{G}
    PRED: \hat{k}_{g}
--------------------

## Eval 2 — DINOv2 + Transformer

In [None]:
# Setup paths relative to notebooks/
# NOTE(review): these rebind the notebook-wide ARTIFACTS / BACKEND / DEVICE, so
# re-running the Eval 1 cells afterwards will see this BACKEND value.
# NOTE(review): the recorded traceback shows latex_tokenizer.pkl missing from
# ..\backend\artifacts — confirm it really lives under the directory below.
ARTIFACTS = Path("artifacts")
BACKEND = Path("../notebooks/artifacts")
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 1. Load Vocab and Tokenizer
# The size file must be opened read-only: mode "w" truncates the artifact and
# never reads it. If the file is missing or empty (an earlier run of this cell
# in write mode would have emptied it), fall back to the size this cell
# originally hard-coded.
vocab_path = BACKEND / "vocab_size.txt"
try:
    with open(vocab_path) as f:
        VOCAB_SIZE = int(f.read().strip())
except (FileNotFoundError, ValueError):
    VOCAB_SIZE = 66
    print(f"WARNING: could not read {vocab_path}; falling back to VOCAB_SIZE={VOCAB_SIZE}")

# pickle.load runs whatever code the file encodes — only load trusted artifacts.
with open(BACKEND / "latex_tokenizer.pkl", "rb") as f:
    tokenizer = pickle.load(f)

# Reverse lookup (token id -> token string) used when decoding predictions.
inv_vocab = {v: k for k, v in tokenizer.word_index.items()}
# The two highest ids are used as the start / end-of-sequence markers.
START_TOKEN = VOCAB_SIZE - 2
END_TOKEN = VOCAB_SIZE - 1

def decode(seq):
    """Convert an iterable of token ids into the decoded string.

    Ids equal to 0, ``START_TOKEN`` or ``END_TOKEN`` are dropped; ids missing
    from ``inv_vocab`` contribute an empty string.
    """
    return "".join(
        inv_vocab.get(token_id, "")
        for token_id in seq
        if token_id not in (0, START_TOKEN, END_TOKEN)
    )

# 2. Initialize and Load Model
# NOTE(review): no visible cell binds `ViTLatexModelLoRA` under this name
# (Eval 1 imports it only as `LSTMModel`), so this line raises NameError on a
# fresh kernel. Import the intended Transformer model class before running.
model = ViTLatexModelLoRA(vocab_size=VOCAB_SIZE).to(DEVICE)
# weights_only is passed explicitly, as in the Eval 1 load: torch.load's default
# differs between PyTorch versions, so leaving it implicit makes this cell
# version-dependent.
# SECURITY: weights_only=False unpickles arbitrary objects and can execute
# code. Only do this with checkpoints you trust.
checkpoint = torch.load(ARTIFACTS / "dinov2_attn_lora.pt", map_location=DEVICE, weights_only=False)
model.load_state_dict(checkpoint["model"])
# Switch to evaluation mode before running inference.
model.eval()

print(f"Model loaded and running on {DEVICE}")

# 3. Evaluation Loop
# Scores each sample by exact string match and length-normalised edit distance.
exact_matches = 0
total_edit_dist = 0.0
N = len(ds_test)

print(f"Evaluating Transformer on {N} samples...")
print("-" * 60)

# Preprocessing constants (ImageNet Norm), shaped (1, 3, 1, 1) to broadcast
# over NCHW batches.
MEAN = torch.tensor([0.485, 0.456, 0.406], device=DEVICE).view(1, 3, 1, 1)
STD  = torch.tensor([0.229, 0.224, 0.225], device=DEVICE).view(1, 3, 1, 1)

# Inference only: without no_grad() autograd may record every decoding step,
# which costs memory for no benefit.
with torch.no_grad():
    for i in range(N):
        # NOTE(review): `tr_images` is not defined by any visible cell, so this
        # raises NameError on a fresh kernel. It is expected to hold uint8 image
        # tensors index-aligned with gt_latex — define it before this loop.
        img = tr_images[i:i+1].to(DEVICE, dtype=torch.float32) / 255.0
        img = (img - MEAN) / STD

        # Generate
        pred_tokens = model.generate(img, max_len=150, sos_idx=START_TOKEN, eos_idx=END_TOKEN)

        prediction = decode(pred_tokens)
        ground_truth = gt_latex[i]

        is_exact = prediction == ground_truth
        edit_dist = normalized_edit_distance(prediction, ground_truth)

        if is_exact:
            exact_matches += 1
        total_edit_dist += edit_dist

        status = "EXACT" if is_exact else f"ED={edit_dist:.4f}"
        print(f"  [{i+1:2d}/{N}] {status}")
        # Only the first 80 characters are shown to keep the log readable.
        print(f"    GT:   {ground_truth[:80]}")
        print(f"    PRED: {prediction[:80]}")
        print("-" * 40)

print("\n" + "=" * 60)
# Guard the averages so an empty evaluation (N == 0) prints zeros instead of
# raising ZeroDivisionError.
denom = max(N, 1)
print(f"FINAL ACCURACY: {exact_matches/denom:.2%} ({exact_matches}/{N})")
print(f"AVG EDIT DIST:  {total_edit_dist/denom:.4f}")

FileNotFoundError: [Errno 2] No such file or directory: '..\\backend\\artifacts\\latex_tokenizer.pkl'