# ViT Encoder + LSTM Decoder Training
Frozen `facebook/dinov2-base` encoder with a trainable LSTM decoder for handwritten math → LaTeX.

In [1]:
!pip install -q transformers datasets
!pip -q install peft accelerate

In [2]:
import numpy as np
import pickle
import random
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from torchvision import transforms
from transformers import ViTModel
from pathlib import Path
from datasets import load_from_disk, load_dataset
from peft import LoraConfig, get_peft_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")

2026-02-18 20:50:52.935911: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1771447853.254386      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1771447853.342348      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1771447854.133412      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1771447854.133476      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1771447854.133479      55 computation_placer.cc:177] computation placer alr

Using device: cuda


## 1. Load & preprocess dataset

In [None]:
for f in sorted(Path("/kaggle/input/datasets").rglob("*")):
    print(f)

In [3]:
print("Loading dataset...")
ds = load_dataset("deepcopy/MathWriting-human")
num_samples = 40000
ds_train = ds["train"].select(range(num_samples))

# 2. Pre-allocate Image Array (Saves RAM by avoiding copies)
# 50,000 * 256 * 256 * 4 bytes = ~13.1 GB
print(f"Pre-allocating memory for {num_samples} images...")
images_array = np.zeros((num_samples, 256, 256), dtype=np.uint8) # Uses uint8 instead of float to save RAM
latex_strings = []

# 3. Process Images & Collect Strings
print("Processing images and LaTeX strings...")
for i in range(num_samples):
    sample = ds_train[i]
    # Convert and resize directly into the array
    img = sample["image"].convert("L").resize((256, 256))
    images_array[i] = np.array(img, dtype=np.uint8)
    latex_strings.append(sample["latex"])
    
    if (i + 1) % 5000 == 0:
        print(f"Progress: {i + 1}/{num_samples}")

# 4. Setup Tokenizer
print("Fitting tokenizer...")
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(latex_strings)

# Add special tokens
tokenizer.word_index["<START>"] = len(tokenizer.word_index) + 1
tokenizer.word_index["<END>"] = len(tokenizer.word_index) + 1
tokenizer.index_word[tokenizer.word_index["<START>"]] = "<START>"
tokenizer.index_word[tokenizer.word_index["<END>"]] = "<END>"

START_ID = tokenizer.word_index["<START>"]
END_ID   = tokenizer.word_index["<END>"]

# 5. Sequence Padding
print("Tokenizing and padding sequences...")
sequences = tokenizer.texts_to_sequences(latex_strings)
sequences = [[START_ID] + seq + [END_ID] for seq in sequences]
padded_sequences = pad_sequences(sequences, padding="post")

# 6. Save Tokenizer and Vocab Info
print("Saving metadata...")
with open("/kaggle/working/latex_tokenizer256.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

vocab_size = len(tokenizer.word_index) + 1
with open("/kaggle/working/vocab_size.txt", "w") as f:
    f.write(str(vocab_size))

# 7. Convert to Tensors and Save (Disk usage check: ~13.5GB total)
print("Converting to Tensors...")
# torch.from_numpy avoids a RAM copy
images_tensor = torch.from_numpy(images_array).unsqueeze(1) 
tokens_tensor = torch.tensor(padded_sequences, dtype=torch.long)

print("Saving tensors to disk (this takes a minute)...")
torch.save(images_tensor, "/kaggle/working/images_train256.pt")
torch.save(tokens_tensor, "/kaggle/working/tokens_train256.pt")

print("Done!")
print(f"Final Vocab Size: {vocab_size}")
print(f"Image Tensor Shape: {images_tensor.shape}")




Loading dataset...


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00003-ab0ae6b9fa4a3f(…):   0%|          | 0.00/373M [00:00<?, ?B/s]

data/train-00001-of-00003-589d2b65116e09(…):   0%|          | 0.00/374M [00:00<?, ?B/s]

data/train-00002-of-00003-42472859069c07(…):   0%|          | 0.00/373M [00:00<?, ?B/s]

data/test-00000-of-00001-694f317d8b63419(…):   0%|          | 0.00/44.9M [00:00<?, ?B/s]

data/val-00000-of-00001-184984e66f80ed7a(…):   0%|          | 0.00/81.6M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/229864 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7644 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/15674 [00:00<?, ? examples/s]

Pre-allocating memory for 40000 images...
Processing images and LaTeX strings...
Progress: 5000/40000
Progress: 10000/40000
Progress: 15000/40000
Progress: 20000/40000
Progress: 25000/40000
Progress: 30000/40000
Progress: 35000/40000
Progress: 40000/40000
Fitting tokenizer...
Tokenizing and padding sequences...
Saving metadata...
Converting to Tensors...
Saving tensors to disk (this takes a minute)...
Done!
Final Vocab Size: 66
Image Tensor Shape: torch.Size([40000, 1, 256, 256])


In [4]:
ds = load_dataset("deepcopy/MathWriting-human")

ds_val = ds["val"].select(range(5000))

images, sequences = [], []

def preprocess_image(img, target_size=(256, 256)):
    img = img.convert("L")  # convert to grayscale
    img = img.resize(target_size)
    img = np.array(img) / 255.0  # normalize to [0, 1]
    return img

for sample in ds_val:
    img = preprocess_image(sample["image"])
    images.append(img)
    sequences.append(sample["latex"])

images = np.array(images)
with open("/kaggle/working/latex_tokenizer256.pkl", "rb") as f:
    tokenizer = pickle.load(f)

START_ID = tokenizer.word_index["<START>"]
END_ID   = tokenizer.word_index["<END>"]

seqs = tokenizer.texts_to_sequences(sequences)
seqs = [[START_ID] + s + [END_ID] for s in seqs]

padded_sequences = pad_sequences(seqs, padding="post")
images = images[..., np.newaxis]

images_tensor = torch.tensor(images, dtype=torch.float32).permute(0, 3, 1, 2)
tokens_tensor = torch.tensor(padded_sequences, dtype=torch.long)

torch.save(images_tensor, "/kaggle/working/images_val256.pt")
torch.save(tokens_tensor, "/kaggle/working/tokens_val256.pt")
print(1)

1


## 2. Model definition

In [None]:
class Decoder(nn.Module):
    def __init__(self, vocab_size, embed_dim=256, hidden_dim=512, encoder_dim=768):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)
        self.enc_to_h = nn.Linear(encoder_dim, hidden_dim)

    def forward(self, x, encoder_features=None, hidden_state=None):
        x = self.embedding(x)
        if hidden_state is None:
            if encoder_features is not None:
                h0 = torch.tanh(self.enc_to_h(encoder_features)).unsqueeze(0)
                c0 = torch.zeros_like(h0)
                output, hidden = self.lstm(x, (h0, c0))
            else:
                output, hidden = self.lstm(x)
        else:
            output, hidden = self.lstm(x, hidden_state)
        logits = self.fc(output)
        return logits, hidden

In [5]:
class BahdanauAttention(nn.Module):
    """
    Additive attention:
      score(h, e_i) = v^T tanh(W_h h + W_e e_i)
    """
    def __init__(self, hidden_dim, encoder_dim, attn_dim=None):
        super().__init__()
        if attn_dim is None:
            attn_dim = hidden_dim
        self.W_h = nn.Linear(hidden_dim, attn_dim, bias=False)
        self.W_e = nn.Linear(encoder_dim, attn_dim, bias=False)
        self.v   = nn.Linear(attn_dim, 1, bias=False)

    def forward(self, h, enc_mem, enc_mask=None):
        """
        h:        (B, H)
        enc_mem:  (B, N, D)
        enc_mask: (B, N) with 1 for valid, 0 for pad (optional)

        returns:
          context: (B, D)
          alpha:   (B, N)
        """
        # (B, 1, A) + (B, N, A) -> (B, N, A)
        energy = torch.tanh(self.W_h(h).unsqueeze(1) + self.W_e(enc_mem))
        scores = self.v(energy).squeeze(-1)  # (B, N)

        if enc_mask is not None:
            scores = scores.masked_fill(enc_mask == 0, -1e9)

        alpha = F.softmax(scores, dim=-1)    # (B, N)
        context = torch.bmm(alpha.unsqueeze(1), enc_mem).squeeze(1)  # (B, D)
        return context, alpha


class AttnDecoder(nn.Module):
    def __init__(self, vocab_size, embed_dim=256, hidden_dim=512, encoder_dim=768, dropout=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.attn = BahdanauAttention(hidden_dim, encoder_dim, attn_dim=hidden_dim)

        # Step-wise LSTM so we can inject attention context every timestep
        self.lstm_cell = nn.LSTMCell(embed_dim + encoder_dim, hidden_dim)

        self.fc = nn.Linear(hidden_dim, vocab_size)
        self.dropout = nn.Dropout(dropout)

        # Optional: init decoder hidden state from encoder (pooled)
        self.enc_to_h = nn.Linear(encoder_dim, hidden_dim)
        self.enc_to_c = nn.Linear(encoder_dim, hidden_dim)

    def init_hidden(self, enc_mem):
        """
        enc_mem: (B, N, D)
        """
        pooled = enc_mem.mean(dim=1)  # (B, D)
        h0 = torch.tanh(self.enc_to_h(pooled))
        c0 = torch.tanh(self.enc_to_c(pooled))
        return (h0, c0)

    def forward(self, x, enc_mem=None, hidden_state=None, enc_mask=None):
        """
        x:       (B, T) token ids (teacher forcing input sequence)
        enc_mem: (B, N, D) ViT patch tokens (memory)
        hidden_state: (h, c) each (B, H) or None

        returns:
          logits: (B, T, vocab)
          hidden: (h, c)
        """
        B, T = x.shape
        emb = self.dropout(self.embedding(x))  # (B, T, E)

        if hidden_state is None:
            if enc_mem is None:
                # fallback: start from zeros
                h = torch.zeros(B, self.lstm_cell.hidden_size, device=x.device)
                c = torch.zeros_like(h)
            else:
                h, c = self.init_hidden(enc_mem)
        else:
            h, c = hidden_state

        logits_steps = []

        for t in range(T):
            if enc_mem is not None:
                context, _ = self.attn(h, enc_mem, enc_mask=enc_mask)  # (B, D)
            else:
                # no encoder: zero context
                context = torch.zeros(B, self.enc_to_h.in_features, device=x.device)

            lstm_in = torch.cat([emb[:, t, :], context], dim=-1)  # (B, E+D)
            h, c = self.lstm_cell(lstm_in, (h, c))
            logits_steps.append(self.fc(h).unsqueeze(1))  # (B, 1, vocab)

        logits = torch.cat(logits_steps, dim=1)  # (B, T, vocab)
        return logits, (h, c)


In [None]:
class ViTLatexModel(nn.Module):
    def __init__(self, vocab_size, embed_dim=256, hidden_dim=512):
        super().__init__()
        from transformers import AutoModel
        self.encoder = AutoModel.from_pretrained("facebook/dinov2-base")
        print(self.encoder.config.hidden_size)
        for param in self.encoder.parameters():
            param.requires_grad = False
        encoder_dim = self.encoder.config.hidden_size  # 768
        self.decoder = AttnDecoder(vocab_size, embed_dim, hidden_dim, encoder_dim)
        #self.decoder = Decoder(vocab_size, embed_dim, hidden_dim, encoder_dim)
    
    def forward(self, images, targets):
        enc = self.encoder(images).last_hidden_state  # (B, 1+N, D)
        enc_mem = enc[:, 1:, :]                       # (B, N, D)  <-- ATTENTION MEMORY

        logits, _ = self.decoder(targets, enc_mem=enc_mem)
        return logits

    
    @torch.no_grad()
    def generate(self, image, max_len=100, sos_idx=1, eos_idx=2):
        self.eval()
        enc = self.encoder(image).last_hidden_state    # (B, 1+N, D)
        enc_mem = enc[:, 1:, :]                         # (B, N, D)

        token = torch.tensor([[sos_idx]], device=image.device)
        output_tokens = []
        hidden = None

        for _ in range(max_len):
            logits, hidden = self.decoder(token, enc_mem=enc_mem, hidden_state=hidden)
            next_token = logits[:, -1, :].argmax(dim=-1, keepdim=True)

            if next_token.item() == eos_idx:
                break

            output_tokens.append(next_token.item())
            token = next_token

        return output_tokens
        
        # self.eval()
        # encoder_out = self.encoder(image).last_hidden_state[:, 0, :]
        # token = torch.tensor([[sos_idx]], device=image.device)
        # output_tokens = []
        # hidden = None
        # for i in range(max_len):
        #     if i == 0:
        #         logits, hidden = self.decoder(token, encoder_features=encoder_out)
        #     else:
        #         logits, hidden = self.decoder(token, hidden_state=hidden)
        #     next_token = logits[:, -1, :].argmax(dim=-1, keepdim=True)
        #     if next_token.item() == eos_idx:
        #         break
        #     output_tokens.append(next_token.item())
        #     token = next_token
        # return output_tokens

In [None]:
def list_linear_suffixes(model):
    suffixes = set()
    examples = []
    for name, mod in model.named_modules():
        if isinstance(mod, nn.Linear):
            suf = name.split(".")[-1]
            suffixes.add(suf)
            if len(examples) < 30:
                examples.append(name)
    print("Unique Linear suffixes:", sorted(suffixes))
    print("\nExample Linear module names:")
    for e in examples:
        print("  ", e)

In [6]:
class ViTLatexModelLoRA(nn.Module):
    def __init__(self, vocab_size, embed_dim=256, hidden_dim=512, lora_r=16, lora_alpha=32, lora_dropout=0.05):
        super().__init__()
        from transformers import AutoModel
        self.encoder = AutoModel.from_pretrained("facebook/dinov2-base")

        # IMPORTANT: do NOT freeze everything after LoRA; LoRA will keep base frozen + adapters trainable.
        # (Freezing before LoRA is fine, but not necessary.)

        # Choose target modules after you inspect names.
        # Start with common ViT block names:
        target_modules = ["query", "key", "value", "dense", "fc1", "fc2"]
        
        lora_cfg = LoraConfig(
            r=lora_r,
            lora_alpha=lora_alpha,
            lora_dropout=lora_dropout,
            bias="none",
            target_modules=target_modules,
            task_type="FEATURE_EXTRACTION",  # safe default for encoder-only usage
        )
        self.encoder = get_peft_model(self.encoder, lora_cfg)

        encoder_dim = self.encoder.config.hidden_size  # 768
        self.decoder = AttnDecoder(vocab_size, embed_dim, hidden_dim, encoder_dim)

    def forward(self, images, input_tokens):
        enc = self.encoder(pixel_values=images).last_hidden_state
        enc_mem = enc[:, 1:, :]  # (B, N, D)
        logits, _ = self.decoder(input_tokens, enc_mem=enc_mem)
        return logits

    @torch.no_grad()
    def generate(self, image, max_len=150, sos_idx=1, eos_idx=2):
        self.eval()

        # image: (1, 3, H, W) already normalized in your eval loop
        enc = self.encoder(pixel_values=image).last_hidden_state
        enc_mem = enc[:, 1:, :]  # (1, N, D)

        token = torch.tensor([[sos_idx]], device=image.device)
        output_tokens = []
        hidden = None

        for _ in range(max_len):
            logits, hidden = self.decoder(token, enc_mem=enc_mem, hidden_state=hidden)
            next_token = logits[:, -1, :].argmax(dim=-1, keepdim=True)  # (1,1)

            if next_token.item() == eos_idx:
                break

            output_tokens.append(next_token.item())
            token = next_token

        return output_tokens

## 3. Training

In [None]:
# Load vocab size
with open("/kaggle/working/vocab_size.txt") as f:
    VOCAB_SIZE = int(f.read().strip())

# Hyperparameters
BATCH_SIZE = 16
EPOCHS = 10
LEARNING_RATE = 1e-3

# Load pre-processed tensors (just load, don't convert yet)
images_tensor = torch.load("/kaggle/working/images_train256.pt")  # (40000, 1, 256, 256)
tokens_tensor = torch.load("/kaggle/working/tokens_train256.pt")  # (40000, seq_len)

print(f"Images: {images_tensor.shape}, Tokens: {tokens_tensor.shape}, Vocab size: {VOCAB_SIZE}")

# Create dataset and loader
dataset = TensorDataset(images_tensor, tokens_tensor)
loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

# Initialize model
model = ViTLatexModelLoRA(vocab_size=VOCAB_SIZE).to(DEVICE)

trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"Trainable params: {trainable:,} / {total:,}")

criterion = nn.CrossEntropyLoss(ignore_index=0)
#optimizer = torch.optim.Adam(model.decoder.parameters(), lr=LEARNING_RATE)

optimizer = torch.optim.AdamW(
    [
        {"params": model.decoder.parameters(), "lr": 1e-3},
        {"params": model.encoder.parameters(), "lr": 1e-4},  # LoRA adapters
    ],
    weight_decay=0.01
)

mean = torch.tensor([0.485, 0.456, 0.406], device=DEVICE).view(1,3,1,1)
std  = torch.tensor([0.229, 0.224, 0.225], device=DEVICE).view(1,3,1,1)

# Training loop
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    
    for batch_idx, (imgs, seqs) in enumerate(loader):
        # Convert grayscale to RGB
        imgs = imgs.float() / 255.0
        imgs = imgs.repeat(1, 3, 1, 1).to(DEVICE)  # (B, 1, 256, 256) -> (B, 3, 256, 256)
        imgs = (imgs - mean) / std
        
        seqs = seqs.to(DEVICE)
        
        # Teacher forcing
        input_tokens = seqs[:, :-1]   # (B, seq_len-1)
        target_tokens = seqs[:, 1:]   # (B, seq_len-1)
        
        optimizer.zero_grad()
        
        # Forward pass
        logits = model(imgs, input_tokens)  # (B, seq_len-1, vocab_size)
        
        # Compute loss
        loss = criterion(
            logits.reshape(-1, VOCAB_SIZE),  # (B * (seq_len-1), vocab_size)
            target_tokens.reshape(-1)         # (B * (seq_len-1))
        )
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        optimizer.step()
        
        total_loss += loss.item()
        
        # Print every 100 batches
        if batch_idx % 100 == 0:
            print(f"  Batch {batch_idx}/{len(loader)} | Loss: {loss.item():.4f}")
    
    print(f"Epoch {epoch + 1}/{EPOCHS} | Avg Loss: {total_loss / len(loader):.4f}")

# Save model
#SAVE_PATH = "/kaggle/working/dinov2_model256.pt"
SAVE_PATH = "/kaggle/working/dinov2_attn_lora256.pt"
torch.save({
    "model": model.state_dict()
}, SAVE_PATH)

print(f"Model saved to {SAVE_PATH}")

## 4. Evals

In [7]:
def normalized_edit_distance(s1, s2):
    if len(s1) == 0 and len(s2) == 0:
        return 0.0
    if len(s1) == 0 or len(s2) == 0:
        return 1.0
    
    # Levenshtein distance
    d = [[0] * (len(s2) + 1) for _ in range(len(s1) + 1)]
    for i in range(len(s1) + 1):
        d[i][0] = i
    for j in range(len(s2) + 1):
        d[0][j] = j
    
    for i in range(1, len(s1) + 1):
        for j in range(1, len(s2) + 1):
            cost = 0 if s1[i-1] == s2[j-1] else 1
            d[i][j] = min(d[i-1][j] + 1, d[i][j-1] + 1, d[i-1][j-1] + cost)
    
    return d[len(s1)][len(s2)] / max(len(s1), len(s2))

In [8]:
import torch
from pickle import load

DATA = "/kaggle/input/datasets/ronaldtran/run1-output"

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

with open(f"/kaggle/working/vocab_size.txt") as f:
    VOCAB_SIZE = int(f.read().strip())

START_TOKEN = VOCAB_SIZE - 2
END_TOKEN = VOCAB_SIZE - 1
MAX_LEN = 150

# Load model
model = ViTLatexModelLoRA(vocab_size=VOCAB_SIZE).to(DEVICE)
checkpoint = torch.load(f"{DATA}/dinov2_attn_lora256.pt", map_location=DEVICE)
model.load_state_dict(checkpoint["model"])
model.eval()

# Load validation data
images = torch.load(f"/kaggle/working/images_val256.pt")
tokens = torch.load(f"/kaggle/working/tokens_val256.pt")

with open(f"/kaggle/working/latex_tokenizer256.pkl", "rb") as f:
    tokenizer = load(f)

inv_vocab = {v: k for k, v in tokenizer.word_index.items()}

def decode(seq):
    # Filter out start and end tokens
    filtered = [t for t in seq if t != START_TOKEN and t != END_TOKEN and t != 0]
    return "".join(inv_vocab.get(t, "") for t in filtered)

# Inference
N = 500
exact_matches = 0
total_edit_dist = 0.0

print(f"Evaluating on {N} test samples...")
print("-" * 60)

for i in range(N):
    img = images[i:i+1]  # (1, 1, 256, 256)
    img = img.repeat(1, 3, 1, 1).to(DEVICE)  # (1, 3, 256, 256)
    gt_tokens = tokens[i]
    
    pred_tokens = model.generate(img, max_len=MAX_LEN, sos_idx=START_TOKEN, eos_idx=END_TOKEN)
    
    ground_truth = decode(gt_tokens.tolist())
    prediction = decode(pred_tokens)
    
    is_exact = prediction == ground_truth
    edit_dist = normalized_edit_distance(prediction, ground_truth)
    
    if is_exact:
        exact_matches += 1
    total_edit_dist += edit_dist
    
    status = "EXACT" if is_exact else f"edit_dist={edit_dist:.4f}"
    print(f"  [{i+1}/{N}] {status}")
    print(f"    GT:   {ground_truth[:80]}")
    print(f"    PRED: {prediction[:80]}")
    print("-"*40)

accuracy = exact_matches / N
avg_edit_dist = total_edit_dist / N

print("\n" + "=" * 60)
print("SUMMARY")
print("=" * 60)
print(f"Model:                    DinoV2 + LSTM")
print(f"Samples:                  {N}")
print(f"Exact match accuracy:     {accuracy:.2%} ({exact_matches}/{N})")
print(f"Avg normalized edit dist: {avg_edit_dist:.4f}")

config.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Evaluating on 500 test samples...
------------------------------------------------------------
  [1/500] EXACT
    GT:   \frac{\partial\psi}{\partial t}=p\psi
    PRED: \frac{\partial\psi}{\partial t}=p\psi
----------------------------------------
  [2/500] edit_dist=0.1765
    GT:   c_{0}=d/v_{blood}
    PRED: c_{0}=d/v_{6log2}
----------------------------------------
  [3/500] edit_dist=0.5667
    GT:   e^{2}/\hbar c\approx1/137
    PRED: e^{2}/\hbar \tilde{\sigma}||3|
----------------------------------------
  [4/500] edit_dist=0.6364
    GT:   nassoc(s,\overline{s})
    PRED: nagscos(s,s)
----------------------------------------
  [5/500] edit_dist=0.4839
    GT:   eq.1\frac{dv}{dx}=w
    PRED: e_{2}\cdot1\frac{dv}{dx}=\omega
----------------------------------------
  [6/500] EXACT
    GT:   \underline{r}
    PRED: \underline{r}
----------------------------------------
  [7/500] EXACT
    GT:   \hat{k}_{g}
    PRED: \hat{k}_{g}
----------------------------------------
  [8/500] edi

In [15]:
import os
print(os.listdir("/kaggle/input/datasets/ronaldtran/run1-output"))

['dinov2_model256.pt', 'dinov2_model256AttnLstm.pt', 'dinov2_attn_lora256.pt']
