# NMT Homework (Self-Contained): EN→DE

Train a translation model (English→German), measure perplexity and BLEU, save a checkpoint, and optionally export predictions for ML‑Arena.

Focus: experiment with architectures (LSTM w/ attention, Transformer, decoding strategies) — not boilerplate. Core evaluation functions are provided to ensure consistent scoring across students.

Data: the course staff provides `dataset_splits/` in the repo root. No additional setup is needed for data.

## 0. Setup
Use `install.sh` or `pip install -r requirements.txt` to set up.

In [3]:
import torch, sys, os, math, random
# Report the runtime environment and choose the compute device (GPU when available).
print('PyTorch version:', torch.__version__)
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)

# Best effort: ask for line-buffered stdout; any failure to reconfigure the
# stream is deliberately ignored.
try:
    sys.stdout.reconfigure(line_buffering=True)
except Exception:
    pass

PyTorch version: 2.6.0+cu124
Using device: cuda


## 1. Shared Utilities (no external imports)
Tokenization, vocabulary, dataset, collate, and fixed evaluation (PPL, NLL, BLEU).

In [4]:
from typing import List, Tuple, Dict, Iterable
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

def set_seed(seed: int = 42):
    """Seed Python's ``random`` module and PyTorch (CPU and all CUDA devices).

    Args:
        seed: Value passed to each random number generator.
    """
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

# Reserved vocabulary symbols keyed by role: padding, start of sequence,
# end of sequence, and unknown word.
SPECIAL_TOKENS = dict(pad='<pad>', sos='<sos>', eos='<eos>', unk='<unk>')

def simple_tokenize(s: str) -> List[str]:
    """Lower-case ``s`` and split it on runs of whitespace.

    Args:
        s: Raw sentence.

    Returns:
        The whitespace-delimited tokens of the lower-cased, stripped sentence
        (an empty list for a blank string).
    """
    normalized = s.strip().lower()
    return normalized.split()

def read_split(path: str) -> List[Tuple[List[str], List[str]]]:
    """Read a tab-separated parallel corpus into tokenized sentence pairs.

    Each line is expected to hold ``source<TAB>target``; any further
    tab-separated fields are ignored. Lines with fewer than two fields are
    skipped.

    Args:
        path: Path to a UTF-8 text file.

    Returns:
        A list of ``(source_tokens, target_tokens)`` tuples, both tokenized
        with ``simple_tokenize``.
    """
    pairs: List[Tuple[List[str], List[str]]] = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            # Drop the line terminator before splitting. The previous
            # ``rstrip('')`` passed an empty strip set and removed nothing.
            parts = line.rstrip('\n').split('\t')
            if len(parts) < 2:
                continue
            pairs.append((simple_tokenize(parts[0]), simple_tokenize(parts[1])))
    return pairs

def build_vocab(seqs: Iterable[List[str]], max_size: int | None = None) -> Dict[str, int]:
    """Build a token-to-id mapping from tokenized sequences.

    Ids 0-3 are reserved for the pad, sos, eos and unk symbols of
    ``SPECIAL_TOKENS``; every other token receives the next free id.

    Args:
        seqs: Iterable of token lists to count.
        max_size: When truthy, only the ``max_size`` most frequent tokens are
            considered (special tokens are not counted against this limit).
            ``None`` or ``0`` keeps every token, in first-seen order.

    Returns:
        Dictionary mapping each token string to its integer id.
    """
    from collections import Counter

    counts = Counter()
    for tokens in seqs:
        counts.update(tokens)

    if max_size:
        candidates = counts.most_common(max_size)
    else:
        candidates = counts.items()

    stoi = {
        SPECIAL_TOKENS['pad']: 0,
        SPECIAL_TOKENS['sos']: 1,
        SPECIAL_TOKENS['eos']: 2,
        SPECIAL_TOKENS['unk']: 3,
    }
    for word, _count in candidates:
        # A corpus token that collides with a special symbol keeps the reserved id.
        if word in stoi:
            continue
        stoi[word] = len(stoi)
    return stoi

def encode(tokens: List[str], stoi: Dict[str,int], add_sos_eos: bool=False) -> List[int]:
    """Map tokens to vocabulary ids, using the unk id for unseen tokens.

    Args:
        tokens: Tokenized sentence.
        stoi: Token-to-id mapping that contains the special symbols.
        add_sos_eos: When true, wrap the result in the sos and eos ids.

    Returns:
        List of integer ids.
    """
    unk = SPECIAL_TOKENS['unk']
    ids = [stoi.get(tok, stoi[unk]) for tok in tokens]
    if not add_sos_eos:
        return ids
    sos_id = stoi[SPECIAL_TOKENS['sos']]
    eos_id = stoi[SPECIAL_TOKENS['eos']]
    return [sos_id] + ids + [eos_id]

class Example:
    """One encoded sentence pair: source ids plus decoder input and output ids."""

    def __init__(self, s: List[int], ti: List[int], to: List[int]):
        # s: source ids, ti: decoder input ids, to: decoder target ids.
        self.src_ids = s
        self.tgt_in_ids = ti
        self.tgt_out_ids = to
class TranslationDataset(Dataset):
    """In-memory dataset of encoded sentence pairs.

    Each source sentence is encoded and terminated with the source eos id.
    Each target sentence is encoded as ``<sos> ... <eos>`` and then split into
    the decoder input (all but the last id) and the decoder target (all but
    the first id).
    """

    def __init__(self, pairs, src_stoi, tgt_stoi):
        self.examples: List[Example] = []
        for src_tokens, tgt_tokens in pairs:
            source = encode(src_tokens, src_stoi)
            source.append(src_stoi[SPECIAL_TOKENS['eos']])
            target = encode(tgt_tokens, tgt_stoi, add_sos_eos=True)
            decoder_in, decoder_out = target[:-1], target[1:]
            self.examples.append(Example(source, decoder_in, decoder_out))

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return self.examples[i]

def collate_pad(batch, pad_id_src: int, pad_id_tgt: int):
    """Right-pad a list of examples into batch tensors.

    Args:
        batch: Items exposing ``src_ids``, ``tgt_in_ids`` and ``tgt_out_ids``
            lists.
        pad_id_src: Id used to pad source sequences.
        pad_id_tgt: Id used to pad both target sequences.

    Returns:
        ``(src, src_lens, tgt_in, tgt_out, tgt_lens)``. Source rows are padded
        to the longest source; both target tensors are padded to the longest
        ``tgt_in_ids``. ``tgt_lens`` holds the unpadded ``tgt_out_ids`` lengths.
    """
    sources = [ex.src_ids for ex in batch]
    dec_inputs = [ex.tgt_in_ids for ex in batch]
    dec_targets = [ex.tgt_out_ids for ex in batch]

    src_width = max(len(ids) for ids in sources)
    tgt_width = max(len(ids) for ids in dec_inputs)

    def padded_tensor(rows, width, fill):
        return torch.tensor([ids + [fill] * (width - len(ids)) for ids in rows])

    src = padded_tensor(sources, src_width, pad_id_src)
    tgt_in = padded_tensor(dec_inputs, tgt_width, pad_id_tgt)
    tgt_out = padded_tensor(dec_targets, tgt_width, pad_id_tgt)
    src_l = torch.tensor([len(ids) for ids in sources])
    tgt_l = torch.tensor([len(ids) for ids in dec_targets])
    return src, src_l, tgt_in, tgt_out, tgt_l

def compute_perplexity(loss_sum: float, token_count: int) -> float:
    """Convert a summed negative log-likelihood into perplexity.

    Args:
        loss_sum: Total loss (natural log) over all counted tokens.
        token_count: Number of tokens the loss was summed over.

    Returns:
        ``exp(loss_sum / token_count)``, or ``inf`` when ``token_count`` is 0
        or the exponential overflows.
    """
    if token_count == 0:
        return float('inf')
    mean_nll = loss_sum / token_count
    try:
        perplexity = math.exp(mean_nll)
    except OverflowError:
        return float('inf')
    return float(perplexity)

def corpus_bleu(refs: List[List[str]], hyps: List[List[str]], max_order: int=4, smooth: bool=True) -> float:
    """Compute corpus-level BLEU with one reference per hypothesis.

    N-gram matches are clipped by the reference counts and pooled over the
    whole corpus before the precisions are formed.

    Args:
        refs: Tokenized references, aligned with ``hyps``.
        hyps: Tokenized hypotheses.
        max_order: Largest n-gram order included in the geometric mean.
        smooth: When true, use add-one smoothing ``(m + 1) / (p + 1)`` for
            every order; otherwise an order with no hypothesis n-grams gets
            precision 0.

    Returns:
        BLEU as a float in ``[0, 1]``. A corpus with no hypothesis tokens
        scores 0.0.
    """
    from collections import Counter

    def ngrams(tokens, n):
        return Counter(tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1))

    matches = [0] * max_order
    possible = [0] * max_order
    ref_len = 0
    hyp_len = 0
    for ref, hyp in zip(refs, hyps):
        ref_len += len(ref)
        hyp_len += len(hyp)
        for n in range(1, max_order + 1):
            ref_counts = ngrams(ref, n)
            hyp_counts = ngrams(hyp, n)
            matches[n - 1] += sum(min(c, hyp_counts[g]) for g, c in ref_counts.items())
            possible[n - 1] += max(len(hyp) - n + 1, 0)

    # Without any hypothesis tokens the smoothed precisions all collapse to
    # 1/1 and the brevity penalty is computed against a clamped length of 1.
    # That scored an empty output 1.0 against a one-token reference and an
    # empty corpus e (> 1). Empty output is worth nothing.
    if hyp_len == 0:
        return 0.0

    prec = [
        (matches[i] + 1) / (possible[i] + 1) if smooth
        else (matches[i] / possible[i] if possible[i] > 0 else 0.0)
        for i in range(max_order)
    ]
    geo = math.exp(sum((1 / max_order) * math.log(x) for x in prec if x > 0)) if min(prec) > 0 else 0.0
    # Brevity penalty: only hypotheses shorter than (or as long as) the
    # references are penalized; equal lengths give exp(0) == 1.
    bp = 1.0 if hyp_len > ref_len else math.exp(1 - ref_len / hyp_len)
    return float(geo * bp)

@torch.no_grad()
def evaluate_nll(loader: DataLoader, model: nn.Module, pad_id_tgt: int, device: torch.device):
    """Accumulate the summed cross-entropy of ``model`` over ``loader``.

    The model is put into eval mode and called as
    ``model(src, src_lens, tgt_in)``; positions whose target equals
    ``pad_id_tgt`` are excluded from both the loss and the token count.

    Args:
        loader: Yields ``(src, src_lens, tgt_in, tgt_out, tgt_lens)`` batches.
        model: Module returning logits whose last dimension is the vocabulary.
        pad_id_tgt: Target padding id to ignore.
        device: Device the batch tensors are moved to.

    Returns:
        ``(total_loss, token_count)`` as a float and an int.
    """
    model.eval()
    loss_fn = nn.CrossEntropyLoss(ignore_index=pad_id_tgt, reduction='sum')
    total_loss = 0.0
    token_count = 0
    for batch in loader:
        src, src_l, tgt_in, tgt_out, _tgt_l = batch
        src = src.to(device)
        src_l = src_l.to(device)
        tgt_in = tgt_in.to(device)
        tgt_out = tgt_out.to(device)

        logits = model(src, src_l, tgt_in)
        flat_logits = logits.reshape(-1, logits.size(-1))
        batch_loss = loss_fn(flat_logits, tgt_out.reshape(-1))

        total_loss += float(batch_loss.item())
        token_count += int((tgt_out != pad_id_tgt).sum().item())
    return total_loss, token_count

@torch.no_grad()
def evaluate_bleu(loader: DataLoader, model: nn.Module, tgt_itos: List[str], sos_id: int, eos_id: int, device: torch.device, max_len: int=100):
    """Greedy-decode every batch and score the output with ``corpus_bleu``.

    References come from ``tgt_out``; both references and hypotheses are cut
    at their first ``eos_id``. Id 0 is dropped from both, and ``sos_id`` is
    additionally dropped from hypotheses, before ids are mapped to strings.

    Args:
        loader: Yields ``(src, src_lens, tgt_in, tgt_out, tgt_lens)`` batches.
        model: Module exposing ``greedy_decode(src, src_lens, max_len, sos_id, eos_id)``.
        tgt_itos: Target id-to-token table.
        sos_id: Start-of-sequence id.
        eos_id: End-of-sequence id.
        device: Device the source tensors are moved to.
        max_len: Maximum number of decoding steps.

    Returns:
        Corpus BLEU as a float.
    """
    def cut_at_eos(ids):
        return ids[:ids.index(eos_id)] if eos_id in ids else ids

    model.eval()
    references = []
    hypotheses = []
    for src, src_l, _tgt_in, tgt_out, _tgt_l in loader:
        src = src.to(device)
        src_l = src_l.to(device)
        decoded = model.greedy_decode(src, src_l, max_len=max_len, sos_id=sos_id, eos_id=eos_id)
        for row in range(src.size(0)):
            gold_ids = cut_at_eos(tgt_out[row].tolist())
            guess_ids = cut_at_eos(decoded[row].tolist())
            references.append([tgt_itos[i] for i in gold_ids if i != 0])
            hypotheses.append([tgt_itos[i] for i in guess_ids if i != 0 and i != sos_id])
    return float(corpus_bleu(references, hypotheses))


## 2. Paths and Hyperparameters

In [5]:
set_seed(42)

# --- Data locations -------------------------------------------------------
train_path = '/kaggle/input/translation/train.txt'
val_path   = '/kaggle/input/translation/val.txt'
public_test_path = '/kaggle/input/translation/public_test.txt'
# Fall back to the repository copy of the public test split when the Kaggle
# path is absent and the local file exists.
if not os.path.exists(public_test_path):
    alt = 'dataset_splits/test_public.txt'
    if os.path.exists(alt):
        public_test_path = alt
private_test_path = 'dataset_splits/private_test.txt'

# --- Vocabulary limits (special tokens are added on top) ------------------
src_vocab_size = 30000
tgt_vocab_size = 30000

# --- Model ----------------------------------------------------------------
emb_dim = 256
hid_dim = 512
layers = 1
dropout = 0.1

# --- Optimization and decoding ---------------------------------------------
batch_size = 64
epochs = 5
lr = 3e-4
max_decode_len = 100

# --- Checkpoints ------------------------------------------------------------
save_dir = 'checkpoints'
os.makedirs(save_dir, exist_ok=True)

print('Public test path:', public_test_path)

Public test path: /kaggle/input/translation/public_test.txt


## 3. Load Data and Build Vocab

In [6]:
print('Loading splits...')
train_pairs = read_split(train_path)
val_pairs = read_split(val_path)
test_pairs = read_split(public_test_path)
print(f'Train: {len(train_pairs):,} | Val: {len(val_pairs):,} | Public test: {len(test_pairs):,}')

# Vocabularies are built from the training split only.
src_stoi = build_vocab((s for s,_ in train_pairs), max_size=src_vocab_size)
tgt_stoi = build_vocab((t for _,t in train_pairs), max_size=tgt_vocab_size)

pad_id_src = src_stoi[SPECIAL_TOKENS['pad']]
pad_id_tgt = tgt_stoi[SPECIAL_TOKENS['pad']]
sos_id = tgt_stoi[SPECIAL_TOKENS['sos']]
eos_id = tgt_stoi[SPECIAL_TOKENS['eos']]

train_ds = TranslationDataset(train_pairs, src_stoi, tgt_stoi)
val_ds = TranslationDataset(val_pairs, src_stoi, tgt_stoi)
test_ds = TranslationDataset(test_pairs, src_stoi, tgt_stoi)


def collate(b):
    """Pad a batch using the source and target pad ids chosen above."""
    return collate_pad(b, pad_id_src, pad_id_tgt)


# Only the training loader shuffles; evaluation order stays fixed.
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True,  collate_fn=collate, num_workers=0)
val_loader   = DataLoader(val_ds,   batch_size=batch_size, shuffle=False, collate_fn=collate, num_workers=0)
test_loader  = DataLoader(test_ds,  batch_size=batch_size, shuffle=False, collate_fn=collate, num_workers=0)

# Invert the target vocabulary into an id -> token table.
tgt_itos = [None] * len(tgt_stoi)
for w, i in tgt_stoi.items():
    if 0 <= i < len(tgt_itos):
        tgt_itos[i] = w

print('Vocab sizes — src:', len(src_stoi), 'tgt:', len(tgt_stoi))

Loading splits...
Train: 226,997 | Val: 32,428 | Public test: 32,428
Vocab sizes — src: 30004 tgt: 30004


## 4. Build Model (Your Playground)
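Keep the `forward(src, src_lens, tgt_in)` and `greedy_decode(src, src_lens, max_len, sos_id, eos_id)` signatures unchanged so the provided evaluation functions keep working. Within that contract, try adding attention, a GRU, a Transformer, etc.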

In [8]:
!cp "/kaggle/input/translator/pytorch/default/1/model_prova.py" .

from model_prova import Encoder, Decoder, Seq2Seq
import sys
import os

class Attention(nn.Module):
    """Additive attention: score each encoder state against a decoder state.

    The decoder state is concatenated with every encoder state, passed through
    a linear layer and ``tanh``, and reduced to one scalar score per position.
    """

    def __init__(self, hid_dim):
        super().__init__()
        self.attn = nn.Linear(hid_dim * 2, hid_dim)
        self.v = nn.Linear(hid_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs, mask=None):
        """Return attention weights over the source positions.

        Args:
            hidden: Decoder state, one vector per batch row.
            encoder_outputs: 3-D tensor of encoder states (batch, source
                position, features).
            mask: Optional tensor; positions where it equals 0 are pushed to
                -1e10 before the softmax, so they get (near-)zero weight.

        Returns:
            Weights of shape (batch, source positions) that sum to 1 along
            the position axis.
        """
        _, src_len, _ = encoder_outputs.size()
        # Broadcast the decoder state across all source positions.
        query = hidden.unsqueeze(1).expand(-1, src_len, -1)
        features = torch.cat((query, encoder_outputs), dim=2)
        scores = self.v(torch.tanh(self.attn(features))).squeeze(2)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e10)
        return torch.softmax(scores, dim=1)

class Encoder(nn.Module):
    """Embedding + LSTM encoder over padded, variable-length source batches."""

    def __init__(self, vocab_size, emb_dim, hid_dim, num_layers=1, dropout=0.1):
        super().__init__()
        # Id 0 is treated as padding by the embedding layer.
        self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        # Inter-layer dropout only applies when the LSTM has more than one layer.
        self.rnn = nn.LSTM(emb_dim, hid_dim, num_layers=num_layers, batch_first=True, dropout=dropout if num_layers>1 else 0.0)

    def forward(self, src, src_lens):
        """Encode a padded batch.

        Args:
            src: Source ids of shape (batch, src_len).
            src_lens: Unpadded length of every row; moved to the CPU for
                packing, so it may live on any device.

        Returns:
            ``(out, (h, c))`` where ``out`` has shape
            (batch, src.size(1), hid_dim) with zeros at padded positions, and
            ``(h, c)`` is the final LSTM state.
        """
        emb = self.emb(src)
        packed = nn.utils.rnn.pack_padded_sequence(emb, src_lens.cpu(), batch_first=True, enforce_sorted=False)
        out, (h, c) = self.rnn(packed)
        # Pad back to the full input width. Without ``total_length`` the
        # output is only max(src_lens) wide, which no longer lines up with a
        # (batch, src_len) mask when the batch is padded beyond its longest row.
        out, _ = nn.utils.rnn.pad_packed_sequence(out, batch_first=True, total_length=src.size(1))
        return out, (h, c)

class Decoder(nn.Module):
    """LSTM decoder that attends over the encoder states at every step.

    At each step the top-layer hidden state queries the attention module, the
    resulting context vector is concatenated with the token embedding and fed
    to the LSTM, and the logits are projected from the concatenation of
    embedding, context and LSTM output.
    """

    def __init__(self, vocab_size, emb_dim, hid_dim, num_layers=1, dropout=0.1):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.attention = Attention(hid_dim)
        self.rnn = nn.LSTM(
            emb_dim + hid_dim,
            hid_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0.0,
        )
        self.proj = nn.Linear(emb_dim + hid_dim * 2, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, tgt_in, hidden, encoder_outputs, mask=None):
        """Run the decoder over every position of ``tgt_in``.

        Args:
            tgt_in: 2-D tensor of decoder input ids (batch, steps).
            hidden: ``(h, c)`` LSTM state to start from.
            encoder_outputs: Encoder states to attend over.
            mask: Optional source mask forwarded to the attention module.

        Returns:
            ``(logits, hidden)``: per-step vocabulary logits concatenated
            along dimension 1, and the LSTM state after the last step.
        """
        _, num_steps = tgt_in.size()
        embedded = self.dropout(self.emb(tgt_in))
        step_logits = []
        for t in range(num_steps):
            step_emb = embedded[:, t].unsqueeze(1)
            # Query attention with the top layer's current hidden state.
            query = hidden[0][-1]
            weights = self.attention(query, encoder_outputs, mask)
            context = torch.bmm(weights.unsqueeze(1), encoder_outputs)
            rnn_out, hidden = self.rnn(torch.cat((step_emb, context), dim=2), hidden)
            features = torch.cat((step_emb, context, rnn_out), dim=2)
            step_logits.append(self.proj(features))
        return torch.cat(step_logits, dim=1), hidden

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper exposing ``forward`` and ``greedy_decode``."""

    def __init__(self, enc, dec):
        super().__init__()
        self.encoder = enc
        self.decoder = dec

    def create_mask(self, src, src_lens):
        """Build a (batch, src_len) mask: 1 at real tokens, 0 at padding.

        Args:
            src: Source ids of shape (batch, src_len); only its shape and
                device are used.
            src_lens: Unpadded length of each row.

        Returns:
            Floating-point tensor on ``src``'s device.
        """
        positions = torch.arange(src.size(1), device=src.device)
        lengths = torch.as_tensor(src_lens, device=src.device)
        # One broadcast comparison replaces the per-row Python loop.
        valid = positions.unsqueeze(0) < lengths.unsqueeze(1)
        return valid.to(torch.get_default_dtype())

    def forward(self, src, src_lens, tgt_in):
        """Return decoder logits for ``tgt_in`` (teacher forcing).

        Args:
            src: Source ids of shape (batch, src_len).
            src_lens: Unpadded source lengths.
            tgt_in: Decoder input ids.
        """
        encoder_outputs, hidden = self.encoder(src, src_lens)
        mask = self.create_mask(src, src_lens)
        logits, _ = self.decoder(tgt_in, hidden, encoder_outputs, mask)
        return logits

    @torch.no_grad()
    def greedy_decode(self, src, src_lens, max_len, sos_id, eos_id):
        """Decode by taking the arg-max token at every step.

        Args:
            src: Source ids of shape (batch, src_len).
            src_lens: Unpadded source lengths.
            max_len: Number of output positions to produce.
            sos_id: Id fed to the decoder at the first step.
            eos_id: Id that ends a sequence.

        Returns:
            Long tensor of shape (batch, max_len). In every row, all positions
            after the first ``eos_id`` are set to ``eos_id``.
        """
        B = src.size(0)
        if max_len <= 0:
            # Nothing to decode; concatenating zero steps would raise.
            return torch.empty((B, 0), dtype=torch.long, device=src.device)

        encoder_outputs, hidden = self.encoder(src, src_lens)
        mask = self.create_mask(src, src_lens)
        decoder_input = torch.full((B, 1), sos_id, dtype=torch.long, device=src.device)
        finished = torch.zeros((B, 1), dtype=torch.bool, device=src.device)
        outputs = []
        for _ in range(max_len):
            logits, hidden = self.decoder(decoder_input, hidden, encoder_outputs, mask)
            next_token = logits[:, -1, :].argmax(-1, keepdim=True)
            # Rows that already emitted EOS only ever report EOS afterwards.
            outputs.append(next_token.masked_fill(finished, eos_id))
            finished = finished | (next_token == eos_id)
            # The raw prediction is still fed back, so unfinished rows see
            # exactly the inputs they would see without early stopping.
            decoder_input = next_token
            if bool(finished.all()):
                # Every remaining position would be EOS; stop decoding.
                break

        seqs = torch.cat(outputs, dim=1)
        remaining = max_len - seqs.size(1)
        if remaining > 0:
            # Keep the (batch, max_len) output shape after an early stop.
            seqs = torch.cat((seqs, seqs.new_full((B, remaining), eos_id)), dim=1)
        return seqs

# Assemble the encoder-decoder model from the hyperparameters chosen earlier,
# move it to the selected device, and create its Adam optimizer.
encoder = Encoder(
    len(src_stoi), emb_dim, hid_dim,
    num_layers=layers, dropout=dropout,
)
decoder = Decoder(
    len(tgt_stoi), emb_dim, hid_dim,
    num_layers=layers, dropout=dropout,
)
model = Seq2Seq(encoder, decoder).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# Count only the parameters that receive gradients.
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total parameters: {trainable_params:,}")

Total parameters: 58,524,980


## 5. Train

In [9]:
# Summed (not averaged) cross-entropy, ignoring target padding, so that
# perplexity can be computed from the running loss total and token count.
criterion = nn.CrossEntropyLoss(ignore_index=pad_id_tgt, reduction='sum')

for epoch in range(1, epochs + 1):
    model.train()
    tot = 0.0
    toks = 0
    for src, src_l, tgt_in, tgt_out, tgt_l in train_loader:
        src = src.to(device)
        src_l = src_l.to(device)
        tgt_in = tgt_in.to(device)
        tgt_out = tgt_out.to(device)

        optimizer.zero_grad()
        logits = model(src, src_l, tgt_in)
        loss = criterion(logits.reshape(-1, logits.size(-1)), tgt_out.reshape(-1))
        loss.backward()
        # Clip the global gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        tot += float(loss.item())
        toks += int((tgt_out != pad_id_tgt).sum().item())

    # Report train and validation perplexity once per epoch.
    tr_ppl = compute_perplexity(tot, toks)
    v_loss, v_toks = evaluate_nll(val_loader, model, pad_id_tgt, device)
    v_ppl = compute_perplexity(v_loss, v_toks)
    print(f'Epoch {epoch:02d} | train ppl: {tr_ppl:.2f} | val ppl: {v_ppl:.2f}')

# Persist the final weights together with the optimizer state, both
# vocabularies and the model configuration.
checkpoint_path = os.path.join(save_dir, 'checkpoint_last.pt')
checkpoint = {
    'model_state': model.state_dict(),
    'optimizer_state': optimizer.state_dict(),
    'epoch': epochs,
    'src_stoi': src_stoi,
    'tgt_stoi': tgt_stoi,
    'model_cfg': {'emb': emb_dim, 'hid': hid_dim, 'layers': layers, 'dropout': dropout},
}
torch.save(checkpoint, checkpoint_path)
print('Saved checkpoint:', checkpoint_path)

Epoch 01 | train ppl: 36.56 | val ppl: 12.04
Epoch 02 | train ppl: 7.34 | val ppl: 7.10
Epoch 03 | train ppl: 4.08 | val ppl: 5.81
Epoch 04 | train ppl: 2.93 | val ppl: 5.32
Epoch 05 | train ppl: 2.37 | val ppl: 5.14
Saved checkpoint: checkpoints/checkpoint_last.pt


## 6. Evaluate: Perplexity, BLEU and ROUGE (Public Test)

In [10]:
def _lcs_len(x: List[str], y: List[str]) -> int:
    """Return the length of the longest common subsequence of ``x`` and ``y``.

    Uses a single rolling row of the dynamic-programming table, so memory is
    proportional to ``len(y)``.
    """
    if not x or not y:
        return 0
    row = [0] * (len(y) + 1)
    for x_tok in x:
        diagonal = 0  # table value for the previous x token and previous y token
        for j, y_tok in enumerate(y, start=1):
            above = row[j]
            if x_tok == y_tok:
                row[j] = diagonal + 1
            elif row[j - 1] > row[j]:
                row[j] = row[j - 1]
            diagonal = above
    return row[-1]

def corpus_rouge_l(refs: List[List[str]], hyps: List[List[str]]) -> float:
    """Average sentence-level ROUGE-L F1 over aligned references and hypotheses.

    A pair scores 0.0 when either side is empty or when the two share no
    common subsequence.

    Args:
        refs: Tokenized references, aligned with ``hyps``.
        hyps: Tokenized hypotheses.

    Returns:
        Mean F1 over all pairs, or 0.0 when there are no pairs.
    """
    f1_scores = []
    for ref, hyp in zip(refs, hyps):
        f1 = 0.0
        if ref and hyp:
            lcs = _lcs_len(ref, hyp)
            precision = lcs / len(hyp)
            recall = lcs / len(ref)
            if precision + recall != 0:
                f1 = 2 * precision * recall / (precision + recall)
        f1_scores.append(f1)
    if not f1_scores:
        return 0.0
    return float(sum(f1_scores) / len(f1_scores))

@torch.no_grad()
def evaluate_rouge(loader: DataLoader, model: nn.Module, tgt_itos,
                   sos_id: int, eos_id: int, device: torch.device,
                   max_len: int = 100) -> float:
    """Greedy-decode every batch and score the output with ``corpus_rouge_l``.

    References are taken from ``tgt_out``. References and hypotheses are both
    truncated at their first ``eos_id``; id 0 is removed from both and
    ``sos_id`` is also removed from hypotheses before mapping ids to tokens.

    Args:
        loader: Yields ``(src, src_lens, tgt_in, tgt_out, tgt_lens)`` batches.
        model: Module exposing ``greedy_decode(src, src_lens, max_len, sos_id, eos_id)``.
        tgt_itos: Target id-to-token table.
        sos_id: Start-of-sequence id.
        eos_id: End-of-sequence id.
        device: Device the source tensors are moved to.
        max_len: Maximum number of decoding steps.

    Returns:
        Mean ROUGE-L F1 over all decoded sentences.
    """
    def until_eos(ids):
        if eos_id not in ids:
            return ids
        return ids[:ids.index(eos_id)]

    model.eval()
    refs = []
    hyps = []
    for batch in loader:
        src, src_l, _tgt_in, tgt_out, _tgt_l = batch
        src = src.to(device)
        src_l = src_l.to(device)
        pred = model.greedy_decode(src, src_l, max_len=max_len,
                                   sos_id=sos_id, eos_id=eos_id)
        for b in range(src.size(0)):
            ref_ids = until_eos(tgt_out[b].tolist())
            hyp_ids = until_eos(pred[b].tolist())
            refs.append([tgt_itos[i] for i in ref_ids if i != 0])
            hyps.append([tgt_itos[i] for i in hyp_ids if not (i == 0 or i == sos_id)])
    return corpus_rouge_l(refs, hyps)

# Perplexity on the validation and public test splits.
val_loss, val_tok = evaluate_nll(val_loader, model, pad_id_tgt, device)
val_ppl = compute_perplexity(val_loss, val_tok)
tst_loss, tst_tok = evaluate_nll(test_loader, model, pad_id_tgt, device)
tst_ppl = compute_perplexity(tst_loss, tst_tok)

# Generation metrics on the public test split; both helpers greedy-decode
# with the same settings.
decode_kwargs = dict(sos_id=sos_id, eos_id=eos_id, device=device, max_len=max_decode_len)
bleu = evaluate_bleu(test_loader, model, tgt_itos, **decode_kwargs)
rouge = evaluate_rouge(test_loader, model, tgt_itos, **decode_kwargs)

# BLEU and ROUGE-L are reported on a 0-100 scale.
print(f'Validation perplexity: {val_ppl:.2f}')
print(f'Public test perplexity: {tst_ppl:.2f}')
print(f'Public test BLEU:       {bleu*100:.2f}')
print(f'Public test ROUGE-L:    {rouge*100:.2f}')

Validation perplexity: 5.14
Public test perplexity: 5.09
Public test BLEU:       30.45
Public test ROUGE-L:    57.82


## 7. Output Examples

In [11]:
print("\n" + "="*80)
print("Showing translation examples from the test set")
print("="*80 + "\n")

import random
num_examples = 10
# A private generator seeded with 42 picks the same indices as reseeding the
# global RNG would, without disturbing global random state. The sample size is
# capped so a small test set cannot make ``sample`` raise.
sample_indices = random.Random(42).sample(
    range(len(test_pairs)), min(num_examples, len(test_pairs))
)

# The source sentence must be closed with the *source* vocabulary's EOS id,
# exactly as TranslationDataset does; the target-side ``eos_id`` only matched
# because both vocabularies reserve the same slot for it.
src_eos_id = src_stoi[SPECIAL_TOKENS['eos']]

model.eval()

for idx, sample_idx in enumerate(sample_indices, 1):
    src_tokens, tgt_tokens = test_pairs[sample_idx]

    src_ids = encode(src_tokens, src_stoi) + [src_eos_id]
    src_tensor = torch.tensor([src_ids], device=device)
    src_lens = torch.tensor([len(src_ids)], device=device)

    with torch.no_grad():
        pred_ids = model.greedy_decode(
            src_tensor, src_lens,
            max_len=max_decode_len,
            sos_id=sos_id,
            eos_id=eos_id
        )[0].tolist()

    # Keep only what precedes the first EOS, then drop padding and SOS ids.
    if eos_id in pred_ids:
        pred_ids = pred_ids[:pred_ids.index(eos_id)]
    pred_tokens = [tgt_itos[i] for i in pred_ids if i != pad_id_tgt and i != sos_id]

    src_text = ' '.join(src_tokens)
    ref_text = ' '.join(tgt_tokens)
    pred_text = ' '.join(pred_tokens)

    print(f"[Example {idx}]")
    print(f"Source (EN):     {src_text}")
    print(f"Reference (DE):  {ref_text}")
    print(f"Prediction (DE): {pred_text}")
    print("-" * 80)


Showing translation examples from the test set

[Example 1]
Source (EN):     this book is hers.
Reference (DE):  dieses buch ist das ihrige.
Prediction (DE): dieses buch ist ihrs.
--------------------------------------------------------------------------------
[Example 2]
Source (EN):     would you like a brandy?
Reference (DE):  möchtet ihr einen brandy?
Prediction (DE): möchtest du einen brandy?
--------------------------------------------------------------------------------
[Example 3]
Source (EN):     i've never slept in class.
Reference (DE):  ich habe niemals im unterricht geschlafen.
Prediction (DE): ich habe noch nie in der klasse gesprochen.
--------------------------------------------------------------------------------
[Example 4]
Source (EN):     i'm ready to leave this place.
Reference (DE):  ich bin bereit, von hier zu verschwinden.
Prediction (DE): ich bin bereit, das zu verlassen.
--------------------------------------------------------------------------------
[Example

## 8. Private Test (Optional)

In [None]:
# Score the private split only when the file is present.
if not os.path.exists(private_test_path):
    print('Private test split not found at', private_test_path)
else:
    prv_pairs = read_split(private_test_path)
    prv_ds = TranslationDataset(prv_pairs, src_stoi, tgt_stoi)
    prv_loader = DataLoader(
        prv_ds,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=collate,
        num_workers=0,
    )

    prv_loss, prv_tok = evaluate_nll(prv_loader, model, pad_id_tgt, device)
    prv_ppl = compute_perplexity(prv_loss, prv_tok)
    prv_bleu = evaluate_bleu(
        prv_loader, model, tgt_itos,
        sos_id=sos_id, eos_id=eos_id, device=device, max_len=max_decode_len,
    )

    print(f'Private test perplexity: {prv_ppl:.2f}')
    print(f'Private test BLEU:       {prv_bleu*100:.2f}')

## 9. Export Predictions for ML‑Arena (Optional)

In [None]:
@torch.no_grad()
def decode_to_lines(loader: DataLoader, model: nn.Module, tgt_itos: List[str], sos_id: int, eos_id: int, device: torch.device, max_len: int) -> List[str]:
    """Greedy-decode every batch into space-joined hypothesis strings.

    Each hypothesis is cut at its first ``eos_id``; id 0 and ``sos_id`` are
    dropped before ids are mapped to tokens.

    Args:
        loader: Yields ``(src, src_lens, tgt_in, tgt_out, tgt_lens)`` batches;
            only the first two items are used.
        model: Module exposing ``greedy_decode(src, src_lens, max_len, sos_id, eos_id)``.
        tgt_itos: Target id-to-token table.
        sos_id: Start-of-sequence id.
        eos_id: End-of-sequence id.
        device: Device the source tensors are moved to.
        max_len: Maximum number of decoding steps.

    Returns:
        One string per source sentence, in loader order.
    """
    # Switch to eval mode like evaluate_bleu / evaluate_rouge do, so dropout
    # is not applied while generating the exported predictions.
    model.eval()
    lines: List[str] = []
    for src, src_l, _tgt_in, _tgt_out, _tgt_l in loader:
        src, src_l = src.to(device), src_l.to(device)
        pred_ids = model.greedy_decode(src, src_l, max_len=max_len, sos_id=sos_id, eos_id=eos_id)
        for b in range(src.size(0)):
            hyp = pred_ids[b].tolist()
            if eos_id in hyp:
                hyp = hyp[:hyp.index(eos_id)]
            toks = [tgt_itos[i] for i in hyp if i != 0 and i != sos_id]
            lines.append(' '.join(toks))
    return lines
export_split = 'private'
export_format = 'tsv'
export_out = 'submissions/private_predictions.tsv'

# Reject an unknown format up front instead of writing nothing and still
# reporting success.
if export_format not in ('tsv', 'jsonl'):
    raise ValueError(f'Unsupported export_format: {export_format!r}')

export_src = public_test_path if export_split == 'public' else private_test_path
if not os.path.exists(export_src):
    # Same behaviour as the private-test section: report and skip rather than
    # crash when the optional split is unavailable.
    print('Export split not found at', export_src)
else:
    os.makedirs(os.path.dirname(export_out) or '.', exist_ok=True)
    pairs = read_split(export_src)
    exp_ds = TranslationDataset(pairs, src_stoi, tgt_stoi)
    exp_loader = DataLoader(exp_ds, batch_size=batch_size, shuffle=False, collate_fn=collate, num_workers=0)
    preds = decode_to_lines(exp_loader, model, tgt_itos, sos_id=sos_id, eos_id=eos_id, device=device, max_len=max_decode_len)

    # One record per line: every write ends with a newline, otherwise all
    # predictions run together on a single line.
    with open(export_out, 'w', encoding='utf-8') as f:
        if export_format == 'tsv':
            for i, h in enumerate(preds):
                f.write(f'{i}\t{h}\n')
        else:
            import json
            for i, h in enumerate(preds):
                f.write(json.dumps({'id': i, 'hyp': h}, ensure_ascii=False) + '\n')

    print(f'Wrote {len(preds)} predictions to {export_out}')
    print('Adjust if ML‑Arena requires a different schema.')