In [1]:
import os
import re
import unicodedata
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# --- CONFIGURATION ---
TEST_PATH = "/kaggle/input/deep-past-initiative-machine-translation/test.csv"
MODEL_PATH = "/kaggle/input/byt5-small/transformers/default/1" 

BATCH_SIZE = 20   # Safe inference batch size
MAX_LENGTH = 384  # Must match training length
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# --- CLEANING FUNCTION (Must match training!) ---
def clean_text(text):
    if pd.isna(text): return ""
    text = str(text)
    text = unicodedata.normalize('NFC', text)
    text = text.replace("…", "...")
    text = re.sub(r'\.\s*\.\s*\.', '...', text)
    text = text.replace('“', '"').replace('”', '"')
    text = re.sub(r'\s+', ' ', text).strip()
    return text

print(f"Loading model from {MODEL_PATH}...")

tokenizer = AutoTokenizer.from_pretrained('/kaggle/input/byt5-small/transformers/default/1', local_files_only=True)
model = AutoModelForSeq2SeqLM.from_pretrained('/kaggle/input/byt5-small/transformers/default/1', local_files_only=True)
model = model.to(DEVICE)
model.eval()


Loading model from /kaggle/input/byt5-small/transformers/default/1...


2026-01-18 13:46:20.903428: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1768743981.118460      24 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1768743981.181016      24 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1768743981.713387      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768743981.713423      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1768743981.713426      24 computation_placer.cc:177] computation placer alr

T5ForConditionalGeneration(
  (shared): Embedding(384, 1536)
  (encoder): T5Stack(
    (embed_tokens): Embedding(384, 1536)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1536, out_features=768, bias=False)
              (k): Linear(in_features=1536, out_features=768, bias=False)
              (v): Linear(in_features=1536, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=1536, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=1536, out_features=3968, bias=False)
              (wi_1): Linear(in_features=1536, out_features=3968, bias=False)
              (

In [2]:
class InferenceDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.texts = [clean_text(t) for t in df['transliteration']]
        # Add prefix
        self.texts = ["translate Akkadian to English: " + t for t in self.texts]
        self.tokenizer = tokenizer
        self.ids = df['id'].tolist()
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        inputs = self.tokenizer(
            text, 
            padding="max_length", 
            truncation=True, 
            max_length=MAX_LENGTH, 
            return_tensors="pt"
        )
        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "id": self.ids[idx]
        }


test_df = pd.read_csv(TEST_PATH)
test_dataset = InferenceDataset(test_df, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

all_predictions = []
all_ids = []

print(f"Starting Inference on {len(test_df)} rows...")

with torch.no_grad():
    for batch in tqdm(test_loader):
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        
        
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=MAX_LENGTH,
            num_beams=5,
            early_stopping=True
        )
        
        decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        all_predictions.extend([d.strip() for d in decoded])
        all_ids.extend(batch["id"].tolist())

print("Inference Complete.")

Starting Inference on 4 rows...


  0%|          | 0/1 [00:00<?, ?it/s]

Inference Complete.


In [3]:

submission = pd.DataFrame({
    "id": all_ids,
    "translation": all_predictions
})


submission["translation"] = submission["translation"].apply(lambda x: x if len(x) > 0 else "...")

submission.to_csv("submission.csv", index=False)
