In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found,
# one path per line, in the order os.walk yields them.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in files):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/flant5/pytorch/default/1/config.json
/kaggle/input/flant5/pytorch/default/1/spiece.model
/kaggle/input/flant5/pytorch/default/1/training_args.bin
/kaggle/input/flant5/pytorch/default/1/tokenizer.json
/kaggle/input/flant5/pytorch/default/1/tokenizer_config.json
/kaggle/input/flant5/pytorch/default/1/model.safetensors
/kaggle/input/flant5/pytorch/default/1/special_tokens_map.json
/kaggle/input/flant5/pytorch/default/1/generation_config.json
/kaggle/input/deep-past-initiative-machine-translation/sample_submission.csv
/kaggle/input/deep-past-initiative-machine-translation/bibliography.csv
/kaggle/input/deep-past-initiative-machine-translation/publications.csv
/kaggle/input/deep-past-initiative-machine-translation/Sentences_Oare_FirstWord_LinNum.csv
/kaggle/input/deep-past-initiative-machine-translation/OA_Lexicon_eBL.csv
/kaggle/input/deep-past-initiative-machine-translation/eBL_Dictionary.csv
/kaggle/input/deep-past-initiative-machine-translation/train.csv
/kaggle/input/deep

In [2]:
import os
import re
import unicodedata
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# --- CONFIGURATION ---
# Kaggle input locations: the competition test CSV and the directory that
# holds the model weights and tokenizer files.
TEST_PATH = "/kaggle/input/deep-past-initiative-machine-translation/test.csv"
MODEL_PATH = "/kaggle/input/flant5/pytorch/default/1"

# Number of rows sent through the model per generate() call.
BATCH_SIZE = 16
# Token budget used for both input truncation/padding and generation.
# NOTE(review): per the notebook author this must match the length used in
# training -- confirm against the training notebook.
MAX_LENGTH = 384

# Use the GPU when PyTorch reports one, otherwise run on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# --- CLEANING FUNCTION (Must match training!) ---
def clean_text(text):
    """Normalise one raw text value into a single clean line.

    Steps, in order: missing values become ``""``; the value is coerced to
    ``str`` and NFC-normalised; the ellipsis character and any run of three
    dots separated by optional whitespace collapse to ``...``; curly double
    quotes become straight ``"``; every whitespace run becomes one space and
    the ends are trimmed.

    NOTE(review): the notebook author states this must mirror the cleaning
    applied at training time -- keep the two in sync.

    Args:
        text: Any scalar; ``None``/NaN are treated as missing.

    Returns:
        The cleaned string (possibly empty).
    """
    if pd.isna(text):
        return ""

    cleaned = unicodedata.normalize('NFC', str(text))

    # Unify every ellipsis spelling ("…", ". . .", ".. .") to three plain dots.
    cleaned = cleaned.replace("…", "...")
    cleaned = re.sub(r'\.\s*\.\s*\.', '...', cleaned)

    # Straighten typographic double quotes.
    for curly_quote in ('“', '”'):
        cleaned = cleaned.replace(curly_quote, '"')

    # Collapse internal whitespace and trim the ends.
    return re.sub(r'\s+', ' ', cleaned).strip()

print(f"Loading model from {MODEL_PATH}...")

# Load Model Offline
# Both the tokenizer and the weights are read from MODEL_PATH so the log line
# above always names the directory that is actually loaded.
# local_files_only=True keeps the load from reaching out to the Hugging Face Hub.
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, local_files_only=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH, local_files_only=True)
    model = model.to(DEVICE)
    model.eval()  # inference mode: disables dropout etc.
    print("✅ Model loaded successfully!")
except Exception as e:
    print(f"❌ Error loading model: {e}")
    # Re-raise: continuing would leave `tokenizer`/`model` undefined (or stale
    # from a previous run) and the later cells would fail with a misleading error.
    raise

Loading model from /kaggle/input/flant5/pytorch/default/1...


2026-01-09 21:27:40.267959: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1767994060.509883      17 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1767994060.579001      17 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1767994061.155095      17 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767994061.155149      17 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767994061.155153      17 computation_placer.cc:177] computation placer alr

✅ Model loaded successfully!


In [3]:
class InferenceDataset(Dataset):
    """Map-style dataset that serves tokenised prompts for inference.

    Each row's ``transliteration`` value is cleaned with ``clean_text`` and
    prefixed with the task instruction; tokenisation happens lazily, per item.

    Args:
        df: Frame providing ``transliteration`` and ``id`` columns.
        tokenizer: Callable tokenizer invoked with ``padding``, ``truncation``,
            ``max_length`` and ``return_tensors`` keyword arguments.
    """

    def __init__(self, df, tokenizer):
        # Task instruction prepended to every cleaned source text.
        prompt_prefix = "translate Akkadian to English: "
        self.texts = [prompt_prefix + clean_text(raw) for raw in df['transliteration']]
        self.tokenizer = tokenizer
        self.ids = df['id'].tolist()

    def __len__(self):
        """Number of prompts available."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenise prompt ``idx`` and return it together with its row id.

        Returns:
            dict with ``input_ids``, ``attention_mask`` (leading dimension
            squeezed away) and the matching ``id``.
        """
        encoded = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=MAX_LENGTH,
            return_tensors="pt",
        )
        # return_tensors="pt" presumably yields a leading batch axis of size 1;
        # squeeze(0) drops it so the DataLoader can stack items itself.
        sample = {
            field: encoded[field].squeeze(0)
            for field in ("input_ids", "attention_mask")
        }
        sample["id"] = self.ids[idx]
        return sample

# Load Test Data
test_df = pd.read_csv(TEST_PATH)
test_dataset = InferenceDataset(test_df, tokenizer)
# shuffle=False keeps predictions in the same order as the rows of test_df.
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

all_predictions = []
all_ids = []

print(f"Starting Inference on {len(test_df)} rows...")

# No gradients are needed for generation; disabling them saves memory.
with torch.no_grad():
    for batch in tqdm(test_loader):
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)

        # Beam Search for best translation quality
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=MAX_LENGTH,
            num_beams=4,
            early_stopping=True
        )

        decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        all_predictions.extend([d.strip() for d in decoded])

        # The default collate function turns numeric ids into a tensor but
        # leaves string ids as a plain list, which has no .tolist().
        batch_ids = batch["id"]
        if torch.is_tensor(batch_ids):
            all_ids.extend(batch_ids.tolist())
        else:
            all_ids.extend(list(batch_ids))

# Every test row must have produced exactly one id and one prediction.
assert len(all_ids) == len(all_predictions) == len(test_df), (
    f"expected {len(test_df)} predictions, got {len(all_predictions)} "
    f"predictions for {len(all_ids)} ids"
)

print("Inference Complete.")

Starting Inference on 4 rows...


  0%|          | 0/1 [00:00<?, ?it/s]

Inference Complete.


In [4]:
# Final sanity check, done up front: an empty prediction is swapped for the
# "..." placeholder so no translation cell ends up blank.
filled_predictions = [
    prediction if len(prediction) > 0 else "..."
    for prediction in all_predictions
]

# Assemble the two-column submission frame (row id, translation).
submission = pd.DataFrame({
    "id": all_ids,
    "translation": filled_predictions,
})

# Write the file to the working directory, then show a short preview.
submission.to_csv("submission.csv", index=False)
print("submission.csv saved.")
print(submission.head())

submission.csv saved.
   id                                        translation
0   0  From the Kanesh colony to the Ah-alim, our mes...
1   1  On the tablet of the City I shall not take any...
2   2  When you hear our letter, either to the palace...
3   3  The top-packs I sent to the Kanesh colony and ...
