In [1]:
import os
import re
import unicodedata
import pandas as pd
import numpy as np
import torch
import gc
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from transformers import (
    AutoTokenizer, 
    AutoModelForSeq2SeqLM, 
    DataCollatorForSeq2Seq, 
    Seq2SeqTrainingArguments, 
    Seq2SeqTrainer
)
from datasets import Dataset, DatasetDict

# Release Python garbage and any cached CUDA blocks before doing anything else.
gc.collect()
torch.cuda.empty_cache()

# Pick the compute device once; later cells read the module-level `device`.
if torch.cuda.is_available():
    device = "cuda"
    print(f"Using device: {device}")
    print(f"GPU Count: {torch.cuda.device_count()}")
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
else:
    device = "cpu"
    print(f"Using device: {device}")

2026-01-09 21:12:00.094797: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1767993120.333202      23 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1767993120.404856      23 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1767993120.973427      23 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767993120.973460      23 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767993120.973463      23 computation_placer.cc:177] computation placer alr

Using device: cuda
GPU Count: 2
GPU Name: Tesla T4


In [2]:

# CSV with the training pairs; read by the cross-validation cell.
TRAIN_FILE = "/kaggle/input/deep-past-initiative-machine-translation/train.csv"

# Directory the tokenizer and model are loaded from (with local_files_only=True).
# NOTE(review): the recorded output of the tokenizer cell is an HFValidationError for
# this path, which presumably means the directory did not exist at that point —
# confirm the base checkpoint is actually present here before running.
MODEL_PATH = "/kaggle/working/flant5_small_final_optimized"


MAX_LENGTH = 384       # max_length passed to the tokenizer for both inputs and targets
BATCH_SIZE = 10         # per-device train and eval batch size
GRAD_ACCUMULATION = 8  # gradient_accumulation_steps
LEARNING_RATE = 1e-3   # learning_rate passed to the training arguments
NUM_EPOCHS = 3          # num_train_epochs (applied to every fold's trainer)
SEED = 42

# Seed torch and numpy. NOTE: the inference cell further down re-assigns
# MODEL_PATH, BATCH_SIZE and MAX_LENGTH.
torch.manual_seed(SEED)
np.random.seed(SEED)

In [3]:
def clean_text(text):
    """Normalize one raw text value into a trimmed, single-spaced string.

    Steps, in order: missing values (per ``pd.isna``) become ``""``; the value
    is converted with ``str`` and NFC-normalized; the ellipsis character and
    spaced-out dots (``. . .``) become ``...``; curly double quotes become
    straight double quotes; every whitespace run collapses to one space and
    the ends are stripped.

    Args:
        text: Any scalar value (string, number, ``None``, NaN, ...).

    Returns:
        The cleaned string, or ``""`` for missing input.
    """
    if pd.isna(text):
        return ""

    normalized = unicodedata.normalize('NFC', str(text))

    # Unify every ellipsis spelling to three plain dots.
    normalized = re.sub(r'\.\s*\.\s*\.', '...', normalized.replace("…", "..."))

    # Curly double quotes -> straight double quotes.
    for curly in ('“', '”'):
        normalized = normalized.replace(curly, '"')

    # Collapse whitespace last so the earlier replacements cannot leave gaps.
    return re.sub(r'\s+', ' ', normalized).strip()

def is_valid_translation(text):
    """Tell whether a cleaned translation is usable as a training target.

    A translation is rejected when it is falsy, shorter than two characters,
    or contains nothing but punctuation/symbols (i.e. no word character and
    no whitespace character at all).

    Args:
        text: The cleaned translation string.

    Returns:
        ``True`` if the text passes all checks, ``False`` otherwise.
    """
    if not text or len(text) < 2:
        return False

    # At least one word or whitespace character must be present; this is the
    # same condition as "something is left after deleting every [^\w\s]".
    return re.search(r'[\w\s]', text) is not None

def remove_noisy(df, outlier_path='/kaggle/input/outliers/oare_ids_outilers_iqr.txt'):
    """Build a boolean mask that is False for rows whose ``oare_id`` is listed as an outlier.

    The outlier list is a text file with one ID per line. Surrounding
    whitespace is stripped and blank lines are ignored, so an empty line in
    the file can never match a row.

    Args:
        df: DataFrame with an ``oare_id`` column.
        outlier_path: Path to the outlier ID file. Defaults to the Kaggle
            input location used by this notebook.

    Returns:
        A boolean Series aligned with ``df``: True = keep the row.

    Raises:
        OSError: If the outlier file cannot be opened.
        KeyError: If ``df`` has no ``oare_id`` column.
    """
    with open(outlier_path, 'r', encoding='utf-8') as f:
        # Iterate the file lazily; a set gives de-duplicated, fast membership.
        outlier_ids = {line.strip() for line in f if line.strip()}
    return ~df['oare_id'].isin(outlier_ids)
    
def preprocess_dataframe(df):
    """Clean the text columns and drop unusable rows.

    Both ``transliteration`` and ``translation`` are passed through
    ``clean_text``. A row is kept only if its ``oare_id`` is not in the
    outlier list (``remove_noisy``) AND its cleaned translation passes
    ``is_valid_translation``. Previously the outlier mask was computed but
    never applied to the returned frame, and the printed counts described a
    different frame than the one returned.

    Args:
        df: Raw DataFrame with ``oare_id``, ``transliteration`` and
            ``translation`` columns. It is not modified.

    Returns:
        A new DataFrame holding the cleaned, filtered rows (original index
        labels are preserved).
    """
    print(f"Original Row Count: {len(df)}")

    # `assign` returns a new frame, so the caller's DataFrame stays untouched.
    cleaned = df.assign(
        transliteration=df['transliteration'].apply(clean_text),
        translation=df['translation'].apply(clean_text),
    )

    not_outlier = remove_noisy(cleaned)
    # astype(bool) keeps the mask boolean even when the frame is empty.
    valid_mask = cleaned['translation'].apply(is_valid_translation).astype(bool)

    df_cleaned = cleaned[not_outlier & valid_mask].copy()

    print(f"Cleaned Row Count: {len(df_cleaned)}")
    print(f"Dropped {len(df) - len(df_cleaned)} rows (outlier IDs or empty/broken translations)")

    return df_cleaned

In [4]:

# Load the tokenizer from the local MODEL_PATH directory only (no hub download).
# NOTE(review): the recorded output of this cell is an HFValidationError for MODEL_PATH,
# which presumably means the directory was missing when it ran — verify the path.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, local_files_only=True)


def preprocess_function(examples):
    """Tokenize one batch of examples for seq2seq training.

    Each transliteration gets the task prefix prepended and is tokenized as
    the model input; each translation is tokenized as the target and stored
    under ``"labels"``. Both sides are truncated to ``MAX_LENGTH`` tokens and
    left unpadded, so padding is left to whatever collates the batches.

    Args:
        examples: Batch mapping with ``"transliteration"`` and
            ``"translation"`` sequences (as passed by a batched ``map``).

    Returns:
        The tokenizer output for the inputs, with an added ``"labels"`` entry
        holding the target token IDs.
    """
    task_prefix = "translate Akkadian to English: "
    sources = [task_prefix + str(src) for src in examples["transliteration"]]
    references = [str(tgt) for tgt in examples["translation"]]

    encoded = tokenizer(
        sources,
        max_length=MAX_LENGTH,
        truncation=True,
        padding=False,
    )

    # `text_target=` routes the strings through the tokenizer's target-side path.
    target_encoding = tokenizer(
        text_target=references,
        max_length=MAX_LENGTH,
        truncation=True,
        padding=False,
    )
    encoded["labels"] = target_encoding["input_ids"]

    return encoded

# The dataset `.map(preprocess_function, ...)` call is made per fold in the cross-validation cell.


HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '/kaggle/working/flant5_small_final_optimized'. Use `repo_type` argument if needed.

In [None]:
print(f"Loading model from {MODEL_PATH}...")

# Read the safetensors checkpoint from the local directory only, then move the
# model to the device selected in the first cell.
model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_PATH,
    local_files_only=True,
    use_safetensors=True,
).to(device)

# Collator that pads each batch of tokenized inputs/labels for this model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
# Training configuration shared by every fold's Seq2SeqTrainer.
# Evaluation and checkpointing both happen once per epoch, and only one checkpoint
# is kept (save_total_limit=1).
args = Seq2SeqTrainingArguments(
    # NOTE(review): directory name says "byt5" although the model is loaded from a
    # "flant5" path — presumably a leftover name; confirm.
    output_dir="byt5_small_akkadian_optimized",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=LEARNING_RATE,
    
    # MEMORY SAVING SETTINGS
    per_device_train_batch_size=BATCH_SIZE,        # BATCH_SIZE from the config cell (10 there)
    per_device_eval_batch_size=BATCH_SIZE,         # same value as the train batch size
    gradient_accumulation_steps=GRAD_ACCUMULATION, # 8 in the config cell
    gradient_checkpointing=True,                   # intended to lower activation memory (savings not measured here)
    optim="adafactor",                             # chosen for a smaller optimizer state than AdamW
    
    weight_decay=0.01,
    save_total_limit=1,
    num_train_epochs=NUM_EPOCHS,
    predict_with_generate=True,
    # NOTE(review): T5-family checkpoints are often reported to overflow under fp16 —
    # confirm the training loss stays finite.
    fp16=True,                  
    report_to="none",
    # No metric_for_best_model is set here, so "best" is whatever the library defaults to.
    load_best_model_at_end=True,
    logging_steps=50,
)

In [None]:
from sklearn.model_selection import KFold

# Load and clean the training data once; every fold is sliced from this frame.
df = pd.read_csv(TRAIN_FILE)
df = preprocess_dataframe(df)

# Every fold writes to this same directory, so it ends up holding the model
# state after the last fold.
final_path = "./flant5_small_final_optimized"

# One summary dict per fold (filled after each `trainer.train()`).
history = []

# random_state makes the shuffled splits reproducible across runs.
kfold = KFold(n_splits=5, shuffle=True, random_state=SEED)

# NOTE(review): the same `model` object is passed to every fold's trainer, so each
# fold starts from the weights left by the previous one. Later folds have therefore
# already trained on rows that appear in their validation split, and the per-fold
# eval numbers recorded in `history` are not independent estimates. Left unchanged
# because the saved model below is what the inference cell loads.
for fold, (train_indices, val_indices) in enumerate(
    tqdm(kfold.split(df), total=kfold.get_n_splits()), start=1
):
    train_df = df.iloc[train_indices, :]
    val_df = df.iloc[val_indices, :]

    raw_datasets = DatasetDict({
        "train": Dataset.from_pandas(train_df),
        "validation": Dataset.from_pandas(val_df),
    })

    print("Sample Transliteration:", train_df.iloc[0]['transliteration'],'\n')
    print("Sample Translation:", train_df.iloc[0]['translation'])

    # Drop every original column so only the tokenizer outputs remain.
    tokenized_datasets = raw_datasets.map(
        preprocess_function,
        batched=True,
        remove_columns=raw_datasets["train"].column_names,
    )
    print("Tokenization Complete.")

    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

    trainer = Seq2SeqTrainer(
        model=model,
        args=args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        data_collator=data_collator,
        tokenizer=tokenizer,
    )

    print("Starting Memory-Optimized Training...")
    trainer.train()

    # Record what the trainer already logged instead of running another eval pass.
    history.append({
        "fold": fold,
        "train_rows": len(train_df),
        "val_rows": len(val_df),
        "best_metric": trainer.state.best_metric,
        "eval_log": [entry for entry in trainer.state.log_history if "eval_loss" in entry],
    })

    # SAVE
    trainer.save_model(final_path)
    tokenizer.save_pretrained(final_path)
    print(f"Saved model to {final_path}")

In [None]:
# Display the `history` list created in the cross-validation cell.
history

In [None]:
# Save the model and tokenizer a second time under a different directory name.
# This uses the `trainer` variable left over from the last iteration of the
# cross-validation loop, so that cell must have been run first.
final_path = "./flanT5_small_CV_optimized"
trainer.save_model(final_path)
tokenizer.save_pretrained(final_path)
print(f"Saved model to {final_path}")

In [None]:
import os
import re
import unicodedata
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# --- CONFIGURATION ---
# NOTE: MODEL_PATH, BATCH_SIZE and MAX_LENGTH re-assign names that the training
# config cell also defines; after this cell runs, the training cells would see
# these values. The imports above likewise rebind `Dataset` (to torch.utils.data)
# and `tqdm` (to tqdm.auto).
TEST_PATH = "/kaggle/input/deep-past-initiative-machine-translation/test.csv"
MODEL_PATH = "/kaggle/working/flant5_small_final_optimized" 

BATCH_SIZE = 16   # Safe inference batch size
MAX_LENGTH = 384  # Must match training length
# Same device selection as the first cell, stored under a separate name.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# --- CLEANING FUNCTION (Must match training!) ---
def clean_text(text):
    """Clean one raw text value the same way the training cells do.

    Missing values (per ``pd.isna``) yield ``""``. Otherwise the value is
    stringified, NFC-normalized, has its ellipsis variants unified to ``...``,
    its curly double quotes straightened, and its whitespace collapsed to
    single spaces with the ends stripped.

    Args:
        text: Any scalar value.

    Returns:
        The cleaned string.
    """
    if pd.isna(text):
        return ""
    cleaned = unicodedata.normalize('NFC', str(text)).replace("…", "...")
    cleaned = re.sub(r'\.\s*\.\s*\.', '...', cleaned)
    cleaned = cleaned.replace('“', '"').replace('”', '"')
    return re.sub(r'\s+', ' ', cleaned).strip()

print(f"Loading model from {MODEL_PATH}...")

# Load Model Offline
# The path comes from MODEL_PATH (defined in this cell's configuration) rather than
# a repeated literal, so the log line above always matches what is actually loaded.
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, local_files_only=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH, local_files_only=True)
    model = model.to(DEVICE)
    model.eval()
    print("✅ Model loaded successfully!")
except Exception as e:
    print(f"❌ Error loading model: {e}")
    # Re-raise: continuing would let later cells run with whatever `model` and
    # `tokenizer` happen to be left in the kernel (or fail with a NameError).
    raise

In [None]:
class InferenceDataset(Dataset):
    """Map-style dataset that yields tokenized test rows for generation.

    At construction time every ``transliteration`` value is cleaned with
    ``clean_text`` and prefixed with the translation task prompt. Tokenization
    happens lazily in ``__getitem__``, padding/truncating each row to exactly
    ``MAX_LENGTH`` tokens.
    """

    def __init__(self, df, tokenizer):
        """Store the tokenizer, the row IDs and the prefixed, cleaned texts.

        Args:
            df: DataFrame with ``id`` and ``transliteration`` columns.
            tokenizer: Callable tokenizer used in ``__getitem__``.
        """
        self.tokenizer = tokenizer
        self.ids = df['id'].tolist()
        # Clean and prefix in a single pass.
        self.texts = [
            "translate Akkadian to English: " + clean_text(raw)
            for raw in df['transliteration']
        ]

    def __len__(self):
        """Return the number of rows."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize row ``idx``.

        Returns:
            Dict with ``input_ids`` and ``attention_mask`` (the tokenizer's
            tensors with the leading dimension squeezed away) and the row's ``id``.
        """
        encoded = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=MAX_LENGTH,
            return_tensors="pt",
        )
        item = {key: encoded[key].squeeze(0) for key in ("input_ids", "attention_mask")}
        item["id"] = self.ids[idx]
        return item

# Load Test Data
# Load Test Data
test_df = pd.read_csv(TEST_PATH)
test_dataset = InferenceDataset(test_df, tokenizer)
# shuffle=False keeps predictions in the same order as the rows of test_df.
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

all_predictions = []
all_ids = []

print(f"Starting Inference on {len(test_df)} rows...")

with torch.no_grad():
    for batch in tqdm(test_loader):
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)

        # Beam Search for best translation quality
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=MAX_LENGTH,
            num_beams=4,
            early_stopping=True
        )

        decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        all_predictions.extend([d.strip() for d in decoded])

        # The default collate function yields a tensor for numeric IDs but a plain
        # list for string IDs; accept both instead of assuming `.tolist()` exists.
        batch_ids = batch["id"]
        all_ids.extend(batch_ids.tolist() if torch.is_tensor(batch_ids) else list(batch_ids))

# Every test row must have exactly one ID and one prediction.
assert len(all_ids) == len(all_predictions) == len(test_df), "prediction count mismatch"

print("Inference Complete.")

In [None]:
# Create DataFrame
# Pair every test ID with its prediction.
submission = pd.DataFrame({"id": all_ids, "translation": all_predictions})

# Final sanity check: an empty prediction is replaced by a "..." placeholder.
submission["translation"] = submission["translation"].map(
    lambda pred: "..." if len(pred) == 0 else pred
)

# Write the submission file without the index column.
submission.to_csv("submission.csv", index=False)
print("submission.csv saved.")
print(submission.head())