<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/TRANSLATOR_ENG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install colab-env -q

In [None]:
import torch
import gc
import os
import logging
import warnings
from transformers import (
    MBart50TokenizerFast,
    MBartForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)
from datasets import Dataset

# 1. SETUP & CLEANUP
warnings.filterwarnings("ignore")
logging.getLogger("transformers").setLevel(logging.ERROR)
gc.collect()
torch.cuda.empty_cache()
device = "cuda" if torch.cuda.is_available() else "cpu"

# 2. LOAD MODEL
model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name).to(device)

# 3. VOCABULARY EXPANSION (Using your token logic)
# We add the custom Akkadian token just as in your original code
akk_token = "[akk_AK]"
tokenizer.add_special_tokens({'additional_special_tokens': [akk_token]})
model.resize_token_embeddings(len(tokenizer), mean_resizing=False)
tokenizer.lang_code_to_id[akk_token] = tokenizer.convert_tokens_to_ids(akk_token)

# 4. DATASET (Akkadian to English)
data = {
    "akkadian": ["šarrum bītam iṣbat", "ilum ana bītim ittalak", "ekallam īpuš"],
    "english": ["the king took the house", "the god went to the house", "he built the palace"]
}
dataset = Dataset.from_dict(data)

def preprocess(examples):
    tokenizer.src_lang = akk_token
    tokenizer.tgt_lang = "en_XX" # Targeted to English
    model_inputs = tokenizer(examples["akkadian"], max_length=64, truncation=True, padding="max_length")
    labels = tokenizer(text_target=examples["english"], max_length=64, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_data = dataset.map(preprocess, batched=True)

# 5. TRAINING
training_args = Seq2SeqTrainingArguments(
    output_dir="./akkadian_en_v1",
    per_device_train_batch_size=2,
    num_train_epochs=100,
    learning_rate=5e-5,
    report_to="none",
    save_strategy="no",
    logging_steps=50
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data,
    processing_class=tokenizer,
)

In [2]:
print("Starting Training...")
trainer.train()
print("Training Finished!")

Starting Training...
{'loss': 5.231, 'grad_norm': 15.058450698852539, 'learning_rate': 3.775e-05, 'epoch': 25.0}
{'loss': 0.0173, 'grad_norm': 0.11921962350606918, 'learning_rate': 2.525e-05, 'epoch': 50.0}
{'loss': 0.0006, 'grad_norm': 0.02486632764339447, 'learning_rate': 1.2750000000000002e-05, 'epoch': 75.0}
{'loss': 0.0002, 'grad_norm': 0.020006544888019562, 'learning_rate': 2.5000000000000004e-07, 'epoch': 100.0}
{'train_runtime': 62.4274, 'train_samples_per_second': 4.806, 'train_steps_per_second': 3.204, 'train_loss': 1.312280866568908, 'epoch': 100.0}
Training Finished!


In [None]:
# 6. EXPORT TO HUGGING FACE (With Repository Creation)
import colab_env
from huggingface_hub import login, HfApi, create_repo

access_token_write = os.getenv("HUGGINGFACE_ACCESS_TOKEN_WRITE")

if access_token_write:
    login(token=access_token_write, add_to_git_credential=True)
    api = HfApi()

    # 1. Define Repo ID
    repo_id = "frankmorales2020/akkadian-to-english-translator"

    # 2. CREATE THE REPOSITORY FIRST (This fixes the 404 error)
    print(f"Ensuring repository exists: {repo_id}...")
    try:
        create_repo(repo_id=repo_id, repo_type="model", exist_ok=True)
        print("Repository ready.")
    except Exception as e:
        print(f"Note on repo creation: {e}")

    # 3. Save files locally
    local_save_path = "./akkadian_en_final_export"
    os.makedirs(local_save_path, exist_ok=True)
    model.save_pretrained(local_save_path)
    tokenizer.save_pretrained(local_save_path)
    tokenizer.save_vocabulary(local_save_path)

    # 4. Upload files
    print(f"Pushing to Hugging Face...")
    for file_name in os.listdir(local_save_path):
        api.upload_file(
            path_or_fileobj=os.path.join(local_save_path, file_name),
            path_in_repo=file_name,
            repo_id=repo_id
        )
    print(f"Successfully exported! URL: https://huggingface.co/{repo_id}")
else:
    print("Error: HUGGINGFACE_ACCESS_TOKEN_WRITE not found.")

In [4]:
# 7. INFERENCE TEST
def translate(text):
    tokenizer.src_lang = akk_token
    inputs = tokenizer(text, return_tensors="pt").to(device)
    en_id = tokenizer.convert_tokens_to_ids("en_XX")

    with torch.no_grad():
        generated_tokens = model.generate(
            **inputs,
            forced_bos_token_id=en_id,
            max_new_tokens=40
        )
    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

print(f"\nVerification: {translate('šarrum bītam iṣbat')}")


Verification: the king took the house


## Expand the Akkadian dataset

In [None]:
import torch
import gc
import os
import logging
import warnings
from transformers import (
    MBart50TokenizerFast,
    MBartForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)
from datasets import Dataset

# 1. SETUP & CLEANUP
warnings.filterwarnings("ignore")
logging.getLogger("transformers").setLevel(logging.ERROR)
gc.collect()
torch.cuda.empty_cache()
device = "cuda" if torch.cuda.is_available() else "cpu"

# 2. LOAD MODEL
model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name).to(device)

# 3. VOCABULARY EXPANSION
akk_token = "[akk_AK]"
tokenizer.add_special_tokens({'additional_special_tokens': [akk_token]})
model.resize_token_embeddings(len(tokenizer), mean_resizing=False)
tokenizer.lang_code_to_id[akk_token] = tokenizer.convert_tokens_to_ids(akk_token)

# 4. DATASET (Akkadian to English)
data0 = {
    "akkadian": [
        "šarrum bītam iṣbat",
        "ilum ana bītim ittalak",
        "ekallam īpuš",
        "šumma awīlum bītam īpušma",
        "DIŠ TA ina ZI-ut UTU"
    ],
    "english": [
        "the king took the house",
        "the god went to the house",
        "he built the palace",
        "if a man builds a house",
        "the 1st at sunrise"
    ]
}


data = {
    "akkadian": [
        # --- Legal Formulas (Code of Hammurabi style) ---
        "šumma awīlum bītam īpušma",
        "šumma awīlum mār awīlim iṣbat",
        "dinum šū ina pī mātim liššakin",
        "šumma nēmtum ina ekallim ibašši",
        "šumma wardum bēlšu ittabal",
        "šumma mārī šiprim ittalkū",
        "šumma awīlum rugummâm ītaššū",
        "šumma tamkārum kaspam iddin",
        "šumma dīnum šū dīn kitti",
        "šumma šarrum mēšaram ištakan",

        # --- Astronomical Observations (Diary style) ---
        "DIŠ TA ina ZI-ut UTU",
        "ina rēš šattim rādu ištakan",
        "Sin u Šamaš itti ahāmeš innamrū",
        "kakkabū šamê ištēniš izzazzū",
        "mūšu šat-urri rādu rād",
        "bibbu ina libbi Zuqaqīpī ittiqi",
        "līmu ša Šat-Marduk ina Bābili",
        "šubtu nēhtu ina māti ibašši",
        "Idiglat u Purattu mīla imlū",
        "kakkab ra-bi-i ittanmar"
    ],
    "english": [
        # --- English Translations ---
        "if a man builds a house",
        "if a man seizes the son of a man",
        "let this judgment be established in the land",
        "if there is a grievance in the palace",
        "if a slave strikes his master",
        "if the messengers have departed",
        "if a man brings a legal claim",
        "if a merchant has given silver",
        "if this judgment is a just judgment",
        "if the king has established justice",

        "the 1st at sunrise",
        "at the beginning of the year a rainstorm occurred",
        "the Moon and Sun were seen together",
        "the stars of the heaven stand together",
        "during the morning watch there was a storm",
        "a planet passed through the heart of Scorpio",
        "the year of Shat-Marduk in Babylon",
        "there is a peaceful dwelling in the land",
        "the Tigris and Euphrates were filled with the flood",
        "a great star was seen"
    ]
}


dataset = Dataset.from_dict(data)

def preprocess(examples):
    tokenizer.src_lang = akk_token
    tokenizer.tgt_lang = "en_XX"
    model_inputs = tokenizer(examples["akkadian"], max_length=64, truncation=True, padding="max_length")
    labels = tokenizer(text_target=examples["english"], max_length=64, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_data = dataset.map(preprocess, batched=True)

# 5. TRAINING
training_args = Seq2SeqTrainingArguments(
    output_dir="./akkadian_en_v1",
    per_device_train_batch_size=2,
    num_train_epochs=100,
    learning_rate=5e-5,
    report_to="none",
    save_strategy="no",
    logging_steps=50
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data,
    processing_class=tokenizer,
)

trainer.train()

# 6. EXPORT TO HUGGING FACE
!pip install colab-env -q
import colab_env
from huggingface_hub import login, HfApi, create_repo

access_token_write = os.getenv("HUGGINGFACE_ACCESS_TOKEN_WRITE")

if access_token_write:
    login(token=access_token_write, add_to_git_credential=True)
    api = HfApi()
    repo_id = "frankmorales2020/akkadian-to-english-translator"

    try:
        create_repo(repo_id=repo_id, repo_type="model", exist_ok=True)
    except Exception as e:
        pass

    local_save_path = "./akkadian_en_final_export"
    os.makedirs(local_save_path, exist_ok=True)
    model.save_pretrained(local_save_path)
    tokenizer.save_pretrained(local_save_path)
    tokenizer.save_vocabulary(local_save_path)

    for file_name in os.listdir(local_save_path):
        api.upload_file(
            path_or_fileobj=os.path.join(local_save_path, file_name),
            path_in_repo=file_name,
            repo_id=repo_id
        )

In [12]:
# 7. INFERENCE TEST
def translate(text):
    tokenizer.src_lang = akk_token
    inputs = tokenizer(text, return_tensors="pt").to(device)
    en_id = tokenizer.convert_tokens_to_ids("en_XX")
    with torch.no_grad():
        generated_tokens = model.generate(
            **inputs,
            forced_bos_token_id=en_id,
            max_new_tokens=40
        )
    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

print(translate("šarrum bītam iṣbat"))

the year of Shat-tam in Babylon


In [None]:
import torch
from transformers import MBart50TokenizerFast, MBartForConditionalGeneration

# 1. SETUP
repo_id = "frankmorales2020/akkadian-to-english-translator"
device = "cuda" if torch.cuda.is_available() else "cpu"

print(f"Loading model from {repo_id}...")

# Use a standard language for initial loading to avoid the KeyError
tokenizer = MBart50TokenizerFast.from_pretrained(repo_id, src_lang="en_XX", tgt_lang="en_XX")
model = MBartForConditionalGeneration.from_pretrained(repo_id).to(device)

# 2. MANUALLY REGISTER THE CUSTOM TOKEN
# This ensures '[akk_AK]' exists in the mapping before we use it
akk_token = "[akk_AK]"
if akk_token not in tokenizer.lang_code_to_id:
    tokenizer.add_special_tokens({'additional_special_tokens': [akk_token]})
    tokenizer.lang_code_to_id[akk_token] = tokenizer.convert_tokens_to_ids(akk_token)

# 3. TEST FUNCTION
def translate_akkadian_to_english(text):
    # Now it is safe to set the source language
    tokenizer.src_lang = akk_token

    inputs = tokenizer(text, return_tensors="pt").to(device)
    en_id = tokenizer.convert_tokens_to_ids("en_XX")

    with torch.no_grad():
        generated_tokens = model.generate(
            **inputs,
            forced_bos_token_id=en_id,
            max_new_tokens=60,
            num_beams=5
        )

    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

In [10]:
# 4. RUN TESTS
test_phrases = [
    "šarrum bītam iṣbat",
    "ekallam īpuš",
    "šumma awīlum bītam īpušma",
    "DIŠ TA ina ZI-ut UTU"
]

print("\n" + "="*50)
print("TEST RESULTS")
print("="*50)

for phrase in test_phrases:
    result = translate_akkadian_to_english(phrase)
    print(f"AKKADIAN: {phrase}")
    print(f"ENGLISH:  {result}")
    print("-" * 50)


TEST RESULTS
AKKADIAN: šarrum bītam iṣbat
ENGLISH:  the king took the house
--------------------------------------------------
AKKADIAN: ekallam īpuš
ENGLISH:  he built the palace
--------------------------------------------------
AKKADIAN: šumma awīlum bītam īpušma
ENGLISH:  if a man builds a house
--------------------------------------------------
AKKADIAN: DIŠ TA ina ZI-ut UTU
ENGLISH:  the 1st at sunrise
--------------------------------------------------


In [14]:
import torch
from transformers import MBart50TokenizerFast, MBartForConditionalGeneration

# 1. SETUP
repo_id = "frankmorales2020/akkadian-to-english-translator"
device = "cuda" if torch.cuda.is_available() else "cpu"

print(f"Loading updated model from {repo_id}...")

# Initialize with standard codes to prevent initialization errors
tokenizer = MBart50TokenizerFast.from_pretrained(repo_id, src_lang="en_XX", tgt_lang="en_XX")
model = MBartForConditionalGeneration.from_pretrained(repo_id).to(device)

# 2. REGISTER CUSTOM TOKEN
akk_token = "[akk_AK]"
if akk_token not in tokenizer.lang_code_to_id:
    tokenizer.add_special_tokens({'additional_special_tokens': [akk_token]})
    tokenizer.lang_code_to_id[akk_token] = tokenizer.convert_tokens_to_ids(akk_token)

# 3. TRANSLATION FUNCTION
def test_translate(text):
    tokenizer.src_lang = akk_token
    inputs = tokenizer(text, return_tensors="pt").to(device)
    en_id = tokenizer.convert_tokens_to_ids("en_XX")

    with torch.no_grad():
        generated_tokens = model.generate(
            **inputs,
            forced_bos_token_id=en_id,
            max_new_tokens=60,
            num_beams=5
        )
    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

# 4. COMPREHENSIVE TEST SUITE
# Testing legal, astronomical, and environmental categories
new_test_data = {
    "Legal Formulas": [
        "šumma wardum bēlšu ittabal",
        "šumma tamkārum kaspam iddin",
        "šumma šarrum mēšaram ištakan"
    ],
    "Astronomical Observations": [
        "Sin u Šamaš itti ahāmeš innamrū",
        "bibbu ina libbi Zuqaqīpī ittiqi",
        "kakkab ra-bi-i ittanmar"
    ],
    "Environmental/Historical": [
        "Idiglat u Purattu mīla imlū",
        "ina rēš šattim rādu ištakan",
        "šubtu nēhtu ina māti ibašši"
    ]
}

print("\n" + "="*60)
print("EXPANDED AKKADIAN-TO-ENGLISH TEST RESULTS")
print("="*60)

for category, phrases in new_test_data.items():
    print(f"\n>>> CATEGORY: {category}")
    print("-" * 30)
    for phrase in phrases:
        result = test_translate(phrase)
        print(f"AK: {phrase}")
        print(f"EN: {result}\n")

Loading updated model from frankmorales2020/akkadian-to-english-translator...

EXPANDED AKKADIAN-TO-ENGLISH TEST RESULTS

>>> CATEGORY: Legal Formulas
------------------------------
AK: šumma wardum bēlšu ittabal
EN: if a slave strikes his master

AK: šumma tamkārum kaspam iddin
EN: if a merchant has given silver

AK: šumma šarrum mēšaram ištakan
EN: if the king has established justice


>>> CATEGORY: Astronomical Observations
------------------------------
AK: Sin u Šamaš itti ahāmeš innamrū
EN: the Moon and Sun were seen together

AK: bibbu ina libbi Zuqaqīpī ittiqi
EN: a planet passed through the heart of Scorpio

AK: kakkab ra-bi-i ittanmar
EN: a great star was seen


>>> CATEGORY: Environmental/Historical
------------------------------
AK: Idiglat u Purattu mīla imlū
EN: the Tigris and Euphrates were filled with the flood

AK: ina rēš šattim rādu ištakan
EN: at the beginning of the year a rainstorm occurred

AK: šubtu nēhtu ina māti ibašši
EN: there is a peaceful dwelling in the l