<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/TRANSLATOR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Report the GPU attached to this runtime (model, driver/CUDA version, memory in use).
!nvidia-smi

Sat Dec 20 14:39:44 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   42C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

## TRAINING

In [None]:
import torch
import gc
import logging
import warnings

# 1. SUPPRESS ALL WARNINGS (Corrected function name)
warnings.filterwarnings("ignore")
logging.getLogger("transformers").setLevel(logging.ERROR)

from transformers import (
    MBart50TokenizerFast,
    MBartForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)
from datasets import Dataset

# 2. CLEANUP
# Run a garbage-collection pass and release PyTorch's cached CUDA memory
# before loading the model, then select the GPU when one is available.
gc.collect()
torch.cuda.empty_cache()
device = "cuda" if torch.cuda.is_available() else "cpu"

# 3. LOAD MODEL
# Pretrained mBART-50 many-to-many checkpoint; the model is moved to `device`.
model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name).to(device)

# 4. VOCABULARY EXPANSION
# Register a new language-code token for Akkadian as an additional special token.
akk_token = "[akk_AK]"
tokenizer.add_special_tokens({'additional_special_tokens': [akk_token]})

# FIX: mean_resizing=False stops the multivariate normal distribution warning
# The embedding matrix is resized to the tokenizer's new length so that the
# added token has an embedding row.
model.resize_token_embeddings(len(tokenizer), mean_resizing=False)
# Record the new token's id in the tokenizer's language-code lookup table.
tokenizer.lang_code_to_id[akk_token] = tokenizer.convert_tokens_to_ids(akk_token)

# 5. DATASET
# Three parallel sentence pairs: index i of "akkadian" is paired with index i
# of "spanish".
data = {
    "akkadian": ["šarrum bītam iṣbat", "ilum ana bītim ittalak", "ekallam īpuš"],
    "spanish": ["el rey tomó la casa", "el dios fue a la casa", "él construyó el palacio"]
}
dataset = Dataset.from_dict(data)

def preprocess(examples):
    """Tokenize a batch of Akkadian/Spanish pairs for seq2seq training.

    Reads the notebook-level `tokenizer` and `akk_token`. The tokenizer's
    source language is set to the Akkadian token and its target language to
    "es_XX" before encoding, so both sides get the matching language code.

    Args:
        examples: Batch mapping with an "akkadian" list of source sentences
            and a "spanish" list of target sentences of the same length.

    Returns:
        The encoded source batch with an added "labels" entry. Sources and
        targets are truncated/padded to 64 tokens; padding positions in
        "labels" are set to -100 so they are excluded from the training loss.
    """
    tokenizer.src_lang = akk_token
    tokenizer.tgt_lang = "es_XX"
    model_inputs = tokenizer(examples["akkadian"], max_length=64, truncation=True, padding="max_length")
    labels = tokenizer(text_target=examples["spanish"], max_length=64, truncation=True, padding="max_length")

    # Targets are padded to a fixed length, so most label positions are
    # padding. Replace them with -100 (the index ignored by the loss);
    # otherwise the model is also trained to predict pad tokens.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

# Tokenize every pair; batched=True hands `preprocess` lists of examples.
tokenized_data = dataset.map(preprocess, batched=True)

# 6. TRAINING ARGS
# 100 epochs with a batch size of 2 per device. report_to="none" and
# save_strategy="no" turn off logging integrations and checkpoint saving for
# this run; logging_steps=50 sets the logging interval.
training_args = Seq2SeqTrainingArguments(
    output_dir="./akkadian_translator_v3",
    per_device_train_batch_size=2,
    num_train_epochs=100,
    learning_rate=5e-5,
    report_to="none",
    save_strategy="no",
    logging_steps=50,
    disable_tqdm=False
)

# 7. TRAINER (Using 'processing_class' to avoid v5.0 warnings)
# No evaluation dataset is passed, so only training is performed.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data,
    processing_class=tokenizer,
)

# 8. RUN
print("Starting Clean Training...")
trainer.train()
print("Training Finished!")

# 9. INFERENCE TEST
def translate(text):
    """Translate one Akkadian sentence to Spanish with the in-memory model.

    Uses the notebook-level `model`, `tokenizer`, `akk_token` and `device`.
    The model is switched to eval mode, the source language is set to the
    Akkadian token, and decoding is forced to start with the "es_XX" language
    code. Generation is deterministic (no sampling) and limited to 20 new
    tokens.

    Args:
        text: Akkadian input string.

    Returns:
        The first decoded sequence, with special tokens removed.
    """
    model.eval()
    tokenizer.src_lang = akk_token
    spanish_bos_id = tokenizer.lang_code_to_id["es_XX"]
    encoded = tokenizer(text, return_tensors="pt").to(device)

    # Inference only, so gradient tracking is disabled.
    with torch.no_grad():
        output_ids = model.generate(
            **encoded, forced_bos_token_id=spanish_bos_id, max_new_tokens=20, do_sample=False
        )

    decoded = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return decoded[0]

# Quick check of the fine-tuned model on one sentence from the training data.
print("\nAkkadian: šarrum bītam iṣbat")
print("Español:", translate("šarrum bītam iṣbat"))

In [None]:
!pip install colab-env -q

In [None]:
import colab_env
import os
from huggingface_hub import login
from transformers import MBart50TokenizerFast, MBartForConditionalGeneration

# 1. AUTHENTICATION
# The write token is read from the environment (presumably populated by the
# `colab_env` import above — TODO confirm). Without it nothing is saved or
# pushed and only an error message is printed.
access_token_write = os.getenv("HUGGINGFACE_ACCESS_TOKEN_WRITE")
if access_token_write:
    login(token=access_token_write, add_to_git_credential=True)

    # 2. RE-SAVE WITH ALL NECESSARY FILES
    # We save to a fresh directory to ensure no missing vocabulary files
    local_save_path = "./akkadian_final_export"
    print(f"Saving a complete copy to {local_save_path}...")

    # These commands ensure both the weights and the sentencepiece model are saved
    # NOTE(review): `model` and `tokenizer` are the objects left in the kernel
    # by the training cell; this cell does not create them.
    model.save_pretrained(local_save_path)
    tokenizer.save_pretrained(local_save_path)

    # 3. EXPORT TO HUGGING FACE
    repo_id = "frankmorales2020/kkadian-to-spanish-translator"
    print(f"Pushing to Hugging Face: {repo_id}...")

    # Pushing directly ensures the custom [akk_AK] token is preserved
    # The push uses the in-memory objects, not the local copy saved above.
    model.push_to_hub(repo_id)
    tokenizer.push_to_hub(repo_id)

    print(f"Successfully exported! URL: https://huggingface.co/{repo_id}")
else:
    print("Error: HUGGINGFACE_ACCESS_TOKEN_WRITE not found.")

In [16]:
import os

# Create the directory if it's missing
# (same path that was passed to the trainer as output_dir)
os.makedirs("./akkadian_translator_v3", exist_ok=True)

# Manually save the model and tokenizer currently in memory
# NOTE(review): depends on `model` and `tokenizer` defined by earlier cells.
print("Saving model and tokenizer to ./akkadian_translator_v3...")
model.save_pretrained("./akkadian_translator_v3")
tokenizer.save_pretrained("./akkadian_translator_v3")

# Verify the files are now present
print(f"Files now in directory: {os.listdir('./akkadian_translator_v3')}")

Saving model and tokenizer to ./akkadian_translator_v3...
Files now in directory: ['special_tokens_map.json', 'tokenizer_config.json', 'config.json', 'model.safetensors', 'generation_config.json']


In [17]:
# Force the tokenizer to save its source vocabulary file
tokenizer.save_vocabulary("./akkadian_translator_v3")

# Verify again - you MUST see 'sentencepiece.bpe.model' in this list
# NOTE(review): the output recorded for this cell does not list
# 'sentencepiece.bpe.model', so in that run the call above did not add the file.
print(f"Updated files: {os.listdir('./akkadian_translator_v3')}")

Updated files: ['special_tokens_map.json', 'tokenizer_config.json', 'config.json', 'model.safetensors', 'generation_config.json']


In [None]:
from huggingface_hub import HfApi
import os

# Upload the contents of the local model directory to the Hub model repo,
# one file per `upload_file` call.
api = HfApi()
repo_id = "frankmorales2020/kkadian-to-spanish-translator"
local_path = "./akkadian_translator_v3"

# Push every file to the Hub, ensuring we overwrite the incomplete versions
for file_name in os.listdir(local_path):
    file_path = os.path.join(local_path, file_name)
    # Only regular files are uploaded; subdirectories are skipped.
    if os.path.isfile(file_path):
        print(f"Force uploading {file_name}...")
        # Each file is placed at the repo root under its own name.
        api.upload_file(
            path_or_fileobj=file_path,
            path_in_repo=file_name,
            repo_id=repo_id,
            repo_type="model"
        )

print(f"\nSuccess! Your model is now fully functional at: https://huggingface.co/{repo_id}")

In [None]:
import os
from transformers import MBart50TokenizerFast
from huggingface_hub import hf_hub_download, HfApi

import shutil

# Rebuild a complete tokenizer folder (including the SentencePiece model file)
# from the base mBART-50 checkpoint and upload it to the model repo.
repo_id = "frankmorales2020/kkadian-to-spanish-translator"
local_dir = "./fixed_tokenizer"
os.makedirs(local_dir, exist_ok=True)

# 1. Download the base mBART-50 vocabulary (the file that is missing)
print("Obteniendo vocabulario base...")
vocab_path = hf_hub_download(repo_id="facebook/mbart-large-50-many-to-many-mmt", filename="sentencepiece.bpe.model")

# 2. Load the base tokenizer and add the custom special token
# NOTE: this replaces the notebook-level `tokenizer` object.
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
tokenizer.add_special_tokens({'additional_special_tokens': ['[akk_AK]']})

# 3. Save everything to the local folder, then copy the downloaded
#    SentencePiece model next to the saved tokenizer files
tokenizer.save_pretrained(local_dir)
shutil.copy(vocab_path, os.path.join(local_dir, "sentencepiece.bpe.model"))

# 4. Forced upload to Hugging Face
print("Subiendo archivos corregidos...")
api = HfApi()
for file_name in os.listdir(local_dir):
    file_path = os.path.join(local_dir, file_name)
    # `upload_file` only accepts regular files; skip any subdirectory
    # (e.g. a checkpoint folder created by the notebook UI) instead of failing.
    if not os.path.isfile(file_path):
        continue
    api.upload_file(
        path_or_fileobj=file_path,
        path_in_repo=file_name,
        repo_id=repo_id,
        repo_type="model"
    )
print("¡Hecho! Ahora el repositorio tiene el archivo .model necesario.")

## HF TESTING

In [24]:
from transformers import MBart50TokenizerFast, MBartForConditionalGeneration
import torch

# 1. Repository Configuration
repo_id = "frankmorales2020/kkadian-to-spanish-translator"

print(f"Loading corrected model and tokenizer from {repo_id}...")

# 2. Explicit Loading
# We use MBart50TokenizerFast to ensure compatibility with the updated files
# Both names are rebound here, so from this point `tokenizer` and `model`
# refer to the copies loaded from the Hub. The model is not moved to a GPU.
tokenizer = MBart50TokenizerFast.from_pretrained(repo_id)
model = MBartForConditionalGeneration.from_pretrained(repo_id)

def translate_from_hf(text):
    """Translate one Akkadian sentence to Spanish with the Hub-loaded model.

    Uses the notebook-level `tokenizer` and `model`. The source language is
    set to the custom "[akk_AK]" token and decoding is forced to start with
    the id of "es_XX". At most 50 new tokens are generated.

    Args:
        text: Akkadian input string.

    Returns:
        The first decoded sequence, with special tokens removed.
    """
    tokenizer.src_lang = "[akk_AK]"

    # Id of the Spanish language code, used as the forced first output token.
    spanish_bos_id = tokenizer.convert_tokens_to_ids("es_XX")
    encoded = tokenizer(text, return_tensors="pt")

    # Inference only, so gradient tracking is disabled.
    with torch.no_grad():
        output_ids = model.generate(**encoded, forced_bos_token_id=spanish_bos_id, max_new_tokens=50)

    decoded = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return decoded[0]


Loading corrected model and tokenizer from frankmorales2020/kkadian-to-spanish-translator...


In [25]:
# 3. Final Test
# Translate a single phrase with the Hub-loaded model and print it between
# two 30-character rules.
akkadian_phrase = "šarrum bītam iṣbat"
print("-" * 30)
print(f"Akkadian: {akkadian_phrase}")
print(f"Spanish:  {translate_from_hf(akkadian_phrase)}")
print("-" * 30)

------------------------------
Akkadian: šarrum bītam iṣbat
Spanish:  el rey tomó la casa
------------------------------


In [None]:
from transformers import MBart50TokenizerFast, MBartForConditionalGeneration
import torch

# Load the tokenizer and model from the Hub repo; this rebinds the
# notebook-level `tokenizer` and `model` names.
repo_id = "frankmorales2020/kkadian-to-spanish-translator"
tokenizer = MBart50TokenizerFast.from_pretrained(repo_id)
model = MBartForConditionalGeneration.from_pretrained(repo_id)

def translate(text):
    """Translate one Akkadian sentence to Spanish using beam search.

    Uses the notebook-level `tokenizer` and `model`. The source language is
    set to "[akk_AK]", decoding is forced to start with the id of "es_XX",
    and generation runs with 5 beams and at most 50 new tokens.

    Args:
        text: Akkadian input string.

    Returns:
        The first decoded sequence, with special tokens removed.
    """
    tokenizer.src_lang = "[akk_AK]"

    # Force Spanish output
    spanish_bos_id = tokenizer.convert_tokens_to_ids("es_XX")
    encoded = tokenizer(text, return_tensors="pt")

    # Inference only, so gradient tracking is disabled.
    with torch.no_grad():
        output_ids = model.generate(
            **encoded, forced_bos_token_id=spanish_bos_id, max_new_tokens=50, num_beams=5
        )

    decoded = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return decoded[0]

# Verification
# Translate one phrase with `translate` and print the result.
print(f"Final Result: {translate('šarrum bītam iṣbat')}")

In [None]:
# Verification
# NOTE(review): this cell repeats the check at the end of the previous cell
# with the same phrase.
print(f"Final Result: {translate('šarrum bītam iṣbat')}")

In [3]:
from transformers import MBart50TokenizerFast, MBartForConditionalGeneration
import torch

# 1. Repository Configuration
repo_id = "frankmorales2020/kkadian-to-spanish-translator"

# 2. Load Model and Tokenizer
# Rebinds the notebook-level `tokenizer` and `model` to copies loaded from the Hub.
tokenizer = MBart50TokenizerFast.from_pretrained(repo_id)
model = MBartForConditionalGeneration.from_pretrained(repo_id)

def translate_and_print(text):
    """Translate one Akkadian sentence to Spanish and print both versions.

    Uses the notebook-level `tokenizer` and `model`. The source language is
    set to "[akk_AK]", decoding is forced to start with the id of "es_XX",
    and generation runs with 5 beams and at most 60 new tokens. The source and
    the translation are printed between two 40-character rules.

    Args:
        text: Akkadian input string.

    Returns:
        None; the result is only printed.
    """
    tokenizer.src_lang = "[akk_AK]"
    spanish_bos_id = tokenizer.convert_tokens_to_ids("es_XX")
    encoded = tokenizer(text, return_tensors="pt")

    # Inference only, so gradient tracking is disabled.
    with torch.no_grad():
        output_ids = model.generate(
            **encoded, forced_bos_token_id=spanish_bos_id, max_new_tokens=60, num_beams=5
        )

    spanish = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]

    # One print call, one item per line: rule, source, translation, rule.
    rule = "-" * 40
    print(rule, f"AKKADIAN: {text}", f"SPANISH:  {spanish}", rule, sep="\n")

# 3. Final Verification Run
# Translate and print a single phrase.
translate_and_print("šarrum bītam iṣbat")

----------------------------------------
AKKADIAN: šarrum bītam iṣbat
SPANISH:  el rey tomó la casa
----------------------------------------


In [4]:
from transformers import MBart50TokenizerFast, MBartForConditionalGeneration
import torch

# Load the tokenizer and model from the Hub repo; this rebinds the
# notebook-level `tokenizer` and `model` names.
repo_id = "frankmorales2020/kkadian-to-spanish-translator"
tokenizer = MBart50TokenizerFast.from_pretrained(repo_id)
model = MBartForConditionalGeneration.from_pretrained(repo_id)

def test_model(sentences):
    tokenizer.src_lang = "[akk_AK]"
    es_id = tokenizer.convert_tokens_to_ids("es_XX")

    for text in sentences:
        inputs = tokenizer(text, return_tensors="pt")
        with torch.no_grad():
            generated_tokens = model.generate(
                **inputs,
                forced_bos_token_id=es_id,
                max_new_tokens=60,
                num_beams=5
            )
        translation = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

        print("-" * 40)
        print(f"AKKADIAN: {text}")
        print(f"SPANISH:  {translation}")

# List your test cases here
examples = [
    "šarrum bītam iṣbat",
    "ekallam īpuš"
]

# `test_model` prints a rule before each result only, so the closing rule is
# printed here.
test_model(examples)
print("-" * 40)

----------------------------------------
AKKADIAN: šarrum bītam iṣbat
SPANISH:  el rey tomó la casa
----------------------------------------
AKKADIAN: ekallam īpuš
SPANISH:  él construyó el palacio
----------------------------------------
