# 🏛️ META NLLB-200 + DATASETS MASIVOS (OPUS)

In [None]:
!pip install -q transformers[sentencepiece] datasets accelerate -U sacremoses

import pandas as pd
from datasets import Dataset, load_dataset, concatenate_datasets
from transformers import AutoModelForSeq2SeqLM, NllbTokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer
import os
import shutil
from google.colab import files

# 1️⃣ Cargar Dataset Local

In [None]:
# Load the user's own parallel corpus from a '|'-separated CSV uploaded
# through the Colab file picker.
print("üìÇ Cargando tu dataset local...")
uploaded = files.upload()  # mapping keyed by uploaded file name

# Guard the first-key lookup below: an empty mapping (presumably what a
# cancelled upload yields -- TODO confirm) would otherwise surface as an
# opaque lookup error.
if not uploaded:
    raise RuntimeError("No se subió ningún archivo: sube un CSV separado por '|'.")
file_name = next(iter(uploaded))

# quoting=3 is csv.QUOTE_NONE: quote characters inside sentences are kept as
# literal text instead of being parsed as field delimiters.
df_local = pd.read_csv(file_name, sep='|', quoting=3)

# Later cells index the data by the "latin" and "spanish" columns, so fail
# here with a clear message instead of deep inside a Dataset.map call.
missing_columns = [col for col in ("latin", "spanish") if col not in df_local.columns]
if missing_columns:
    raise ValueError(f"Al CSV le faltan las columnas requeridas: {missing_columns}")

# Rows with a missing sentence on either side cannot be tokenized; drop them
# and report how many were discarded.
rows_before = len(df_local)
df_local = df_local.dropna(subset=["latin", "spanish"])
if len(df_local) != rows_before:
    print(f"Se descartaron {rows_before - len(df_local)} filas con texto vacío.")

# preserve_index=False keeps the (now possibly non-contiguous) pandas index
# from being materialised as an extra dataset column.
ds_local = Dataset.from_pandas(df_local, preserve_index=False)

# 2️⃣ Cargar Datasets Remotos (OPUS)

In [None]:
# Best-effort download of the remote Latin/Spanish Bible corpus. Any failure
# is reported and leaves ds_opus as None so later cells fall back to the
# local data only.
# NOTE(review): confirm that the "opus_bible_uedin" dataset id and the
# lang1/lang2 arguments resolve with the installed `datasets` version.
print("üåê Descargando datasets de OPUS (Biblia)... ")
try:
    ds_opus = load_dataset(
        "opus_bible_uedin",
        lang1="la",
        lang2="es",
        split="train",
    )
    print(f"‚úÖ OPUS cargado: {len(ds_opus)} frases nuevas.")
except Exception as e:
    # Deliberately broad: the remote corpus is optional.
    ds_opus = None
    print(f"‚ö†Ô∏è Error cargando OPUS: {e}. Usando solo local.")

# 3️⃣ Unificar y Limpiar

In [None]:
def format_data(example):
    """Normalise one row to the flat ``{"latin", "spanish"}`` schema.

    Two input layouts are accepted:

    * rows carrying a nested ``"translation"`` mapping with ``"la"`` and
      ``"es"`` entries, and
    * rows that already expose ``"latin"`` and ``"spanish"`` fields.

    A fresh two-key dict is always returned. The callers in this file pass
    ``remove_columns=<all original columns>`` to ``Dataset.map``; returning
    the input row untouched in that situation can leave the mapped dataset
    without any columns (behaviour depends on the installed ``datasets``
    version), so the output columns are always produced explicitly.

    Args:
        example: Mapping for a single row.

    Returns:
        dict with exactly the keys ``"latin"`` and ``"spanish"``.

    Raises:
        KeyError: If the row has neither a ``"translation"`` entry nor both
            ``"latin"`` and ``"spanish"`` fields.
    """
    if 'translation' in example:
        pair = example['translation']
        return {"latin": pair['la'], "spanish": pair['es']}

    missing = [name for name in ("latin", "spanish") if name not in example]
    if missing:
        raise KeyError(f"row is missing required field(s): {missing}")
    return {"latin": example["latin"], "spanish": example["spanish"]}

# Bring the local corpus to the shared schema, dropping its original columns.
ds_local = ds_local.map(
    format_data,
    remove_columns=ds_local.column_names,
)

# Start from the local data and append the remote corpus only when it was
# loaded and is non-empty (a Dataset with zero rows is falsy).
ds_final = ds_local
if ds_opus:
    ds_opus = ds_opus.map(
        format_data,
        remove_columns=ds_opus.column_names,
    )
    ds_final = concatenate_datasets([ds_local, ds_opus])

print(f"üöÄ Total frases combinadas: {len(ds_final)}")

# 4️⃣ Bidireccionalidad Dinámica

In [None]:
def make_bidirectional(batch):
    """Duplicate a batch so every sentence pair appears in both directions.

    The first half of each output list is Latin -> Spanish, the second half
    is the same pairs reversed (Spanish -> Latin).

    Args:
        batch: Mapping with parallel lists under "latin" and "spanish".

    Returns:
        dict of four equally long lists: "src", "tgt", "src_lang" and
        "tgt_lang", each twice the length of the input batch.
    """
    # NOTE(review): confirm that "lat_Latn" is a language code the NLLB
    # tokenizer actually knows before relying on these tags.
    latin, spanish = batch["latin"], batch["spanish"]
    n_latin, n_spanish = len(latin), len(spanish)
    return {
        "src": latin + spanish,
        "tgt": spanish + latin,
        "src_lang": ["lat_Latn"] * n_latin + ["spa_Latn"] * n_spanish,
        "tgt_lang": ["spa_Latn"] * n_latin + ["lat_Latn"] * n_spanish,
    }

# make_bidirectional returns twice as many rows as it receives, so it has to
# run in batched mode and every original column has to be dropped.
ds_bi = ds_final.map(
    function=make_bidirectional,
    batched=True,
    remove_columns=ds_final.column_names,
)

# 5️⃣ Modelo y Tokenizer (META NLLB)

In [None]:
# Checkpoint identifier shared by the tokenizer and the seq2seq model so the
# two always come from the same release.
MODEL_NAME = "facebook/nllb-200-distilled-600M"

tokenizer = NllbTokenizer.from_pretrained(
    pretrained_model_name_or_path=MODEL_NAME,
)
model = AutoModelForSeq2SeqLM.from_pretrained(
    pretrained_model_name_or_path=MODEL_NAME,
)

# 6️⃣ Entrenamiento PRO

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of source/target sentences for seq2seq training.

    Both sides are truncated and padded to a fixed length of 128 tokens.
    Padding positions in the labels are replaced with -100, the index that
    PyTorch's cross-entropy loss ignores by default, so the loss is not
    computed on padding.

    Args:
        examples: Batch mapping with parallel lists under "src" (input
            sentences) and "tgt" (reference translations).

    Returns:
        The tokenizer's encoding of "src" with an added "labels" entry
        holding the masked target token ids.
    """
    # NOTE(review): the "src_lang"/"tgt_lang" columns produced by
    # make_bidirectional are not consulted here, so both translation
    # directions are encoded with whatever language tags the tokenizer
    # currently has set. Confirm whether per-direction tags are intended.

    # `text_target` replaces the deprecated `as_target_tokenizer()` context
    # manager; the target ids come back under "labels".
    model_inputs = tokenizer(
        examples["src"],
        text_target=examples["tgt"],
        max_length=128,
        truncation=True,
        padding="max_length",
    )

    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in model_inputs["labels"]
    ]
    return model_inputs

import inspect

# Tokenize every pair, then hold out 5% for evaluation. The fixed seed makes
# the train/eval split reproducible across runs.
tokenized_ds = ds_bi.map(preprocess_function, batched=True).train_test_split(test_size=0.05, seed=42)

# The install cell pulls the latest transformers release, and two keyword
# names used here were renamed across versions ("evaluation_strategy" ->
# "eval_strategy", Trainer "tokenizer" -> "processing_class"). Pick whichever
# spelling the installed version accepts instead of hard-coding one.
args_params = inspect.signature(Seq2SeqTrainingArguments.__init__).parameters
eval_strategy_key = "eval_strategy" if "eval_strategy" in args_params else "evaluation_strategy"

training_args = Seq2SeqTrainingArguments(
    output_dir="./nllb_pro_latin_es",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,  # 8 x 4 = 32 examples per optimizer step per device
    num_train_epochs=3,
    learning_rate=1e-5,
    weight_decay=0.01,
    fp16=True,
    logging_steps=500,
    save_total_limit=3,
    eval_steps=2000,
    save_steps=2000,  # kept equal to eval_steps so each checkpoint has an eval result
    predict_with_generate=True,
    report_to="none",
    **{eval_strategy_key: "steps"},
)

trainer_params = inspect.signature(Seq2SeqTrainer.__init__).parameters
tokenizer_key = "processing_class" if "processing_class" in trainer_params else "tokenizer"

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    **{tokenizer_key: tokenizer},
)

print("üíé Iniciando entrenamiento PRO...")
trainer.train()

# 7️⃣ Guardar

In [None]:
# Write the model weights and the tokenizer files into one directory so the
# pair can be reloaded together.
model_dir = "./modelo_pro_nllb"
model.save_pretrained(save_directory=model_dir)
tokenizer.save_pretrained(save_directory=model_dir)

# Zip that directory (make_archive appends the ".zip" suffix itself) and hand
# the archive to the browser through Colab's download helper.
shutil.make_archive(
    base_name="modelo_pro_completo",
    format='zip',
    root_dir=model_dir,
)
files.download("modelo_pro_completo.zip")