# 🌏 META NLLB-200: ENTRENAMIENTO BIDIRECCIONAL PERSONALIZADO

In [None]:
!pip install -q transformers[sentencepiece] datasets accelerate -U sacremoses

import csv
import inspect
import os
import shutil

import pandas as pd
from datasets import Dataset
from google.colab import files
from transformers import AutoModelForSeq2SeqLM, NllbTokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer

# 1️⃣ Carga de Datos

In [None]:
# Ask the user for the parallel corpus; the keys of the returned mapping are
# used as file names below.
uploaded = files.upload()
if not uploaded:
    # Nothing was selected: stop here with a clear message instead of an
    # IndexError on the file-name lookup.
    raise ValueError("No file was uploaded; expected a '|'-separated parallel corpus.")
file_name = next(iter(uploaded))

# Fields are split on '|' with quote handling switched off, so quotation marks
# inside sentences stay literal text. The first line is read as the header row
# (pandas default).
df = pd.read_csv(file_name, sep='|', quoting=csv.QUOTE_NONE)

# Later cells rename the columns to exactly two names, so reject any other
# layout here, where the cause is obvious.
if df.shape[1] != 2:
    raise ValueError(
        f"Expected exactly 2 '|'-separated columns in {file_name!r}, found {df.shape[1]}."
    )

# A row with a missing side cannot be tokenized as a translation pair.
df = df.dropna().reset_index(drop=True)

# 2️⃣ Aumentar datos para BIDIRECCIONALIDAD

In [None]:
# Build both translation directions from the two-column corpus: the first
# column is tagged with the Latin code and the second with the Spanish code.
# NOTE(review): confirm that "lat_Latn" is a language code the NLLB-200
# checkpoint actually knows; Latin does not appear to be in its published
# language list.

# Forward direction: first column is the source, second column the target.
df_la_es = df.set_axis(["src", "tgt"], axis=1).assign(
    src_lang="lat_Latn",
    tgt_lang="spa_Latn",
)

# Reverse direction: the same rows with the column roles swapped.
df_es_la = df.set_axis(["tgt", "src"], axis=1).assign(
    src_lang="spa_Latn",
    tgt_lang="lat_Latn",
)

# concat aligns on column names, so the swapped column order above is harmless.
df_bidirectional = pd.concat((df_la_es, df_es_la), ignore_index=True)
total_rows = len(df_bidirectional)
print(f"‚úÖ Dataset bidireccional listo: {total_rows} filas.")

# 3️⃣ Cargar Modelo de META

In [None]:
# Hub identifier of the distilled 600M NLLB-200 checkpoint to fine-tune.
MODEL_NAME = "facebook/nllb-200-distilled-600M"

# Weights and tokenizer are loaded from the same checkpoint name so the
# vocabulary ids line up with the embedding table.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
tokenizer = NllbTokenizer.from_pretrained(MODEL_NAME)

# 4️⃣ Entrenamiento

In [None]:
# Convert the bidirectional frame to a Hugging Face dataset and hold out 5%
# of the rows for evaluation.
dataset = Dataset.from_pandas(df_bidirectional).shuffle(seed=42)

# train_test_split shuffles again by default, so it needs its own seed;
# without one the held-out rows differ on every run.
# NOTE(review): each sentence pair occurs twice in df_bidirectional (once per
# direction), so a pair's mirror image can sit in the training split while the
# pair itself is evaluated; the eval loss may be optimistic. Consider
# splitting before mirroring.
dataset_split = dataset.train_test_split(test_size=0.05, seed=42)

def preprocess(examples, max_length=128):
    """Tokenize a batch of translation pairs for seq2seq fine-tuning.

    Rows are grouped by their (src_lang, tgt_lang) pair and the tokenizer is
    switched to those languages before each group is encoded, so the language
    tags in the batch are actually applied. Padding positions in the labels
    are replaced by -100 so that they do not contribute to the loss.

    Args:
        examples: Batch mapping with parallel lists under "src" and "tgt".
            Optional "src_lang" / "tgt_lang" lists give the language code of
            each row; when a column is absent the tokenizer's current setting
            is used for every row.
        max_length: Length that both sources and targets are truncated and
            padded to.

    Returns:
        A dict with the tokenizer's output columns plus "labels", with rows
        in the same order as the input batch.
    """
    sources = examples["src"]
    targets = examples["tgt"]
    row_count = len(sources)
    src_langs = examples["src_lang"] if "src_lang" in examples else [None] * row_count
    tgt_langs = examples["tgt_lang"] if "tgt_lang" in examples else [None] * row_count

    # The tokenizer carries one source/target language at a time, so encode
    # each translation direction present in the batch separately.
    rows_by_direction = {}
    for row, direction in enumerate(zip(src_langs, tgt_langs)):
        rows_by_direction.setdefault(direction, []).append(row)

    columns = {}
    saved_src = tokenizer.src_lang
    saved_tgt = tokenizer.tgt_lang
    try:
        for (src_lang, tgt_lang), rows in rows_by_direction.items():
            if src_lang is not None:
                tokenizer.src_lang = src_lang
            if tgt_lang is not None:
                tokenizer.tgt_lang = tgt_lang
            # text_target encodes the references in target mode and stores
            # them under "labels"; it replaces the deprecated
            # as_target_tokenizer() context manager.
            encoded = tokenizer(
                [sources[row] for row in rows],
                text_target=[targets[row] for row in rows],
                max_length=max_length,
                truncation=True,
                padding="max_length",
            )
            # Scatter the group's rows back to their original batch positions.
            for key, values in encoded.items():
                column = columns.setdefault(key, [None] * row_count)
                for row, value in zip(rows, values):
                    column[row] = value
    finally:
        # Leave the shared tokenizer as it was found, so its state does not
        # depend on which direction happened to be encoded last.
        tokenizer.src_lang = saved_src
        tokenizer.tgt_lang = saved_tgt

    # -100 is the index the cross-entropy loss ignores.
    pad_id = tokenizer.pad_token_id
    columns["labels"] = [
        [token if token != pad_id else -100 for token in label_ids]
        for label_ids in columns.get("labels", [])
    ]
    return columns

# Tokenize both splits; batched=True hands preprocess a mapping of column
# lists rather than one row at a time.
tokenized_data = dataset_split.map(function=preprocess, batched=True)

# Hyperparameters for the fine-tuning run.
#
# The evaluation-schedule keyword was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases, and the install cell upgrades
# to the latest one; use whichever name the installed version accepts.
_training_arg_names = inspect.signature(Seq2SeqTrainingArguments.__init__).parameters
_eval_schedule_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = Seq2SeqTrainingArguments(
    output_dir="./nllb_latin_es_bi",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,  # effective batch of 8 * 4 = 32 per device
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    predict_with_generate=True,
    fp16=True,  # mixed precision; expects a GPU runtime
    save_total_limit=2,  # keep only the two most recent checkpoints
    report_to="none",
    **{_eval_schedule_key: "epoch"},
)

# Wire the model, the hyperparameters and the tokenized splits together, then
# run the fine-tuning.
#
# Newer transformers releases take the tokenizer through `processing_class`
# and phase out the older `tokenizer` keyword; pass it under whichever name the
# installed version declares.
_trainer_arg_names = inspect.signature(Seq2SeqTrainer.__init__).parameters
_tokenizer_key = "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    **{_tokenizer_key: tokenizer},
)

print("üöÄ Iniciando entrenamiento con Meta NLLB...")
trainer.train()

# 5️⃣ Guardado

In [None]:
# Write the fine-tuned weights and the tokenizer into the same folder.
model_dir = "./modelo_meta_final"
for artifact in (model, tokenizer):
    artifact.save_pretrained(model_dir)

# Zip that folder and hand the archive to the Colab download helper.
archive_base = "nllb_bi_final_zip"
shutil.make_archive(archive_base, 'zip', model_dir)
files.download(archive_base + ".zip")