# 0️⃣ Instalar dependencias

In [None]:
!pip install -q transformers[sentencepiece] datasets accelerate -U sentencepiece tokenizers sacremoses

import pandas as pd
from datasets import Dataset
from transformers import MarianMTModel, MarianTokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer
import pickle
import shutil
import os
from google.colab import files
import torch

# 1️⃣ Subir dataset CSV

In [None]:
# Ask the user for the CSV through the Colab upload widget. `uploaded` is
# indexed by file name below; only the first uploaded file is used.
uploaded = files.upload()
if not uploaded:
    # Without this guard an empty result surfaces as an opaque IndexError
    # on the "first file" lookup.
    raise ValueError(
        "No se subió ningún archivo; vuelve a ejecutar la celda y selecciona el CSV."
    )
file_name = next(iter(uploaded))

# The file is pipe-separated. quoting=3 is csv.QUOTE_NONE: quote characters
# inside the text are kept as ordinary characters.
df = pd.read_csv(file_name, sep='|', quoting=3)

# Later steps read df["latin"] and df["spanish"]; fail here with a clear
# message instead of a KeyError further down.
missing_columns = [col for col in ("latin", "spanish") if col not in df.columns]
if missing_columns:
    raise ValueError(
        f"Faltan columnas en el CSV: {missing_columns}. "
        f"Columnas encontradas: {list(df.columns)}"
    )

print(f"‚úÖ Dataset cargado. Total filas: {len(df)}")
print(df.head(3))

# 2️⃣ Cargar MarianMT base y su Tokenizer

In [None]:
# NOTE: this single tokenizer instance is reused by every later step
# (preprocessing, training and saving), so tokenization stays consistent.
# Per the original author's note, the 'itc-itc' checkpoint targets Romance
# languages, Latin included (not verifiable from this file alone).
MODEL_NAME = "Helsinki-NLP/opus-mt-itc-itc"

tokenizer = MarianTokenizer.from_pretrained(MODEL_NAME)
model = MarianMTModel.from_pretrained(MODEL_NAME)

# Report the vocabulary size as a quick sanity check that loading worked.
vocab_size = len(tokenizer.get_vocab())
print(f"‚úÖ Tokenizer cargado. Vocab size: {vocab_size}")

# 3️⃣ Preparar Dataset BIDIRECCIONAL

In [None]:
# Build the bidirectional corpus: every (latin, spanish) row is emitted twice,
# once per translation direction, with a target-language token prefixed to the
# source text so that one model can serve both directions.
#
# Uses `tokenizer` (loaded earlier in this file) to validate the prefixes.


def _pick_lang_token(candidates):
    """Return the first candidate present in the tokenizer vocabulary.

    If none is found, the last candidate (the prefix this script originally
    used) is returned and a warning is printed, so the script never aborts here.
    """
    vocab = tokenizer.get_vocab()
    for token in candidates:
        if token in vocab:
            return token
    fallback = candidates[-1]
    print(
        f"AVISO: ninguno de {list(candidates)} está en el vocabulario del "
        f"tokenizer; se usa {fallback}"
    )
    return fallback


# NOTE(review): the opus-mt-itc-itc model card appears to list three-letter
# target IDs (e.g. spa, lat_Latn) rather than es/la -- confirm against the
# checkpoint. A prefix that is not a vocabulary entry cannot act as a dedicated
# language token, so the known spellings are tried first and the original
# two-letter prefixes are kept only as a fallback.
PREFIX_TO_SPANISH = _pick_lang_token((">>spa<<", ">>es<<"))
PREFIX_TO_LATIN = _pick_lang_token((">>lat_Latn<<", ">>lat<<", ">>la<<"))
# Printed because the same prefixes must be used again at inference time.
print(f"Prefijos de idioma: ->es {PREFIX_TO_SPANISH} | ->la {PREFIX_TO_LATIN}")

# Drop rows with a missing side: astype(str) would otherwise turn NaN into the
# literal text "nan" and feed it to the model as a training sentence.
pairs = df.dropna(subset=["latin", "spanish"])
dropped_rows = len(df) - len(pairs)
if dropped_rows:
    print(f"Se descartaron {dropped_rows} filas con 'latin' o 'spanish' vacío")

latin_text = pairs["latin"].astype(str)
spanish_text = pairs["spanish"].astype(str)

# Latin -> Spanish direction.
df_la_es = pairs.copy()
df_la_es["src"] = PREFIX_TO_SPANISH + " " + latin_text
df_la_es["tgt"] = spanish_text

# Spanish -> Latin direction (same rows, columns swapped).
df_es_la = pairs.copy()
df_es_la["src"] = PREFIX_TO_LATIN + " " + spanish_text
df_es_la["tgt"] = latin_text

df_final = pd.concat([df_la_es[["src", "tgt"]], df_es_la[["src", "tgt"]]], ignore_index=True)
# Report the real per-direction counts instead of a hard-coded "32k".
print(f"‚úÖ Dataset ampliado: {len(df_final)} filas ({len(df_la_es)} la->es + {len(df_es_la)} es->la)")

# 4️⃣ Tokenizar dataset

In [None]:
# Convert the pandas frame into a Hugging Face Dataset and shuffle it with a
# fixed seed.
dataset = Dataset.from_pandas(df_final).shuffle(seed=42)

# Hold out 5% for evaluation. The split is seeded too; otherwise a different
# evaluation set is drawn on every run and eval losses are not comparable.
# NOTE(review): both directions of a sentence pair live in `df_final`, so a
# random split can put la->es of a pair in train and es->la of the same pair
# in test -- eval loss may look optimistic.
dataset_split = dataset.train_test_split(test_size=0.05, seed=42)

def preprocess(examples):
    """Tokenize a batch of source/target sentence pairs for seq2seq training.

    Args:
        examples: Batch mapping with a "src" and a "tgt" entry, each a list
            of strings (as supplied by ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for "src" with an added "labels" entry holding
        the token ids of "tgt". Both sides are truncated/padded to 128 tokens.
        Padding positions in "labels" are set to -100 so the loss skips them.
    """
    model_inputs = tokenizer(
        examples["src"], max_length=128, truncation=True, padding="max_length"
    )
    labels = tokenizer(
        text_target=examples["tgt"], max_length=128, truncation=True, padding="max_length"
    )

    # Labels are padded to a fixed length, so most positions are padding.
    # -100 is the ignore index of the cross-entropy loss; without this
    # masking the model is also trained to predict the pad token.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

# Run `preprocess` over both splits in batches (each call gets lists of rows).
tokenized_data = dataset_split.map(function=preprocess, batched=True)

# 5️⃣ Configurar entrenamiento

In [None]:
import inspect

# The dependency cell installs the newest transformers (`pip install -U`).
# Newer releases renamed `evaluation_strategy` to `eval_strategy`, and the
# trainer's `tokenizer` argument to `processing_class`. Pick whichever name
# the installed version accepts so this runs on both old and new versions.
_args_params = inspect.signature(Seq2SeqTrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _args_params else "evaluation_strategy"
)

training_args = Seq2SeqTrainingArguments(
    output_dir="./modelo_biblia_lat_es",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=2,  # effective train batch of 64 per device
    num_train_epochs=8,  # adjust as needed
    learning_rate=3e-5,
    lr_scheduler_type="linear",
    warmup_steps=500,
    weight_decay=0.01,
    save_total_limit=3,
    predict_with_generate=True,
    # Mixed precision needs a CUDA device; enabling it unconditionally makes
    # the arguments fail to construct on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
    logging_steps=200,
    eval_steps=500,
    report_to="none",
    **{_eval_strategy_key: "steps"},
)

_trainer_params = inspect.signature(Seq2SeqTrainer.__init__).parameters
_tokenizer_key = (
    "processing_class" if "processing_class" in _trainer_params else "tokenizer"
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    **{_tokenizer_key: tokenizer},
)

# 6️⃣ Entrenar modelo

In [None]:
# Announce the start, then run the fine-tuning loop configured on `trainer`.
start_message = "üöÄ Iniciando entrenamiento..."
print(start_message)
trainer.train()

# 7️⃣ Guardar TODO correctamente

In [None]:
# Persist the fine-tuned model together with the exact tokenizer used for
# training, so both can be reloaded from one directory.
model_dir = "./modelo_final_lat_es"
# safe_serialization=False writes pytorch_model.bin (plus config.json).
model.save_pretrained(model_dir, safe_serialization=False)
# Writes vocab.json, source.spm, target.spm and the tokenizer config files.
tokenizer.save_pretrained(model_dir)

# Keep the training arguments for reference. `training_args.bin` is the file
# name the Hugging Face Trainer writes with torch.save, so use the same
# serializer: a raw pickle under that name cannot be read with torch.load.
torch.save(training_args, os.path.join(model_dir, "training_args.bin"))

# 8️⃣ Verificar y Comprimir

In [None]:
# Files expected in the saved model directory; each one is checked below.
essential_files = [
    "pytorch_model.bin",
    "config.json",
    "vocab.json",
    "source.spm",
    "target.spm",
    "tokenizer_config.json",
    "special_tokens_map.json",
]

print("\nüîç Verificando archivos generados...")
for required_name in essential_files:
    # Report presence/absence only; a missing file does not stop the export.
    is_present = os.path.exists(os.path.join(model_dir, required_name))
    if is_present:
        print(f"‚úÖ {required_name} encontrado")
    else:
        print(f"‚ùå {required_name} NO encontrado")

# Zip the whole model directory and hand the archive to the browser download.
zip_name = "marian_latin_spanish_model_CORRECTED"
shutil.make_archive(zip_name, 'zip', model_dir)
files.download(f"{zip_name}.zip")
print(f"\nüéâ ¬°Listo! Descarga el archivo {zip_name}.zip")