In [1]:
!pip install transformers datasets torch





[notice] A new release of pip is available: 24.0 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import json
import torch
import numpy as np
from datasets import Dataset
from transformers import BertTokenizer, BertForMaskedLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments

# 📌 Load the BETO tokenizer
tokenizer = BertTokenizer.from_pretrained("dccuchile/bert-base-spanish-wwm-cased")

# 📌 Read the structured-text corpus from its JSON file
with open("3-corpus_texto_estructurado.json", mode="r", encoding="utf-8") as f:
    corpus = json.load(f)

# 📌 Collect the corpus items into a plain list (one element per top-level item)
text_data = list(corpus)

# 📌 Tokenize the data
def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the module-level ``tokenizer``.

    The tokenizer is asked to truncate and to pad to ``max_length`` (512), and
    its result is returned unchanged.
    """
    tokenizer_options = {
        "max_length": 512,
        "truncation": True,
        "padding": "max_length",
    }
    return tokenizer(examples["text"], **tokenizer_options)

# 📌 Load BETO with its masked-language-modelling (MLM) head
model = BertForMaskedLM.from_pretrained("dccuchile/bert-base-spanish-wwm-cased")

# 📌 Wrap the texts in a Hugging Face Dataset and tokenize them in batches
dataset = Dataset.from_dict({"text": text_data})
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# 📌 MLM data collator (applies the masking automatically)
data_collator = DataCollatorForLanguageModeling(
    mlm_probability=0.15,  # 🔹 mask 15% of the tokens
    mlm=True,
    tokenizer=tokenizer,
)

# 📌 Training configuration
training_args = TrainingArguments(
    num_train_epochs=3,             # train for 3 epochs
    per_device_train_batch_size=8,  # adjust to the available GPU memory
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    evaluation_strategy="epoch",    # evaluate at the end of every epoch
    save_strategy="epoch",          # save a checkpoint at the end of every epoch
    logging_steps=100,
    logging_dir="./logs",
    output_dir="./beto_pretrained",
)

# 📌 Evaluation metric for masked-language modelling
def compute_metrics(eval_pred):
    """Compute token accuracy over the masked positions only.

    Args:
        eval_pred: object exposing ``predictions`` and ``label_ids``.
            ``predictions`` may be raw logits (one extra trailing vocabulary
            axis compared to the labels) or already-selected token ids with
            the same rank as the labels.

    Returns:
        ``{"accuracy": float}`` - the share of scored positions whose predicted
        token id equals the label; ``0.0`` when no position is scored.
    """
    logits = np.asarray(eval_pred.predictions)
    labels = np.asarray(eval_pred.label_ids)

    # Only reduce over the vocabulary axis when it is actually present.
    if logits.ndim == labels.ndim:
        predictions = logits
    else:
        predictions = np.argmax(logits, axis=-1)

    # -100 marks positions that are NOT scored (tokens the collator left
    # unmasked); keep only the positions that carry a real label.
    mask = labels != -100
    if not mask.any():
        # Avoid the NaN (and RuntimeWarning) of taking the mean of an empty array.
        return {"accuracy": 0.0}

    accuracy = float((predictions[mask] == labels[mask]).mean())

    return {"accuracy": accuracy}

# 📌 Hold out 10% of the examples for evaluation so the validation loss and
#    accuracy are not measured on the data the model is trained on.
#    The seed keeps the split reproducible across runs.
split_datasets = tokenized_datasets.train_test_split(test_size=0.1, seed=42)

# 📌 Train the model with the Hugging Face Trainer
# NOTE(review): compute_metrics receives the logits of every evaluation example;
# if evaluation runs out of memory, look at Trainer's `preprocess_logits_for_metrics`
# (compute_metrics must then expect token ids instead of logits).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_datasets["train"],
    eval_dataset=split_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# 🚀 Start training
trainer.train()







Map:   0%|          | 0/1106 [00:00<?, ? examples/s]

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Run an evaluation pass with the trainer built above and print the metrics it returns.
metrics = trainer.evaluate()
print(metrics)


In [None]:
# Persist the model and its tokenizer side by side in ./beto_preentrenado
# so both can be reloaded later from that single directory.
model.save_pretrained("./beto_preentrenado")
tokenizer.save_pretrained("./beto_preentrenado")


In [None]:
from transformers import BertForMaskedLM, BertTokenizer

# Reload the MLM model and its tokenizer from the ./beto_preentrenado directory.
model = BertForMaskedLM.from_pretrained("./beto_preentrenado")
tokenizer = BertTokenizer.from_pretrained("./beto_preentrenado")


### FINE TUNING ###

In [None]:
!pip install transformers datasets torch


In [None]:
import json

from datasets import Dataset
from sklearn.model_selection import train_test_split

# 📌 Load the question/answer pairs in this cell so it runs on a fresh kernel
#    (Restart & Run All) without depending on names defined in later cells.
with open("3-corpus_preguntas_respuestas.json", "r", encoding="utf-8") as f:
    qa_corpus = json.load(f)

questions = [pair[0] for pair in qa_corpus]
answers = [pair[1] for pair in qa_corpus]

# 📌 Split into 80% training and 20% test (fixed seed for reproducibility)
train_questions, test_questions, train_answers, test_answers = train_test_split(
    questions, answers, test_size=0.2, random_state=42
)

# 📌 Build the Hugging Face datasets; the answer text is stored in the "context" column
train_dataset = Dataset.from_dict({"question": train_questions, "context": train_answers})
test_dataset = Dataset.from_dict({"question": test_questions, "context": test_answers})


def _tokenize_qa_pairs(examples):
    """Encode each question with its paired "context" text, padded/truncated to 512 tokens.

    Uses the ``tokenizer`` already in scope (reloaded from ./beto_preentrenado
    earlier in the notebook).
    """
    return tokenizer(
        examples["question"],
        examples["context"],
        padding="max_length",
        truncation=True,
        max_length=512,
    )


# 📌 Tokenize both splits
tokenized_train = train_dataset.map(_tokenize_qa_pairs, batched=True)
tokenized_test = test_dataset.map(_tokenize_qa_pairs, batched=True)


In [None]:
import json
import torch
import numpy as np
from datasets import Dataset
from transformers import BertTokenizer, BertForQuestionAnswering, TrainingArguments, Trainer

# 📌 Load the question/answer corpus (JSON)
with open("3-corpus_preguntas_respuestas.json", "r", encoding="utf-8") as f:
    qa_corpus = json.load(f)

# 📌 Split every pair into its question (index 0) and its answer (index 1)
questions = [pair[0] for pair in qa_corpus]
answers = [pair[1] for pair in qa_corpus]

# 📌 Load the tokenizer AND the encoder weights from the checkpoint saved in
#    ./beto_preentrenado, so fine-tuning builds on the MLM pre-training done
#    earlier instead of restarting from the public BETO weights.
#    The question-answering head is not part of that checkpoint, so it starts
#    from a fresh initialisation (transformers warns about this on load).
tokenizer = BertTokenizer.from_pretrained("./beto_preentrenado")
model = BertForQuestionAnswering.from_pretrained("./beto_preentrenado")

# 📌 Tokenize question/answer pairs
def preprocess_function(examples):
    """Encode each ``"question"`` together with its paired ``"context"`` text.

    The module-level ``tokenizer`` is asked to truncate and pad to
    ``max_length`` (512) and to return PyTorch tensors; its result is returned
    unchanged.

    NOTE(review): only the tokenizer output is returned - no
    ``start_positions`` / ``end_positions`` columns are created here.
    BertForQuestionAnswering presumably needs them to produce a training loss;
    confirm the labels are added somewhere before training.
    """
    return tokenizer(
        examples["question"],
        examples["context"],
        max_length=512,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )

# 📌 Training parameters for the QA fine-tuning run
training_args = TrainingArguments(
    num_train_epochs=3,             # train for 3 epochs
    per_device_train_batch_size=8,  # adjust to the available GPU memory
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    evaluation_strategy="epoch",    # evaluate at the end of every epoch
    save_strategy="epoch",          # save a checkpoint at the end of every epoch
    logging_steps=100,
    logging_dir="./logs",
    output_dir="./beto_finetuned_qa",
)

# 📌 Build a Hugging Face Dataset from all question/answer pairs and tokenize it
dataset = Dataset.from_dict({"question": questions, "context": answers})
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# 📌 Evaluation metrics for the QA model
def compute_metrics(eval_pred):
    """Score predicted answer indices against their labels.

    Args:
        eval_pred: object exposing ``predictions`` and ``label_ids``. Either
            plain arrays, or a ``(start, end)`` pair of arrays for both
            attributes (start/end logits and start/end positions).

    Returns:
        dict with
        - ``"accuracy"``: share of individual index predictions that equal
          their label (start and end are counted separately when both exist);
        - ``"exact_match"``: only for ``(start, end)`` pairs - share of
          examples whose start AND end index are both correct, i.e. the
          per-question score.
    """
    logits = eval_pred.predictions
    labels = eval_pred.label_ids
    is_span_pair = isinstance(logits, (tuple, list)) and len(logits) == 2

    # Stack explicitly instead of relying on implicit tuple-to-array coercion.
    predictions = np.argmax(np.asarray(logits), axis=-1)
    hits = predictions == np.asarray(labels)

    metrics = {"accuracy": float(hits.mean())}
    if is_span_pair and hits.ndim == 2 and hits.shape[0] == 2:
        # Axis 0 is (start, end): a question counts only if both are right.
        metrics["exact_match"] = float(hits.all(axis=0).mean())
    return metrics

# 📌 Trainer for the QA fine-tuning run, evaluated on the held-out test split
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_test,  # evaluation now uses the test data
    train_dataset=tokenized_train,
)

# 🚀 Start fine-tuning
trainer.train()


In [None]:
# The loss is the CrossEntropyLoss that BertForQuestionAnswering uses by default. It suits QA
# because the model has to predict the start and end index of the answer inside the context.
#
# 📌 Metrics reported:
# ✅ Accuracy: as returned by compute_metrics, i.e. the share of predictions that equal their labels.
# ✅ Loss: measures the difference between the predicted and the reference answers.
# ✅ F1-score (optional): can be added to get a better picture of answer quality.
metrics = trainer.evaluate()
print(metrics)




In [None]:
from sklearn.metrics import f1_score

def compute_metrics(eval_pred):
    """Accuracy and weighted F1 over predicted answer indices.

    Args:
        eval_pred: object exposing ``predictions`` (logits, or a
            ``(start_logits, end_logits)`` pair) and ``label_ids`` laid out
            the same way.

    Returns:
        ``{"accuracy": float, "f1": ...}`` where every start/end index is
        treated as one sample and ``f1`` is whatever ``f1_score`` returns for
        ``average="weighted"``.

    Note:
        A Trainer keeps the ``compute_metrics`` it was constructed with, so an
        existing ``trainer`` does not pick up this definition by itself; pass
        it when building the Trainer or set ``trainer.compute_metrics``.
    """
    logits = eval_pred.predictions
    labels = np.asarray(eval_pred.label_ids)
    predictions = np.argmax(np.asarray(logits), axis=-1)

    accuracy = float((predictions == labels).mean())
    # f1_score does not support multi-output targets: flatten the (start, end)
    # layout so each predicted index is scored as an individual sample.
    f1 = f1_score(labels.ravel(), predictions.ravel(), average="weighted")  # weighted F1-score

    return {"accuracy": accuracy, "f1": f1}


In [None]:
# Persist the fine-tuned QA model and its tokenizer together in ./beto_finetuned_qa.
model.save_pretrained("./beto_finetuned_qa")
tokenizer.save_pretrained("./beto_finetuned_qa")


In [None]:
from transformers import BertForQuestionAnswering, BertTokenizer

# Reload the QA model and its tokenizer from the ./beto_finetuned_qa directory.
model = BertForQuestionAnswering.from_pretrained("./beto_finetuned_qa")
tokenizer = BertTokenizer.from_pretrained("./beto_finetuned_qa")
