In [1]:
!pip install torch torchvision transformers datasets

import torch
from transformers import BertTokenizer, BertForQuestionAnswering
from transformers import Trainer, TrainingArguments
from datasets import load_dataset

# Paso 1: Cargar el conjunto de datos SQuAD v1.0
dataset = load_dataset("squad_v1")

# Paso 2: Cargar el modelo BERT pre-entrenado y el tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForQuestionAnswering.from_pretrained(model_name)

# Paso 3: Preprocesamiento y tokenización de los datos
def prepare_data(examples):
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        padding="max_length",
        max_length=384,
    )
    tokenized_examples["start_positions"] = examples["answers"]["answer_start"]
    tokenized_examples["end_positions"] = [
        start + len(answer_text) - 1
        for start, answer_text in zip(
            examples["answers"]["answer_start"], examples["answers"]["text"]
        )
    ]
    return tokenized_examples

train_dataset = dataset["train"].map(prepare_data, batched=True)
dev_dataset = dataset["validation"].map(prepare_data, batched=True)

# Paso 4: Configurar los argumentos de entrenamiento
training_args = TrainingArguments(
    output_dir="./squad_qa",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    save_total_limit=2,
    learning_rate=2e-5,
    evaluation_strategy="epoch",
    disable_tqdm=False,
)

# Paso 5: Entrenar el modelo con el Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
)

trainer.train()

# Paso 6: Evaluar el modelo en el conjunto de desarrollo
eval_results = trainer.evaluate(eval_dataset=dev_dataset)
print(eval_results)




FileNotFoundError: Couldn't find a dataset script at /home/roberto.lopez/Documentos/Git/Cesga2023Courses/pytorch/squad_v1/squad_v1.py or any data file in the same directory. Couldn't find 'squad_v1' on the Hugging Face Hub either: FileNotFoundError: Dataset 'squad_v1' doesn't exist on the Hub