In [1]:
from datasets import load_dataset
from transformers import DistilBertTokenizerFast, DistilBertForQuestionAnswering, Trainer, TrainingArguments
import numpy as np
from sklearn.metrics import accuracy_score
import evaluate

In [2]:
# Load the SQuAD v2 dataset
squad_v2_train = load_dataset("rajpurkar/squad_v2", split="train")
squad_v2_test = load_dataset("rajpurkar/squad_v2", split="validation")

In [3]:
# Initialize the tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [4]:
# Tokenize the datasets
# def tokenize_function(examples):
#     return tokenizer(
#         examples['question'],
#         examples['context'],
#         truncation="only_second",
#         max_length=384,
#         stride=128,
#         return_overflowing_tokens=True,
#         return_offsets_mapping=True,
#         padding="max_length",
#     )

In [5]:
# Prepare train features function
def prepare_train_features(examples):
    tokenized_examples = tokenizer(
        examples['question'],
        examples['context'],
        truncation="only_second",
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_examples.pop("offset_mapping")

    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        sequence_ids = tokenized_examples.sequence_ids(i)
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])
            token_start_index = 0
            while sequence_ids[token_start_index] != 1:
                token_start_index += 1
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != 1:
                token_end_index -= 1
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

In [6]:
# tokenized_train = squad_v2_train.map(tokenize_function, batched=True, remove_columns=squad_v2_train.column_names)
# tokenized_test = squad_v2_test.map(tokenize_function, batched=True, remove_columns=squad_v2_test.column_names)

# Tokenize the datasets
tokenized_train = squad_v2_train.map(prepare_train_features, batched=True, remove_columns=squad_v2_train.column_names)
tokenized_test = squad_v2_test.map(prepare_train_features, batched=True, remove_columns=squad_v2_test.column_names)

In [7]:
tokenized_test

Dataset({
    features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 12134
})

In [10]:
# Initialize the model
model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Define a compute_metrics function to calculate metrics
metric = evaluate.load("squad_v2", trust_remote_code=True)

def compute_metrics(p):
    predictions = p.predictions
    references = p.label_ids
    start_logits, end_logits = predictions
    start_positions, end_positions = references

    # Convert predictions and references to the required format for the metric
    n_best_size = 20
    max_answer_length = 30
    predictions = []
    references = []

    for i in range(len(p.label_ids)):
        start_logits[i], end_logits[i] = start_logits[i].tolist(), end_logits[i].tolist()
        start_position, end_position = start_positions[i], end_positions[i]
        
        # Get the n-best start and end logits
        start_indexes = np.argsort(start_logits[i])[-1:-n_best_size-1:-1]
        end_indexes = np.argsort(end_logits[i])[-1:-n_best_size-1:-1]
        
        for start_index in start_indexes:
            for end_index in end_indexes:
                if start_index <= end_index and end_index - start_index + 1 <= max_answer_length:
                    predictions.append({
                        "id": p.example_id[i],
                        "prediction_text": tokenizer.convert_tokens_to_string(
                            tokenizer.convert_ids_to_tokens(p.input_ids[i][start_index:end_index+1])
                        )
                    })
                    references.append({
                        "id": p.example_id[i],
                        "answers": {
                            "text": [tokenizer.convert_tokens_to_string(
                                tokenizer.convert_ids_to_tokens(p.input_ids[i][start_position:end_position+1])
                            )],
                            "answer_start": [start_position]
                        }
                    })
                    break

    return metric.compute(predictions=predictions, references=references)


In [12]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

In [13]:
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,  # Add the compute_metrics function
)

In [14]:
# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss


AttributeError: 'EvalPrediction' object has no attribute 'example_id'

In [None]:
# Evaluate the model
eval_results = trainer.evaluate()

In [None]:
print(f"Evaluation results: {eval_results}")