In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

In [2]:
# Load the SQuAD dataset
raw_datasets = load_dataset("rajpurkar/squad")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [3]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [7]:
raw_train_dataset = raw_datasets["train"]
raw_validation_dataset = raw_datasets["validation"]
raw_train_dataset[0]

{'id': '5733be284776f41900661182',
 'title': 'University_of_Notre_Dame',
 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
 'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}}

In [8]:
# Get unique titles in the train and validation datasets
unique_train_titles = set(raw_train_dataset['title'])
unique_validation_titles = set(raw_validation_dataset['title'])

num_unique_train_titles = len(unique_train_titles)
num_unique_validation_titles = len(unique_validation_titles)

print(f"Number of unique titles in the training dataset: {num_unique_train_titles}")
print(f"Number of unique titles in the test dataset: {num_unique_validation_titles}")

Number of unique titles in the training dataset: 442
Number of unique titles in the test dataset: 48


In [9]:
# Tokenize the datasets
def tokenize_function(examples):
    return tokenizer(
        examples['question'],
        examples['context'],
        truncation="only_second",
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length"
    )

In [16]:
# Tokenize the datasets
def tokenize_function_new(examples):
    tokenized_examples = tokenizer(
        examples['question'],
        examples['context'],
        truncation="only_second",
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length"
    )
    
    # Create a mapping from new tokens to the original examples
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_examples.pop("offset_mapping")

    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        sequence_ids = tokenized_examples.sequence_ids(i)
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]

        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            token_start_index = 0
            while sequence_ids[token_start_index] != 1:
                token_start_index += 1

            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != 1:
                token_end_index -= 1

            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)

                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

In [17]:
# tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

In [18]:
tokenized_datasets = raw_datasets.map(tokenize_function_new, batched=True)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

ArrowInvalid: Column 5 named input_ids expected length 1000 but got length 1032

In [None]:
tokenized_train = squad_v2_train.map(tokenize_function, batched=True, remove_columns=squad_v2_train.column_names)
tokenized_test = squad_v2_test.map(tokenize_function, batched=True, remove_columns=squad_v2_test.column_names)

In [10]:
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

ArrowInvalid: Column 5 named input_ids expected length 1000 but got length 1032

In [11]:
tokenized_datasets

NameError: name 'tokenized_datasets' is not defined

# Prepare for training

In [14]:
from transformers import BertForQuestionAnswering

model = BertForQuestionAnswering.from_pretrained(checkpoint, num_labels=442)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Training loop

In [13]:
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
device

NameError: name 'model' is not defined