In [9]:
import evaluate
import numpy as np
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader, Dataset
from tqdm.auto import tqdm
from transformers import DistilBertTokenizerFast, DistilBertForQuestionAnswering, Trainer, TrainingArguments

In [10]:
# Fetch the "train" and "validation" splits of SQuAD v2.
# Note that the validation split is bound to the name `squad_v2_test`.
squad_v2_train, squad_v2_test = (
    load_dataset("rajpurkar/squad_v2", split=split_name)
    for split_name in ("train", "validation")
)

In [11]:
# Load the pretrained fast tokenizer for the uncased DistilBERT checkpoint
tokenizer = DistilBertTokenizerFast.from_pretrained(
    "distilbert-base-uncased"
)

In [12]:
# Prepare train features function
def _answer_token_span(offsets, sequence_ids, start_char, end_char):
    """Locate the answer inside one tokenized window.

    Args:
        offsets: per-token ``(char_start, char_end)`` pairs for the window.
        sequence_ids: per-token sequence id; context tokens are marked ``1``.
        start_char: character index in the context where the answer begins.
        end_char: character index in the context just past the answer.

    Returns:
        ``(start_token, end_token)`` (inclusive token indices), or ``None`` when
        the window holds no context tokens or does not fully contain the answer.
    """
    if 1 not in sequence_ids:
        return None

    # First and last token of the context part of the window.
    context_start = 0
    while sequence_ids[context_start] != 1:
        context_start += 1
    context_end = len(sequence_ids) - 1
    while sequence_ids[context_end] != 1:
        context_end -= 1

    if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
        return None

    # Both scans stay inside [context_start, context_end]. Special and padding
    # tokens carry the offset (0, 0); an unbounded forward scan would walk over
    # them whenever the answer starts in the last context token and would end
    # up labelling a padding position as the answer start.
    token_start = context_start
    while token_start <= context_end and offsets[token_start][0] <= start_char:
        token_start += 1
    token_end = context_end
    while token_end >= context_start and offsets[token_end][1] >= end_char:
        token_end -= 1
    return token_start - 1, token_end + 1


def prepare_train_features(examples):
    """Tokenize a batch of question/context pairs and label answer token positions.

    Contexts longer than the 384-token limit are split into overlapping windows
    (stride 128), so one example may yield several features.

    Args:
        examples: batch with ``"question"``, ``"context"`` and ``"answers"``
            columns; each answers entry holds ``"text"`` and ``"answer_start"``
            lists. Only the first listed answer is used for labelling.

    Returns:
        The tokenizer output with ``"start_positions"`` and ``"end_positions"``
        added and the overflow/offset mappings removed. Windows that have no
        answer, or do not fully contain it, are labelled with the index of the
        CLS token for both positions.
    """
    tokenized_examples = tokenizer(
        examples['question'],
        examples['context'],
        truncation="only_second",
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Window index -> index of the example it was cut from.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_examples.pop("offset_mapping")

    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)
        sequence_ids = tokenized_examples.sequence_ids(i)
        answers = examples["answers"][sample_mapping[i]]

        span = None
        if len(answers["answer_start"]) > 0:
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])
            span = _answer_token_span(offsets, sequence_ids, start_char, end_char)

        # Fall back to the CLS token when there is nothing to point at.
        start_position, end_position = span if span is not None else (cls_index, cls_index)
        tokenized_examples["start_positions"].append(start_position)
        tokenized_examples["end_positions"].append(end_position)

    return tokenized_examples

In [13]:
# Turn both splits into model-ready features; the raw text columns are dropped
tokenized_train, tokenized_test = (
    split.map(
        prepare_train_features,
        batched=True,
        remove_columns=split.column_names,
    )
    for split in (squad_v2_train, squad_v2_test)
)

In [14]:
# Display the tokenized validation split to check its columns and row count.
tokenized_test

Dataset({
    features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 12134
})

In [15]:
# Convert datasets to PyTorch Dataset
class SquadDataset(Dataset):
    """Expose tokenized features as a PyTorch ``Dataset`` of tensor dictionaries.

    ``encodings`` may be either

    * a column mapping (anything with ``.items()``, e.g. a dict of lists), where
      ``encodings[key][idx]`` is one sample's value, or
    * a row-indexable table without ``.items()`` (for example the object returned
      by ``Dataset.map`` in the ``datasets`` library), where ``encodings[idx]``
      returns one sample as a dict and ``len(encodings)`` is the sample count.

    The second form is what this notebook passes in; calling ``.items()`` on it
    raised ``AttributeError``.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def _is_column_mapping(self):
        """Return True when ``encodings`` is keyed by column name via ``.items()``."""
        return hasattr(self.encodings, "items")

    def __len__(self):
        """Return the number of samples."""
        if self._is_column_mapping():
            return len(self.encodings["input_ids"])
        # Avoids materialising a whole column just to count rows.
        return len(self.encodings)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict mapping each column name to a tensor."""
        if self._is_column_mapping():
            row = {key: val[idx] for key, val in self.encodings.items()}
        else:
            row = self.encodings[idx]
        return {key: torch.tensor(val) for key, val in row.items()}

In [16]:
# Wrap each tokenized split so that PyTorch can index it sample by sample
train_dataset = SquadDataset(encodings=tokenized_train)
test_dataset = SquadDataset(encodings=tokenized_test)

# Batches of 16: training samples are shuffled, evaluation keeps dataset order
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)

In [17]:
# Load the pretrained DistilBERT question-answering model and move it to the
# GPU when one is available, otherwise keep it on the CPU
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

NameError: name 'DistilBertForQuestionAnswering' is not defined

In [None]:
# Initialize the optimizer
# `AdamW` was never imported in this notebook, so the bare name raised NameError.
# Use the implementation from the already-imported `torch` package instead.
# NOTE(review): weight decay is left at the torch default; set `weight_decay`
# explicitly if a different value was intended.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

In [8]:
# Fine-tune the model: three passes over the training batches
model.train()
for epoch in range(3):
    progress = tqdm(train_loader, leave=True)
    for batch in progress:
        optimizer.zero_grad()

        # Move the tensors used by the forward pass onto the training device.
        inputs = {
            name: batch[name].to(device)
            for name in ("input_ids", "attention_mask", "start_positions", "end_positions")
        }

        # Supplying the gold positions makes the model return a loss.
        outputs = model(**inputs)
        loss = outputs.loss

        loss.backward()
        optimizer.step()

        # Show the current epoch and the latest batch loss on the progress bar.
        progress.set_description(f'Epoch {epoch}')
        progress.set_postfix(loss=loss.item())

In [9]:
# Evaluation loop
model.eval()
metric = evaluate.load("squad_v2", trust_remote_code=True)
all_start_logits = []
all_end_logits = []
all_labels = []
for batch in tqdm(test_loader):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        start_logits = outputs.start_logits.cpu().numpy()
        end_logits = outputs.end_logits.cpu().numpy()
        all_start_logits.append(start_logits)
        all_end_logits.append(end_logits)
        all_labels.append({
            "id": batch["id"],
            "answers": batch["answers"]
        })

In [10]:
# Flatten predictions and labels
all_start_logits = np.concatenate(all_start_logits, axis=0)
all_end_logits = np.concatenate(all_end_logits, axis=0)
predictions = {
    "start_logits": all_start_logits,
    "end_logits": all_end_logits
}
metric_results = metric.compute(predictions=predictions, references=all_labels)
print(f"Evaluation results: {metric_results}")