In [16]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, TrainingArguments, Trainer
from datasets import load_dataset

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
model = AutoModelForMaskedLM.from_pretrained("google-bert/bert-base-uncased")

ds = load_dataset("rajpurkar/squad_v2")

#tokenize the dataset w/ the tokenizer here
def tokenize_function(examples):
    return tokenizer(examples["question"],
                     examples["context"],
                     truncation="only_second",  # truncate context if needed (not the question)
                     max_length=1027,            # typical max length for SQuAD-style QA
                     stride=128,                # overlap for long contexts (for sliding window)
                     return_overflowing_tokens=True,  # create additional samples if context is too long
                     return_offsets_mapping=True,     # useful for aligning answers with tokens
                     padding="max_length")
new_ds = ds.map(tokenize_function, batched=True, remove_columns=ds["train"].column_names)
tokenized_train = new_ds['train'],
tokenized_eval = new_ds['validation']

def add_start_end_positions(examples):
    start_positions = []
    end_positions = []
    
    for i, offsets in enumerate(examples["offset_mapping"]):
        input_ids = examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # By default: no answer
        start_pos = cls_index
        end_pos = cls_index

        if len(examples["answers"][i]["text"]) == 0:
            # unanswerable → keep CLS as label
            start_positions.append(start_pos)
            end_positions.append(end_pos)
        else:
            # answerable → locate start/end
            answer = examples["answers"][i]
            start_char = answer["answer_start"][0]
            end_char = start_char + len(answer["text"][0])

            # Find the token span that matches the answer span
            sequence_ids = examples["sequence_ids"][i]

            # tokens corresponding to context
            token_start_index = sequence_ids.index(1)
            token_end_index = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)

            # move token_start_index/token_end_index to match char positions
            for idx in range(token_start_index, token_end_index):
                if offsets[idx][0] <= start_char and offsets[idx][1] > start_char:
                    start_pos = idx
                if offsets[idx][0] < end_char and offsets[idx][1] >= end_char:
                    end_pos = idx
                    break

            start_positions.append(start_pos)
            end_positions.append(end_pos)

    examples["start_positions"] = start_positions
    examples["end_positions"] = end_positions
    return examples
tokenized_train = tokenized_train.map(add_start_end_positions, batched=True)
tokenized_eval = tokenized_eval.map(add_start_end_positions, batched=True)


train_args = TrainingArguments(output_dir="./bert-results",
                               logging_dir="./bert-logs",
                               logging_steps=50, 
                               num_train_epochs=1,
                               learning_rate=5.05e-5,
                               per_device_train_batch_size=8,
                               load_best_model_at_end=True,
                               eval_strategy="steps",
                               save_strategy="steps")

trainer = Trainer(model=model,
                  args=train_args,
                  train_dataset=tokenized_train,
                  eval_dataset=tokenized_eval
                  )

trainer.train()




Some weights of the model checkpoint at google-bert/bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Map: 100%|██████████| 130319/130319 [01:18<00:00, 1651.01 examples/s]
Map: 100%|██████████| 11873/11873 [00:07<00:00, 1607.92 examples/s]


AttributeError: 'tuple' object has no attribute 'map'