In [10]:
from transformers import AutoTokenizer
import pickle

In [3]:
# Read the pickled object stored at this path into `dataset`.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only open files that come from a trusted source.
with open('./dataset/processed_dataset.pickle', 'rb') as pickle_file:
    dataset = pickle.load(pickle_file)

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
# Checkpoint identifier handed to AutoTokenizer; the resulting `tokenizer`
# is the one used by the tokenization step further down.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
)



In [17]:
def find_labels(offsets, answer_start, answer_end, sequence_ids):
    """Convert a character-level answer span into token-level positions.

    Args:
        offsets: per-token ``(char_start, char_end)`` pairs for one tokenized
            window.
        answer_start: character offset at which the answer begins.
        answer_end: character offset at which the answer ends (compared
            against each token's ``char_end`` with ``>=``; whether the caller
            means it inclusively or exclusively is not visible here -- TODO
            confirm against the dataset preprocessing).
        sequence_ids: per-token sequence id; tokens whose id is ``1`` are
            treated as the context segment.

    Returns:
        ``(start_position, end_position)`` token indices inside the context
        segment, or ``(0, 0)`` when the window has no context tokens or the
        answer is not fully contained in this window's context.
    """
    n_tokens = len(sequence_ids)

    # Locate the first context token; bail out if the window has none.
    idx = 0
    while idx < n_tokens and sequence_ids[idx] != 1:
        idx += 1
    if idx == n_tokens:
        return 0, 0
    context_start = idx

    # Walk to the end of the context run (bounded, so a window that does not
    # end with a non-context token cannot run past the list).
    while idx < n_tokens and sequence_ids[idx] == 1:
        idx += 1
    context_end = idx - 1

    # The answer must lie entirely inside this window. A partial overlap would
    # otherwise yield context_start - 1 or context_end + 1, i.e. positions
    # that fall on the tokens bordering the context instead of inside it.
    if offsets[context_start][0] > answer_start or offsets[context_end][1] < answer_end:
        return 0, 0

    # Last context token whose start is at or before the answer start.
    idx = context_start
    while idx <= context_end and offsets[idx][0] <= answer_start:
        idx += 1
    start_position = idx - 1

    # First context token (scanning from the right) whose end reaches the
    # answer end.
    idx = context_end
    while idx >= context_start and offsets[idx][1] >= answer_end:
        idx -= 1
    end_position = idx + 1

    return start_position, end_position

In [18]:
def tokenization(examples, max_length = 384, stride = 128):
    """Tokenize a batch of question/context pairs and attach answer labels.

    Args:
        examples: batch mapping with "question", "context", "answer_start"
            and "answer_end" columns (one entry per example).
        max_length: length every tokenized window is padded/truncated to.
        stride: value forwarded to the tokenizer's ``stride`` argument for
            overflowing windows.

    Returns:
        The tokenizer output with "offset_mapping" and
        "overflow_to_sample_mapping" removed and "start_positions" /
        "end_positions" added, one label pair per tokenized window.
    """
    # Use the batch argument (not a stray global) and the stripped questions.
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        truncation = "only_second",
        padding = "max_length",
        max_length = max_length,
        stride = stride,
        return_overflowing_tokens = True,
        return_offsets_mapping = True
    )
    offset_mapping = inputs.pop("offset_mapping")
    # One example may produce several windows; this maps each window back to
    # the index of the example it came from.
    sample_map = inputs.pop("overflow_to_sample_mapping")

    start_positions = []
    end_positions = []
    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        start, end = find_labels(
            offset,
            examples["answer_start"][sample_idx],
            examples["answer_end"][sample_idx],
            inputs.sequence_ids(i),
        )
        start_positions.append(start)
        end_positions.append(end)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [19]:
# Apply `tokenization` to the dataset in batches. The columns listed for the
# "train" split are removed from the output, leaving only what
# `tokenization` returns.
tokenized_dataset = dataset.map(tokenization, batched = True, remove_columns = dataset["train"].column_names)

Map: 100%|██████████| 87599/87599 [00:00<00:00, 231821.75 examples/s]
Map: 100%|██████████| 10570/10570 [00:00<00:00, 220704.49 examples/s]


In [21]:
# Persist the tokenized dataset to disk as a pickle file.
with open('./dataset/tokenized_dataset.pickle', 'wb') as pickle_file:
    pickle.dump(tokenized_dataset, pickle_file)