In [45]:
from transformers import AutoTokenizer
from datasets import load_from_disk
import tensorflow as tf

In [46]:
# Load the preprocessed dataset from disk. It is indexed by split name further
# down (e.g. dataset["train"]), so it is expected to hold multiple splits.
dataset = load_from_disk("./dataset/processed_dataset")

In [47]:
# Checkpoint identifier; the tokenizer is loaded from the same name so its
# vocabulary matches that checkpoint.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)



In [51]:
def find_labels(offsets, answer_start, answer_end, sequence_ids):
    """Map a character-level answer span to token-level start/end indices.

    Args:
        offsets: Per-token ``(char_start, char_end)`` pairs for one tokenized
            feature (one window of the context).
        answer_start: Character offset in the context where the answer begins.
        answer_end: Character offset in the context where the answer ends.
            NOTE(review): the comparisons below treat this like an end offset
            comparable to a token's ``char_end`` (i.e. start + len(answer)) --
            confirm against the preprocessing step that produced it.
        sequence_ids: Per-token sequence id for the same feature; tokens whose
            id is ``1`` belong to the context, anything else (``0``/``None``)
            is question, special, or padding.

    Returns:
        ``(start_position, end_position)`` token indices of the answer, or
        ``(0, 0)`` when the answer is not fully contained in this feature's
        context window (or the feature has no context tokens at all).
    """
    n_tokens = len(sequence_ids)

    # Locate the first context token; bounded so a feature without any
    # context tokens yields the "no answer" label instead of an IndexError.
    idx = 0
    while idx < n_tokens and sequence_ids[idx] != 1:
        idx += 1
    if idx == n_tokens:
        return (0, 0)
    context_start = idx

    # Walk to the last context token; bounded in case the context runs to the
    # very end of the sequence with no trailing special/padding token.
    while idx < n_tokens and sequence_ids[idx] == 1:
        idx += 1
    context_end = idx - 1

    # The answer must lie entirely inside the window. If it is cut by either
    # edge, the scans below would land on context_start - 1 / context_end + 1,
    # i.e. on tokens outside the context, so such features are labelled (0, 0).
    if offsets[context_start][0] > answer_start or offsets[context_end][1] < answer_end:
        return (0, 0)

    # Last context token whose start offset is <= answer_start.
    idx = context_start
    while idx <= context_end and offsets[idx][0] <= answer_start:
        idx += 1
    start_position = idx - 1

    # First context token (scanning backwards) whose end offset is >= answer_end.
    idx = context_end
    while idx >= context_start and offsets[idx][1] >= answer_end:
        idx -= 1
    end_position = idx + 1

    return start_position, end_position

In [52]:
def tokenization(examples, *, max_length=384, stride=128):
    """Tokenize a batch of question/context pairs and attach answer labels.

    Long contexts are split into overlapping windows, so the returned batch
    can contain more rows than ``examples``; each row is labelled with the
    token span of the answer via ``find_labels``.

    Args:
        examples: Batch mapping with list-valued ``"question"``, ``"context"``,
            ``"answer_start"`` and ``"answer_end"`` entries (one item per
            example).
        max_length: Length every feature is truncated/padded to.
        stride: Number of overlapping tokens between consecutive windows of
            the same context.

    Returns:
        The tokenizer output with ``"offset_mapping"`` and
        ``"overflow_to_sample_mapping"`` removed and ``"start_positions"`` /
        ``"end_positions"`` added (one label pair per feature).
    """
    # Strip surrounding whitespace from the questions and tokenize the
    # stripped text (previously this list was built but never used).
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        truncation="only_second",  # only the context is ever truncated
        padding="max_length",
        max_length=max_length,
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
    )

    # These two are only needed to compute labels; pop them so they are not
    # part of the returned features.
    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")

    start_positions = []
    end_positions = []
    for i, offset in enumerate(offset_mapping):
        # Several features may originate from the same example; look up the
        # answer span of the example this feature was cut from.
        sample_idx = sample_map[i]
        start, end = find_labels(
            offset,
            examples["answer_start"][sample_idx],
            examples["answer_end"][sample_idx],
            inputs.sequence_ids(i),
        )
        start_positions.append(start)
        end_positions.append(end)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [53]:
# Tokenize every split in batches. The original columns are dropped because
# the tokenization step can return more rows than it receives (overflowing
# windows), so they would no longer align with the produced features.
tokenized_dataset = dataset.map(
    tokenization,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

Map: 100%|██████████| 87599/87599 [00:38<00:00, 2288.75 examples/s]


In [54]:
# Write the tokenized splits to disk so they can be reloaded with
# load_from_disk instead of re-running the tokenization.
tokenized_dataset.save_to_disk("./dataset/tokenized_dataset")

Saving the dataset (1/1 shards): 100%|██████████| 88524/88524 [00:00<00:00, 321891.11 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 10784/10784 [00:00<00:00, 300632.58 examples/s]
