# 基于截断策略的机器阅读理解任务实现

#### Step1. 导入相关包

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering,TrainingArguments, Trainer,DefaultDataCollator

#### Step2. 加载数据集

In [None]:
# Load the "cmrc2018" dataset through `load_dataset`, caching files under ./data.
datasets = load_dataset("cmrc2018", cache_dir="data")
# Bare expression on the last line of the cell: the notebook shows its repr.
datasets

In [1]:
# Inspect the first training example. Requires the dataset-loading cell above
# to have been run first, otherwise `datasets` is undefined.
datasets["train"][0]

NameError: name 'datasets' is not defined  (stale output: this cell was executed before the Step2 dataset-loading cell; run that cell first)

#### Step3. 数据预处理

In [None]:
# Load the tokenizer that belongs to the "hfl/chinese-macbert-base" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-macbert-base")
# Display the tokenizer's repr as the cell output.
tokenizer

In [None]:
# Take the first 10 training examples as a small sample for prototyping the
# preprocessing logic before applying it to the whole dataset.
sample_dataset = datasets["train"].select(range(10))

In [None]:
# Encode each (question, context) pair as one sequence of at most 512 tokens.
# Only the context (second segment) may be truncated, every sequence is padded
# to the full length, and per-token character offsets are requested as well.
tokenized_examples = tokenizer(
    text=sample_dataset["question"],
    text_pair=sample_dataset["context"],
    return_offsets_mapping=True,
    max_length=512,
    truncation="only_second",
    padding="max_length",
)
# Token ids of the first encoded pair, followed by their count.
print(tokenized_examples["input_ids"][0])
print(len(tokenized_examples["input_ids"][0]))
# Cell output: the fields present in the encoding.
tokenized_examples.keys()

In [None]:
# Optional debug aid (disabled): pair every input id of the first example with
# its token_type_id to see which segment each token belongs to.
#print(list(zip(tokenized_examples["input_ids"][0], tokenized_examples["token_type_ids"][0])))

In [None]:
# NOTE(review): this bare expression is not the last statement of the cell, so
# the notebook does not display it; only the print below produces output.
tokenized_examples["offset_mapping"][0]
# Number of offset entries for the first example.
print(len(tokenized_examples["offset_mapping"][0]))

In [None]:
# Show the per-token sequence ids of the first example. The code below relies
# on 1 marking context tokens and None marking the token right after the context.
print(tokenized_examples.sequence_ids(0))

In [None]:
# Remove the offsets from the encoding; they are only needed here to build the
# labels and are kept in a separate variable.
offset_mapping = tokenized_examples.pop("offset_mapping")

for idx, offset in enumerate(offset_mapping):
    answer = sample_dataset["answers"][idx]
    # Character span of the first reference answer inside the context,
    # half-open: [start_char, end_char).
    start_char = answer["answer_start"][0]
    end_char = start_char + len(answer["text"][0])
    # Locate the answer's first and last token.
    # Strategy: find the first and last context token, then walk inwards from
    # both sides until the answer span is reached.
    sequence_ids = tokenized_examples.sequence_ids(idx)  # computed once per example
    context_start = sequence_ids.index(1)
    context_end = sequence_ids.index(None, context_start) - 1

    # Check whether the answer overlaps the (possibly truncated) context.
    # Both spans are half-open, so merely touching at a boundary is NOT an
    # overlap; hence <= / >= rather than < / >.
    if offset[context_end][1] <= start_char or offset[context_start][0] >= end_char:
        start_token_pos = 0
        end_token_pos = 0
    else:
        # First context token that starts at or after the answer start.
        token_id = context_start
        while token_id <= context_end and offset[token_id][0] < start_char:
            token_id += 1
        start_token_pos = token_id
        # Last context token that ends at or before the answer end.
        token_id = context_end
        while token_id >= context_start and offset[token_id][1] > end_char:
            token_id -= 1
        end_token_pos = token_id
    print(answer, start_char, end_char, context_start, context_end, start_token_pos, end_token_pos)
    # Decode the located token span back to text to verify it matches the answer.
    print("token answer decoder:", tokenizer.decode(tokenized_examples["input_ids"][idx][start_token_pos:end_token_pos + 1]))

In [None]:
def _locate_answer_tokens(offset, sequence_ids, start_char, end_char):
    """Map a character span of the context to an inclusive token span.

    Args:
        offset: Per-token ``(char_start, char_end)`` pairs for one example.
        sequence_ids: Per-token sequence ids for the same example; ``1`` marks
            context tokens and ``None`` marks the token following the context.
        start_char: Inclusive character index where the answer starts.
        end_char: Exclusive character index where the answer ends.

    Returns:
        ``(start_token_pos, end_token_pos)``; ``(0, 0)`` when the answer does
        not overlap the tokenized (possibly truncated) context.
    """
    context_start = sequence_ids.index(1)
    context_end = sequence_ids.index(None, context_start) - 1

    # Both spans are half-open, so touching at a boundary is not an overlap;
    # hence <= / >= rather than < / >.
    if offset[context_end][1] <= start_char or offset[context_start][0] >= end_char:
        return 0, 0

    # Walk inwards from both ends of the context towards the answer.
    start_token_pos = context_start
    while start_token_pos <= context_end and offset[start_token_pos][0] < start_char:
        start_token_pos += 1
    end_token_pos = context_end
    while end_token_pos >= context_start and offset[end_token_pos][1] > end_char:
        end_token_pos -= 1
    return start_token_pos, end_token_pos


def process_function(examples):
    """Tokenize a batch of QA examples and attach answer-span labels.

    Each (question, context) pair is encoded as one sequence of at most 512
    tokens; only the context is truncated and every sequence is padded to the
    full length. The first reference answer of each example is converted from
    character offsets to token positions.

    Args:
        examples: A batch of columns containing ``"question"``, ``"context"``
            and ``"answers"`` (each answer holding ``"answer_start"`` and
            ``"text"`` lists).

    Returns:
        The tokenizer output without ``"offset_mapping"`` and with two extra
        fields, ``"start_positions"`` and ``"end_positions"``. Examples with no
        answer, or whose answer does not overlap the kept context, get
        ``(0, 0)``.
    """
    tokenized_examples = tokenizer(
        text=examples["question"],
        text_pair=examples["context"],
        return_offsets_mapping=True,
        max_length=512,
        truncation="only_second",
        padding="max_length",
    )
    # Offsets are only needed to build the labels; they are not a model input.
    offset_mapping = tokenized_examples.pop("offset_mapping")
    start_positions = []
    end_positions = []
    for idx, offset in enumerate(offset_mapping):
        answer = examples["answers"][idx]
        if not answer["answer_start"]:
            # No reference answer: label as "not found" instead of raising IndexError.
            start_positions.append(0)
            end_positions.append(0)
            continue
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        start_token_pos, end_token_pos = _locate_answer_tokens(
            offset, tokenized_examples.sequence_ids(idx), start_char, end_char
        )
        start_positions.append(start_token_pos)
        end_positions.append(end_token_pos)
    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions
    return tokenized_examples

In [None]:
# Apply process_function to every split in batches, dropping the original
# columns so that only the fields returned by process_function remain.
tokenized_datasets = datasets.map(process_function, batched=True, remove_columns=datasets["train"].column_names)
# Display the processed dataset as the cell output.
tokenized_datasets

#### Step4. 加载模型

In [None]:
# Load the checkpoint with a question-answering head. The id must be the same
# one used for the tokenizer in Step3: "hfl/..." ends in a lowercase letter L,
# not the digit 1.
model = AutoModelForQuestionAnswering.from_pretrained("hfl/chinese-macbert-base")

#### Step5. 配置TrainingArguments

In [None]:
# Training configuration passed to the Trainer below.
args = TrainingArguments(
    output_dir="model_for_qa",  # where checkpoints are written
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",  # evaluate at the end of every epoch
    save_strategy="epoch",  # save a checkpoint at the end of every epoch
    logging_steps=50,
    num_train_epochs=3
)

#### Step6. 创建训练器

In [None]:
# Wire the model, training arguments and tokenized splits into a Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    # Inputs were already padded to a fixed length in process_function, so the
    # default collator is used rather than one that pads per batch.
    data_collator=DefaultDataCollator()
)

#### Step7. 模型训练

In [None]:
# Run fine-tuning with the configuration defined in `args`.
trainer.train()

#### Step8. 模型预测

In [None]:
from transformers import pipeline

# Build a question-answering pipeline around the fine-tuned model.
# `tokenizer` must be the tokenizer object created in Step3, not the Trainer.
# device=0 places the pipeline on the first GPU.
pipe = pipeline("question-answering", model=model, tokenizer=tokenizer, device=0)
# Display the pipeline object as the cell output.
pipe

In [None]:
# Smoke test: ask where Xiao Ming works, given a one-sentence context that
# states he works in Beijing.
pipe(question="小明在哪里上班", context="小明在北京上班")