# 实现简易版本

In [1]:
from transformers import Trainer, TrainingArguments, AutoModelForQuestionAnswering, AutoTokenizer, DefaultDataCollator, DataCollatorWithPadding
from datasets import load_dataset

In [2]:
datasets = load_dataset("hfl/cmrc2018")
print(datasets)

Using the latest cached version of the dataset since hfl/cmrc2018 couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/ubuntu/.cache/huggingface/datasets/hfl___cmrc2018/default/0.0.0/137f2c45a24275fb68f6961c4d357f46288886aa (last modified on Fri Dec 13 13:57:10 2024).


DatasetDict({
    train: Dataset({
        features: ['id', 'context', 'question', 'answers'],
        num_rows: 10142
    })
    validation: Dataset({
        features: ['id', 'context', 'question', 'answers'],
        num_rows: 3219
    })
    test: Dataset({
        features: ['id', 'context', 'question', 'answers'],
        num_rows: 1002
    })
})


In [3]:
sample_datasets = datasets["train"].select(range(10))
print(sample_datasets)

Dataset({
    features: ['id', 'context', 'question', 'answers'],
    num_rows: 10
})


In [4]:
checkpoint = "hfl/chinese-macbert-base"
model = AutoModelForQuestionAnswering.from_pretrained(checkpoint, num_labels=2)  # 问答只有开始和结束
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at hfl/chinese-macbert-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
def process(examples):
    answer_list = examples["answers"]
    
    tokenized_examples = tokenizer(
        text=examples["question"],
        text_pair=examples["context"],
        truncation="only_second",  # 如果问题已经超过最大长度上线，是会报错的,
        max_length=384,
        padding="longest",
        return_offsets_mapping=True,
    )
        
    offset_list = tokenized_examples["offset_mapping"]
    start_positions_list = []
    end_positions_list = []
    for answer, offset in zip(answer_list, offset_list):
        # 定位答案在字符串中的位置
        answer_text = answer["text"][0]
        answer_start = answer["answer_start"][0]
        answer_end = answer_start + len(answer_text)
        
        # 定位context在token的范围
        start_context_index = tokenized_examples.sequence_ids().index(1)
        end_context_index = tokenized_examples.sequence_ids().index(None, start_context_index)
        
        token_start = None
        token_end = None    
        # 找到答案对应的token位置 只在给定的上下文范围内找
        for i, (offset_start, offset_end) in enumerate(offset[start_context_index:end_context_index]):
            if offset_start == answer_start:
                token_start = i + start_context_index
            elif offset_end == answer_end:
                token_end = i + start_context_index
        
        # 如果没找到答案，则使用cls
        if token_end is None or token_start is None:
            token_start = 0
            token_end = 0
        start_positions_list.append(token_start)
        end_positions_list.append(token_end)
    tokenized_examples["start_positions"] = start_positions_list
    tokenized_examples["end_positions"] = end_positions_list
    return tokenized_examples
    

In [6]:
tokenized_datasets = datasets.map(process, batched=True)
print(tokenized_datasets)

Map:   0%|          | 0/10142 [00:00<?, ? examples/s]

Map:   0%|          | 0/3219 [00:00<?, ? examples/s]

Map:   0%|          | 0/1002 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'context', 'question', 'answers', 'input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'start_positions', 'end_positions'],
        num_rows: 10142
    })
    validation: Dataset({
        features: ['id', 'context', 'question', 'answers', 'input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'start_positions', 'end_positions'],
        num_rows: 3219
    })
    test: Dataset({
        features: ['id', 'context', 'question', 'answers', 'input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'start_positions', 'end_positions'],
        num_rows: 1002
    })
})


In [7]:
test_tokenizer_datasets = tokenized_datasets["train"]
start_token = test_tokenizer_datasets[0]["start_positions"]
end_token = test_tokenizer_datasets[0]["end_positions"]

answer = test_tokenizer_datasets[0]["input_ids"][start_token:end_token+1]
tokenizer.decode(answer)

'1963 年'

In [8]:
args  = TrainingArguments(
    "output",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    logging_strategy="steps",
    logging_steps=10,
    save_strategy="steps",
    save_steps=500,
    eval_strategy="steps",
    eval_steps=30
)

In [9]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    # data_collator=DefaultDataCollator(),
    data_collator=DataCollatorWithPadding(tokenizer),
    processing_class=tokenizer,
)

In [10]:
trainer.train()



Step,Training Loss,Validation Loss
30,2.2871,1.748305
60,1.7487,1.357984
90,1.4225,1.34843
120,1.3028,1.286842
150,1.2972,1.30059
180,1.0657,1.359534
210,1.0416,1.299335
240,1.0934,1.250832


TrainOutput(global_step=240, training_loss=1.6052495082219442, metrics={'train_runtime': 589.6174, 'train_samples_per_second': 51.603, 'train_steps_per_second': 0.407, 'total_flos': 5962661340337152.0, 'train_loss': 1.6052495082219442, 'epoch': 3.0})

In [21]:
from transformers import pipeline

checkpoint = "output/checkpoint-240"
pipe = pipeline("question-answering",  model=checkpoint , device="cuda:0")
print(pipe)

Device set to use cuda:0


<transformers.pipelines.question_answering.QuestionAnsweringPipeline object at 0x7f781dbd61e0>


In [22]:
question = "小明在那里上学"
context = "小明在北京上学"
print(pipe(question=question, context=context))

{'score': 0.49330437183380127, 'start': 3, 'end': 5, 'answer': '北京'}


In [24]:
question = "周杰伦什么时候出生的"
context = "周杰伦（1979年1月18日—），台湾创作男歌手、演员、词曲作家及制作人。其音乐风行于大中华地区及全球各地的华人社群，并对华语乐坛产生重大影响，也是史上最具影响力及最著名的华语歌手之一[3][4][5]。"
print(pipe(question=question, context=context))

{'score': 0.016886375844478607, 'start': 4, 'end': 14, 'answer': '1979年1月18日'}


# overlap版本

In [8]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, AutoTokenizer, DataCollatorWithPadding, Trainer
from datasets import load_dataset
from collections import defaultdict

In [2]:
datasets = load_dataset("hfl/cmrc2018")
print(datasets)

Using the latest cached version of the dataset since hfl/cmrc2018 couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/ubuntu/.cache/huggingface/datasets/hfl___cmrc2018/default/0.0.0/137f2c45a24275fb68f6961c4d357f46288886aa (last modified on Fri Dec 13 16:46:44 2024).


DatasetDict({
    train: Dataset({
        features: ['id', 'context', 'question', 'answers'],
        num_rows: 10142
    })
    validation: Dataset({
        features: ['id', 'context', 'question', 'answers'],
        num_rows: 3219
    })
    test: Dataset({
        features: ['id', 'context', 'question', 'answers'],
        num_rows: 1002
    })
})


In [3]:
checkpoint = "hfl/chinese-macbert-base"
model = AutoModelForQuestionAnswering.from_pretrained(checkpoint, num_labels=2)  # 问答只有开始和结束
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at hfl/chinese-macbert-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
sample_datasets = datasets["train"].select(range(10))
sample_datasets

Dataset({
    features: ['id', 'context', 'question', 'answers'],
    num_rows: 10
})

In [16]:

answer_list = sample_datasets["answers"]
context_list = sample_datasets["context"]
id_list = sample_datasets["id"]  # 如果数据没有 得使用with_transformers给他生成id
tokenized_sample_datasets = tokenizer(
    text=sample_datasets["question"],
    text_pair=sample_datasets["context"],
    truncation="only_second",
    padding="longest",
    return_offsets_mapping=True,
    return_overflowing_tokens=True,
    stride=128,
    max_length=384
)


# 确定每条的对应关系  主要是为了答案能对的上
overflow_to_sample_mapping = tokenized_sample_datasets["overflow_to_sample_mapping"]
input_ids = tokenized_sample_datasets["input_ids"]
offset_mapping = tokenized_sample_datasets["offset_mapping"]  # overflow之后  offset_mapping是保持原来的句子顺序的

start_position_list = []
end_position_list = []
for i, (index, offset, input_id) in enumerate(zip(overflow_to_sample_mapping, offset_mapping, input_ids)):
    # 确定上下文的token位置
    start_context_index = tokenized_sample_datasets.sequence_ids(i).index(1)
    end_context_index = tokenized_sample_datasets.sequence_ids(i).index(None, start_context_index)
    
    # 确定答案的字符长度
    answer = answer_list[index]
    answer_text = answer["text"][0]
    answer_start = answer["answer_start"][0]
    answer_end = answer_start + len(answer_text)
    
    # 确定答案的token位置
    start_token_index = None
    end_token_index = None
    for i, (start_token, end_token) in enumerate(offset[start_context_index:end_context_index]):
        if start_token == answer_start:
            start_token_index = i + start_context_index
        elif end_token == answer_end:
            end_token_index = i + start_context_index
    
    if start_token_index is None or end_token_index is None:
        start_token_index = 0
        end_token_index = 0
    
    start_position_list.append(start_token_index)
    end_position_list.append(end_token_index)
print(len(tokenized_sample_datasets["input_ids"]))
print(len(start_position_list))




29
29


In [4]:
def process(examples):
    answer_list = examples["answers"]
    
    tokenized_examples = tokenizer(
        text=examples["question"],
        text_pair=examples["context"],
        truncation="only_second",
        padding=True,
        return_offsets_mapping=True,
        return_overflowing_tokens=True,  # 用这个参数 必须同一个长度 padding 必须长度相同
        stride=128,
        max_length=384
    )


    # 确定每条的对应关系  主要是为了答案能对的上
    overflow_to_sample_mapping = tokenized_examples["overflow_to_sample_mapping"]
    offset_mapping = tokenized_examples["offset_mapping"]  # overflow之后  offset_mapping是保持原来的句子顺序的

    start_position_list = []
    end_position_list = []
    for i, (index, offset, ) in enumerate(zip(overflow_to_sample_mapping, offset_mapping)):
        # 确定上下文的token位置
        start_context_index = tokenized_examples.sequence_ids(i).index(1)
        end_context_index = tokenized_examples.sequence_ids(i).index(None, start_context_index)
        
        # 确定答案的字符长度
        answer = answer_list[index]
        answer_text = answer["text"][0]
        answer_start = answer["answer_start"][0]
        answer_end = answer_start + len(answer_text)
        
        # 确定答案的token位置
        start_token_index = None
        end_token_index = None
        for i, (start_token, end_token) in enumerate(offset[start_context_index:end_context_index]):
            if start_token == answer_start:
                start_token_index = i + start_context_index
            elif end_token == answer_end:
                end_token_index = i + start_context_index
        
        if start_token_index is None or end_token_index is None:
            start_token_index = 0
            end_token_index = 0
        
        start_position_list.append(start_token_index)
        end_position_list.append(end_token_index)
    tokenized_examples["start_positions"] = start_position_list
    tokenized_examples["end_positions"] = end_position_list
    return tokenized_examples

In [5]:
tokenized_datasets = datasets.map(process, batched=True, remove_columns=datasets["train"].column_names)  # 这里一定要移除列名，因为overflow会生成数据，导致行数不匹配
print(tokenized_datasets)

Map:   0%|          | 0/10142 [00:00<?, ? examples/s]

Map:   0%|          | 0/3219 [00:00<?, ? examples/s]

Map:   0%|          | 0/1002 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping', 'start_positions', 'end_positions'],
        num_rows: 19189
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping', 'start_positions', 'end_positions'],
        num_rows: 6327
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping', 'start_positions', 'end_positions'],
        num_rows: 1988
    })
})


In [6]:
args  = TrainingArguments(
    "output",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    logging_strategy="steps",
    logging_steps=10,
    save_strategy="steps",
    save_steps=500,
    eval_strategy="steps",
    eval_steps=10
)

In [9]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    # data_collator=DefaultDataCollator(),
    data_collator=DataCollatorWithPadding(tokenizer),
    processing_class=tokenizer,
)

In [10]:
trainer.train()



Step,Training Loss,Validation Loss
10,4.4976,3.046438
20,2.608,1.970672
30,1.9752,1.655736
40,1.7893,1.490922
50,1.715,1.391165
60,1.6724,1.332616
70,1.5198,1.314015
80,1.5166,1.327852
90,1.4348,1.23875
100,1.4604,1.285821


TrainOutput(global_step=450, training_loss=1.2976347965664334, metrics={'train_runtime': 2323.0681, 'train_samples_per_second': 24.781, 'train_steps_per_second': 0.194, 'total_flos': 1.1281552796265984e+16, 'train_loss': 1.2976347965664334, 'epoch': 3.0})

In [11]:
from transformers import pipeline

checkpoint = "output/checkpoint-450"
pipe = pipeline("question-answering",  model=checkpoint , device="cuda:0")
print(pipe)

Device set to use cuda:0


<transformers.pipelines.question_answering.QuestionAnsweringPipeline object at 0x75ecb7c13830>


In [12]:
question = "小明在那里上学"
context = "小明在北京上学"
print(pipe(question=question, context=context))

{'score': 0.251699835062027, 'start': 3, 'end': 5, 'answer': '北京'}


In [13]:
question = "周杰伦什么时候出生的"
context = "周杰伦（1979年1月18日—），台湾创作男歌手、演员、词曲作家及制作人。其音乐风行于大中华地区及全球各地的华人社群，并对华语乐坛产生重大影响，也是史上最具影响力及最著名的华语歌手之一[3][4][5]。"
print(pipe(question=question, context=context))

{'score': 0.3174186944961548, 'start': 4, 'end': 14, 'answer': '1979年1月18日'}


In [14]:
question = "为什么周杰伦在流行音乐创作方面如鱼得水"
context = """
周杰伦在台湾台北县林口乡[注 1]出生长大[12]，为家中的独生子[13][14]。父亲周耀中任教于芦洲国中，教授生物[15]；母亲叶惠美则是林口国中美术老师。14岁时父母离异，由父亲担任监护人，年满18岁后选择与母亲共同生活[16]。周杰伦曾在台湾民视新闻台由胡婉玲主持的节目《台湾演义》专访中澄清《爸，我回来了》这首歌只是对社会上家暴现象的感慨，并非指涉父母间的状况；父亲的亲戚也曾质疑过他，因此还为特别向亲戚们澄清和道歉过[17]。

周杰伦自小对音乐表现出浓厚的兴趣，并且喜欢模仿歌星、演员表演和变魔术。3岁开始学习钢琴。周杰伦国小时住在台北市光华商场附近，就读忠孝国小。国中时就读金华国中[18]，此时期他的父母因长年争执而决议离婚，使周杰伦的性情大受影响。除了音乐外，周杰伦热爱篮球，在国中曾参加过篮球队。

高中就读于台北县私立淡江中学第一届音乐科（本来是想报考华冈艺校，但错过了报名时间，幸好淡江中学恰巧新设了音乐科），主修钢琴，为将来的音乐发展打下了深厚的基础[19]。这时的他因正值青春期，常常秀琴技想吸引女同学的注意。但学科成绩不甚理想，故高中毕业时，大学联考落榜。又因患有僵直性脊椎炎，依据台湾兵役制度得以免服义务兵役[20]。

周杰伦曾表示少年时受到香港乐坛“四大天王”之一张学友的专辑《吻别》的影响，从而喜欢并开始专注于流行音乐[21]。另外他也透露过，除了张学友以外，肖邦、李恕权与史帝夫·汪达也是他童年及成长时影响他很深的人：05年的专辑更以《十一月的肖邦》为标题，07年电影《不能说的秘密》的斗琴多处桥段和肖邦有关，示意对他致敬；李恕权每回出现在电视上，周杰伦便会在电视机面前模仿他；而史帝夫·汪达有一首《I Just Called to Say I love You》是他的婶婶曾在他叔父的葬礼中播放的歌曲。由于他的音乐基础扎实，令其在流行音乐创作方面如鱼得水。"""
print(pipe(question=question, context=context))


{'score': 0.47723063826560974, 'start': 774, 'end': 784, 'answer': '由于他的音乐基础扎实'}
