# 完整代码分为五个部分

本次任务的目标数据集是QNLI，完成自然语言推断任务。

给定一个问句，需要判断给定文本中是否包含该问句的正确答案。

## 下载数据集

利用load_dataset下载数据集

利用load_metric下载数据集评测指标

In [1]:
from datasets import load_dataset, load_metric

dataset = load_dataset("glue", "qnli")

metric = load_metric("glue", "qnli")


  metric = load_metric("glue", "qnli")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


简单查看一下数据集以及评测指标

In [12]:
print(dataset,dataset['train'][0])

DatasetDict({
    train: Dataset({
        features: ['question', 'sentence', 'label', 'idx'],
        num_rows: 104743
    })
    validation: Dataset({
        features: ['question', 'sentence', 'label', 'idx'],
        num_rows: 5463
    })
    test: Dataset({
        features: ['question', 'sentence', 'label', 'idx'],
        num_rows: 5463
    })
}) {'question': 'When did the third Digimon series begin?', 'sentence': 'Unlike the two seasons before it and most of the seasons that followed, Digimon Tamers takes a darker and more realistic approach to its story featuring Digimon who do not reincarnate after their deaths and more complex character development in the original Japanese.', 'label': 1, 'idx': 0}


In [13]:
print(metric)

Metric(name: "glue", features: {'predictions': Value(dtype='int64', id=None), 'references': Value(dtype='int64', id=None)}, usage: """
Compute GLUE evaluation metric associated to each GLUE dataset.
Args:
    predictions: list of predictions to score.
        Each translation should be tokenized into a list of tokens.
    references: list of lists of references for each translation.
        Each reference should be tokenized into a list of tokens.
Returns: depending on the GLUE subset, one or several of:
    "accuracy": Accuracy
    "f1": F1 score
    "pearson": Pearson Correlation
    "spearmanr": Spearman Correlation
    "matthews_correlation": Matthew Correlation
Examples:

    >>> glue_metric = datasets.load_metric('glue', 'sst2')  # 'sst2' or any of ["mnli", "mnli_mismatched", "mnli_matched", "qnli", "rte", "wnli", "hans"]
    >>> references = [0, 1]
    >>> predictions = [0, 1]
    >>> results = glue_metric.compute(predictions=predictions, references=references)
    >>> print(res

## 对数据集进行tokenization

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("google/bert_uncased_L-2_H-128_A-2")

def preprocess_function(examples):
    return tokenizer(examples['sentence'], truncation=True, max_length=512)

encoded_data = dataset.map(preprocess_function,batched=True)

观察一下是否tokenization成功

In [15]:
print(encoded_data['train'][:5])

{'question': ['When did the third Digimon series begin?', 'Which missile batteries often have individual launchers several kilometres from one another?', "What two things does Popper argue Tarski's theory involves in an evaluation of truth?", 'What is the name of the village 9 miles north of Calafat where the Ottoman forces attacked the Russians?', 'What famous palace is located in London?'], 'sentence': ['Unlike the two seasons before it and most of the seasons that followed, Digimon Tamers takes a darker and more realistic approach to its story featuring Digimon who do not reincarnate after their deaths and more complex character development in the original Japanese.', 'When MANPADS is operated by specialists, batteries may have several dozen teams deploying separately in small sections; self-propelled air defence guns may deploy in pairs.', 'He bases this interpretation on the fact that examples such as the one described above refer to two things: assertions and the facts to which t

## 模型下载

In [22]:
model = AutoModelForSequenceClassification.from_pretrained("google/bert_uncased_L-2_H-128_A-2",num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# 设定参数

In [24]:
from transformers import TrainingArguments

batch_size = 16

args = TrainingArguments(
    output_dir="./result",
    evaluation_strategy="epoch", # 在每个epoch结束后测试效果
    save_strategy="epoch", # 每个epoch结束后保存模型
    learning_rate=2e-5, # 学习率
    per_device_train_batch_size=batch_size, # 每个GPU训练的batch size
    per_device_eval_batch_size=batch_size, # 每个GPU测试的batch size
    num_train_epochs=5, # 训练的epoch数
    weight_decay=0.01, # 权重衰减
    load_best_model_at_end=True, # 训练结束后，是否加载在验证集上表现最好的模型
    metric_for_best_model="accuracy" # 准确率为指标
)

## 训练

In [19]:
from transformers import Trainer
import numpy as np

In [25]:
def compute_metrics(eval_pred):
    logits, lables = eval_pred
    predictions = np.argmax(logits,axis=1)
    return metric.compute(predictions=predictions, references=lables)

In [26]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_data["train"],
    eval_dataset=encoded_data["validation"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

In [12]:
trainer.train()

  0%|          | 0/32735 [00:00<?, ?it/s]

{'loss': 0.6887, 'learning_rate': 1.9694516572475945e-05, 'epoch': 0.08}
{'loss': 0.6715, 'learning_rate': 1.9389033144951888e-05, 'epoch': 0.15}
{'loss': 0.6656, 'learning_rate': 1.908354971742783e-05, 'epoch': 0.23}
{'loss': 0.6595, 'learning_rate': 1.8778066289903775e-05, 'epoch': 0.31}
{'loss': 0.6584, 'learning_rate': 1.8472582862379718e-05, 'epoch': 0.38}
{'loss': 0.6561, 'learning_rate': 1.816709943485566e-05, 'epoch': 0.46}
{'loss': 0.6567, 'learning_rate': 1.7861616007331604e-05, 'epoch': 0.53}
{'loss': 0.657, 'learning_rate': 1.7556132579807548e-05, 'epoch': 0.61}
{'loss': 0.6568, 'learning_rate': 1.725064915228349e-05, 'epoch': 0.69}
{'loss': 0.6604, 'learning_rate': 1.6945165724759434e-05, 'epoch': 0.76}
{'loss': 0.6575, 'learning_rate': 1.6639682297235374e-05, 'epoch': 0.84}
{'loss': 0.6457, 'learning_rate': 1.633419886971132e-05, 'epoch': 0.92}
{'loss': 0.6518, 'learning_rate': 1.6028715442187264e-05, 'epoch': 0.99}


  0%|          | 0/342 [00:00<?, ?it/s]

{'eval_loss': 0.6497064232826233, 'eval_accuracy': 0.6121178839465495, 'eval_runtime': 8.0978, 'eval_samples_per_second': 674.626, 'eval_steps_per_second': 42.234, 'epoch': 1.0}
{'loss': 0.6447, 'learning_rate': 1.5723232014663207e-05, 'epoch': 1.07}
{'loss': 0.6458, 'learning_rate': 1.541774858713915e-05, 'epoch': 1.15}
{'loss': 0.6482, 'learning_rate': 1.5112265159615092e-05, 'epoch': 1.22}
{'loss': 0.6424, 'learning_rate': 1.4806781732091035e-05, 'epoch': 1.3}
{'loss': 0.6453, 'learning_rate': 1.4501298304566977e-05, 'epoch': 1.37}
{'loss': 0.6414, 'learning_rate': 1.4195814877042922e-05, 'epoch': 1.45}
{'loss': 0.6532, 'learning_rate': 1.3890331449518865e-05, 'epoch': 1.53}
{'loss': 0.6424, 'learning_rate': 1.3584848021994808e-05, 'epoch': 1.6}
{'loss': 0.647, 'learning_rate': 1.3279364594470752e-05, 'epoch': 1.68}
{'loss': 0.6456, 'learning_rate': 1.2973881166946693e-05, 'epoch': 1.76}
{'loss': 0.6464, 'learning_rate': 1.2668397739422638e-05, 'epoch': 1.83}
{'loss': 0.6456, 'learn

  0%|          | 0/342 [00:00<?, ?it/s]

{'eval_loss': 0.644970715045929, 'eval_accuracy': 0.613399231191653, 'eval_runtime': 8.3942, 'eval_samples_per_second': 650.81, 'eval_steps_per_second': 40.743, 'epoch': 2.0}
{'loss': 0.6396, 'learning_rate': 1.1751947456850468e-05, 'epoch': 2.06}
{'loss': 0.6418, 'learning_rate': 1.144646402932641e-05, 'epoch': 2.14}
{'loss': 0.6388, 'learning_rate': 1.1140980601802354e-05, 'epoch': 2.21}
{'loss': 0.6322, 'learning_rate': 1.0835497174278296e-05, 'epoch': 2.29}
{'loss': 0.6389, 'learning_rate': 1.053001374675424e-05, 'epoch': 2.37}
{'loss': 0.6368, 'learning_rate': 1.0224530319230184e-05, 'epoch': 2.44}
{'loss': 0.6402, 'learning_rate': 9.919046891706126e-06, 'epoch': 2.52}
{'loss': 0.6421, 'learning_rate': 9.613563464182069e-06, 'epoch': 2.6}
{'loss': 0.6384, 'learning_rate': 9.308080036658012e-06, 'epoch': 2.67}
{'loss': 0.6372, 'learning_rate': 9.002596609133956e-06, 'epoch': 2.75}
{'loss': 0.6363, 'learning_rate': 8.697113181609899e-06, 'epoch': 2.83}
{'loss': 0.632, 'learning_rate

  0%|          | 0/342 [00:00<?, ?it/s]

{'eval_loss': 0.6462925672531128, 'eval_accuracy': 0.6177924217462932, 'eval_runtime': 7.8888, 'eval_samples_per_second': 692.497, 'eval_steps_per_second': 43.352, 'epoch': 3.0}
{'loss': 0.6322, 'learning_rate': 7.780662899037727e-06, 'epoch': 3.05}
{'loss': 0.6259, 'learning_rate': 7.47517947151367e-06, 'epoch': 3.13}
{'loss': 0.627, 'learning_rate': 7.169696043989614e-06, 'epoch': 3.21}
{'loss': 0.6294, 'learning_rate': 6.8642126164655576e-06, 'epoch': 3.28}
{'loss': 0.6244, 'learning_rate': 6.558729188941501e-06, 'epoch': 3.36}
{'loss': 0.6371, 'learning_rate': 6.253245761417443e-06, 'epoch': 3.44}
{'loss': 0.6355, 'learning_rate': 5.9477623338933865e-06, 'epoch': 3.51}
{'loss': 0.6336, 'learning_rate': 5.64227890636933e-06, 'epoch': 3.59}
{'loss': 0.6254, 'learning_rate': 5.336795478845274e-06, 'epoch': 3.67}
{'loss': 0.6313, 'learning_rate': 5.031312051321216e-06, 'epoch': 3.74}
{'loss': 0.6304, 'learning_rate': 4.7258286237971595e-06, 'epoch': 3.82}
{'loss': 0.6294, 'learning_rat

  0%|          | 0/342 [00:00<?, ?it/s]

{'eval_loss': 0.648369550704956, 'eval_accuracy': 0.6209042650558302, 'eval_runtime': 7.8795, 'eval_samples_per_second': 693.317, 'eval_steps_per_second': 43.404, 'epoch': 4.0}
{'loss': 0.6294, 'learning_rate': 3.809378341224989e-06, 'epoch': 4.05}
{'loss': 0.6295, 'learning_rate': 3.503894913700932e-06, 'epoch': 4.12}
{'loss': 0.6287, 'learning_rate': 3.198411486176875e-06, 'epoch': 4.2}
{'loss': 0.6187, 'learning_rate': 2.8929280586528187e-06, 'epoch': 4.28}
{'loss': 0.6265, 'learning_rate': 2.5874446311287615e-06, 'epoch': 4.35}
{'loss': 0.6352, 'learning_rate': 2.2819612036047043e-06, 'epoch': 4.43}
{'loss': 0.6223, 'learning_rate': 1.9764777760806476e-06, 'epoch': 4.51}
{'loss': 0.6219, 'learning_rate': 1.670994348556591e-06, 'epoch': 4.58}
{'loss': 0.622, 'learning_rate': 1.3655109210325341e-06, 'epoch': 4.66}
{'loss': 0.6202, 'learning_rate': 1.0600274935084774e-06, 'epoch': 4.73}
{'loss': 0.6266, 'learning_rate': 7.545440659844204e-07, 'epoch': 4.81}
{'loss': 0.6231, 'learning_

  0%|          | 0/342 [00:00<?, ?it/s]

{'eval_loss': 0.650716245174408, 'eval_accuracy': 0.6172432729269632, 'eval_runtime': 8.4368, 'eval_samples_per_second': 647.518, 'eval_steps_per_second': 40.537, 'epoch': 5.0}
{'train_runtime': 2893.0922, 'train_samples_per_second': 181.023, 'train_steps_per_second': 11.315, 'train_loss': 0.6398471140653937, 'epoch': 5.0}


TrainOutput(global_step=32735, training_loss=0.6398471140653937, metrics={'train_runtime': 2893.0922, 'train_samples_per_second': 181.023, 'train_steps_per_second': 11.315, 'train_loss': 0.6398471140653937, 'epoch': 5.0})

## 测试

In [28]:
model_validation = AutoModelForSequenceClassification.from_pretrained("result/checkpoint-32735")

trainer = Trainer(
    model=model_validation,
    args=args,
    train_dataset=encoded_data["train"],
    eval_dataset=encoded_data["validation"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

trainer.evaluate()

  0%|          | 0/342 [00:00<?, ?it/s]

{'eval_loss': 0.650716245174408,
 'eval_accuracy': 0.6172432729269632,
 'eval_runtime': 9.1348,
 'eval_samples_per_second': 598.046,
 'eval_steps_per_second': 37.439}

In [33]:
from transformers import pipeline

classifier = pipeline("text-classification",model="result/checkpoint-32735")

predictions = []

for item in encoded_data["test"]:
    predictions.append(classifier(item["question"]+item["sentence"]))

In [34]:
output_file = "QNLI.tsv"

with open(output_file, "w") as f:
    f.write("index\tprediction\n")
    for i,item in enumerate(predictions):
        pred_str = "not_entailment" if (item[0]["label"] == "LABEL_0") else "entailment"
        f.write(f"{i}\t{pred_str}\n")


与官方的示例提交作比较

注意，官方的示例提交并没有说他是百分之一百正确率

In [35]:
input_file = "official_QNLI.tsv"

with open(output_file, 'r') as f1:
    file1_lines = f1.readlines()

with open(input_file, 'r') as f2:
    file2_lines = f2.readlines()

different_predictions_count = 0
total_samples_count = len(file1_lines)  

for line1, line2 in zip(file1_lines[1:], file2_lines[1:]):
    prediction1 = line1.strip().split('\t')[1]  
    prediction2 = line2.strip().split('\t')[1]

    if prediction1 != prediction2:
        different_predictions_count += 1

# 计算不同预测值的比例
different_predictions_ratio = different_predictions_count / total_samples_count

print(f"准确率: {different_predictions_ratio}")


准确率: 0.5058565153733529
