# Hugging Face Transformers 微调语言模型-问答任务-Homework

微调训练一个支持问答任务的模型。

**注意：微调后的模型仍然是通过提取上下文的子串来回答问题的，而不是生成新的文本。**

# 1.准备

### 公共

In [4]:
# 根据你使用的模型和GPU资源情况，调整以下关键参数
squad_v2 = True
model_checkpoint = "distilbert-base-uncased"
batch_size = 64

# 数据集
dataset_dir = "/mnt/workspace/dataset"
dataset_squad_v2 = f"{dataset_dir}/rajpurkar/squad_v2"
dataset_squad_v2_root = f"{dataset_dir}/rajpurkar/squad_v2/squad_v2"

# 模型
model_dir = "/mnt/workspace/models"
model_distilbert_base_uncased = f"{model_dir}/distilbert/distilbert-base-uncased/"
model_distilbert_base_uncased_output_dir = f"{model_dir}/distilbert/distilbert-base-uncased-fine-tune-squad"

# 文本序列最大长度
max_length = 384
# 拆分上下文时，拆分的重叠长度
doc_stride = 128

### 加载数据集、编码器

In [5]:
from datasets import load_dataset
from transformers import AutoTokenizer
import transformers

# 加载 SQuAD 数据集
datasets = load_dataset(dataset_squad_v2)

# 加载tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_distilbert_base_uncased)
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

# 数据填充
pad_on_right = tokenizer.padding_side == "right"

### 数据预处理

#### 处理函数

In [6]:
# 数据预处理
def prepare_train_features(examples):
    # 一些问题的左侧可能有很多空白字符，这对我们没有用，而且会导致上下文的截断失败
    # （标记化的问题将占用大量空间）。因此，我们删除左侧的空白字符。
    examples["question"] = [q.lstrip() for q in examples["question"]]

    # 使用截断和填充对我们的示例进行标记化，但保留溢出部分，使用步幅（stride）。
    # 当上下文很长时，这会导致一个示例可能提供多个特征，其中每个特征的上下文都与前一个特征的上下文有一些重叠。
    # truncation，文本截断的方式，它使得始终对context进行截取
    # 标记后的数据格式 {'input_ids': [[...]], 'attention_mask': [[...]], 'offset_mapping': [[ ...],[...]],'overflow_to_sample_mapping': [0, 0]}
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # 由于一个示例可能给我们提供多个特征（如果它具有很长的上下文），我们需要一个从特征到其对应示例的映射。这个键就提供了这个映射关系。
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # 偏移映射将为我们提供从令牌到原始上下文中的字符位置的映射。这将帮助我们计算开始位置和结束位置。
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # 让我们为这些示例进行标记！
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # 我们将使用 CLS 特殊 token 的索引来标记不可能的答案。
        input_ids = tokenized_examples["input_ids"][i]
        # [CLS] 标记的索引位置
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # 获取与该示例对应的序列（以了解上下文和问题是什么）。
        sequence_ids = tokenized_examples.sequence_ids(i)

        # 一个示例可以提供多个跨度，这是包含此文本跨度的示例的索引。
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        # 如果没有给出答案，则将cls_index设置为答案。
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # 答案在文本中的开始和结束字符索引。
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # 当前跨度在文本中的开始令牌索引。
            token_start_index = 0
            while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
                token_start_index += 1

            # 当前跨度在文本中的结束令牌索引。
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                token_end_index -= 1

            # 检测答案是否超出跨度（在这种情况下，该特征的标签将使用CLS索引）。
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # 否则，将token_start_index和token_end_index移到答案的两端。
                # 注意：如果答案是最后一个单词（边缘情况），我们可以在最后一个偏移之后继续。
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

#### 数据处理

In [7]:
# datasets.map 处理数据集
tokenized_datasets = datasets.map(
    prepare_train_features,
    batched=True,
    remove_columns=datasets["train"].column_names
)




Map:   0%|          | 0/130319 [00:00<?, ? examples/s]

Map:   0%|          | 0/11873 [00:00<?, ? examples/s]

# 2.微调模型

### 微调参数设置

In [8]:
import os
import wandb
import torch
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
from transformers import default_data_collator

os.environ["WANDB_PROJECT"]="llm-dev-qa-fine-tune"
os.environ["WANDB_LOG_MODEL"]="true"
os.environ["WANDB_WATCH"]="true"

#  fuck,v100貌似不支持tf32,所以下面超参数里也注释掉
# torch.backends.cuda.matmul.allow_tf32 = True
# torch.backends.cudnn.allow_tf32 = True

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.empty_cache()

print(device)
print(n_gpu)

# 加载要训练的模型
model = AutoModelForQuestionAnswering.from_pretrained(model_distilbert_base_uncased)

# Data Collator（数据整理器）
data_collator = default_data_collator

# 训练超参数（TrainingArguments）
args = TrainingArguments(
    output_dir=model_distilbert_base_uncased_output_dir,
    # 训练过程中进行评估的策略。在这里设置为 "epoch"，表示在每个训练轮次结束后进行一次评估。也可以设置为 "steps"，表示每训练一定步数后进行评估。
    evaluation_strategy = "epoch",
    # 设置模型的初始学习率
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # 设置训练的轮次（epochs）数
    num_train_epochs=3,
    # 设置权重衰减（weight decay）的系数。权重衰减是一种正则化技术，用于避免模型过拟合训练数据。这里设置为 0.01，表示权重衰减的系数为 0.01
    weight_decay=0.01,
    # tf32=True,
    save_total_limit=2, # 控制生成checkpoints文件夹的数量
    hub_strategy="checkpoint",
    save_strategy="steps",
    save_steps=500,
    ignore_data_skip=False,
    report_to="wandb",
    logging_steps=50
)

# 实例化训练器（Trainer）
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

2024-03-30 12:12:32.708323: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-30 12:12:33.150212: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-30 12:12:33.150245: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-30 12:12:33.153198: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-30 12:12:33.404714: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-30 12:12:33.406879: I tensorflow/core/platform/cpu_feature_guard.cc:182] This Tens

cuda
1


  return self.fget.__get__(instance, owner)()
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at /mnt/workspace/models/distilbert/distilbert-base-uncased/ and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 4.19.24, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [88]:
# 开始训练  watch -n 1 nvidia-smi
# - SQUAD v2
# - model_checkpoint = "distilbert-base-uncased"
# - batch_size = 64
trainer.train()

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112262000016119, max=1.0…

Epoch,Training Loss,Validation Loss
1,1.3623,1.324668
2,1.1949,1.290374
3,1.0931,1.330887


Detected kernel version 4.19.24, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


TrainOutput(global_step=6177, training_loss=1.3704667433346092, metrics={'train_runtime': 4318.7703, 'train_samples_per_second': 91.522, 'train_steps_per_second': 1.43, 'total_flos': 3.873165421863629e+16, 'train_loss': 1.3704667433346092, 'epoch': 3.0})

In [89]:
# 训练完成后，第一时间保存模型权重文件。
model_to_save = trainer.save_model(model_distilbert_base_uncased_output_dir)

# 3.模型评估

**评估模型输出需要一些额外的处理：将模型的预测映射回上下文的部分。**

模型直接输出的是预测答案的`起始位置`和`结束位置`的**logits**

## 函数定义

### prepare_validation_features

In [25]:
def prepare_validation_features(examples):
    # 一些问题的左侧有很多空白，这些空白并不有用且会导致上下文截断失败（分词后的问题会占用很多空间）。
    # 因此我们移除这些左侧空白
    examples["question"] = [q.lstrip() for q in examples["question"]]

    # 使用截断和可能的填充对我们的示例进行分词，但使用步长保留溢出的令牌。这导致一个长上下文的示例可能产生
    # 几个特征，每个特征的上下文都会稍微与前一个特征的上下文重叠。
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # 由于一个示例在上下文很长时可能会产生几个特征，我们需要一个从特征映射到其对应示例的映射。这个键就是为了这个目的。
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    # 我们保留产生这个特征的示例ID，并且会存储偏移映射。
    tokenized_examples["example_id"] = []

    for i in range(len(tokenized_examples["input_ids"])):
        # 获取与该示例对应的序列（以了解哪些是上下文，哪些是问题）。
        sequence_ids = tokenized_examples.sequence_ids(i)
        context_index = 1 if pad_on_right else 0

        # 一个示例可以产生几个文本段，这里是包含该文本段的示例的索引。
        sample_index = sample_mapping[i]
        tokenized_examples["example_id"].append(examples["id"][sample_index])

        # 将不属于上下文的偏移映射设置为None，以便容易确定一个令牌位置是否属于上下文。
        tokenized_examples["offset_mapping"][i] = [
            (o if sequence_ids[k] == context_index else None)
            for k, o in enumerate(tokenized_examples["offset_mapping"][i])
        ]

    return tokenized_examples

### postprocess_qa_predictions

In [24]:
from tqdm.auto import tqdm

def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30):
    all_start_logits, all_end_logits = raw_predictions
    # 构建一个从示例到其对应特征的映射。
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    # 我们需要填充的字典。
    predictions = collections.OrderedDict()

    # 日志记录。
    print(f"正在后处理 {len(examples)} 个示例的预测，这些预测分散在 {len(features)} 个特征中。")

    # 遍历所有示例！
    for example_index, example in enumerate(tqdm(examples)):
        # 这些是与当前示例关联的特征的索引。
        feature_indices = features_per_example[example_index]

        min_null_score = None # 仅在squad_v2为True时使用。
        valid_answers = []
        
        context = example["context"]
        # 遍历与当前示例关联的所有特征。
        for feature_index in feature_indices:
            # 我们获取模型对这个特征的预测。
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            # 这将允许我们将logits中的某些位置映射到原始上下文中的文本跨度。
            offset_mapping = features[feature_index]["offset_mapping"]

            # 更新最小空预测。
            cls_index = features[feature_index]["input_ids"].index(tokenizer.cls_token_id)
            feature_null_score = start_logits[cls_index] + end_logits[cls_index]
            if min_null_score is None or min_null_score < feature_null_score:
                min_null_score = feature_null_score

            # 浏览所有的最佳开始和结束logits，为 `n_best_size` 个最佳选择。
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # 不考虑超出范围的答案，原因是索引超出范围或对应于输入ID的部分不在上下文中。
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    # 不考虑长度小于0或大于max_answer_length的答案。
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )
        
        if len(valid_answers) > 0:
            best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
        else:
            # 在极少数情况下我们没有一个非空预测，我们创建一个假预测以避免失败。
            best_answer = {"text": "", "score": 0.0}
        
        # 选择我们的最终答案：最佳答案或空答案（仅适用于squad_v2）
        if not squad_v2:
            predictions[example["id"]] = best_answer["text"]
        else:
            answer = best_answer["text"] if best_answer["score"] > min_null_score else ""
            predictions[example["id"]] = answer

    return predictions

### 其他

In [31]:
validation_features = datasets["validation"].map(
    prepare_validation_features,
    batched=True,
    remove_columns=datasets["validation"].column_names
)

In [38]:
validation_features

Dataset({
    features: ['input_ids', 'attention_mask', 'offset_mapping', 'example_id'],
    num_rows: 12134
})

In [39]:
raw_predictions = trainer.predict(validation_features)

In [40]:
validation_features

Dataset({
    features: ['input_ids', 'attention_mask', 'offset_mapping', 'example_id'],
    num_rows: 12134
})

In [43]:
# Trainer会隐藏模型不使用的列（在这里是example_id和offset_mapping，我们需要它们进行后处理），
# 所以我们需要将它们重新设置回来
# 这也没隐藏啊，蛋疼
# validation_features.set_format(type=validation_features.format["type"], columns=list(validation_features.features.keys()))


#### 打印比较模型输出和标准答案（Ground-truth）是否一致: 现在不一致咋办？？

In [52]:
import torch

def getAnswer(myTrainer):

    for batch in myTrainer.get_eval_dataloader():
        break
    batch = {k: v.to(myTrainer.args.device) for k, v in batch.items()}
    with torch.no_grad():
        output = myTrainer.model(**batch)
    output.keys()
    
    
    n_best_size = 20
    max_answer_length = 30
    start_logits = output.start_logits[0].cpu().numpy()
    end_logits = output.end_logits[0].cpu().numpy()
    offset_mapping = validation_features[0]["offset_mapping"]

    # 第一个特征来自第一个示例。对于更一般的情况，我们需要将example_id匹配到一个示例索引
    context = datasets["validation"][0]["context"]

    # 收集最佳开始/结束逻辑的索引：
    start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
    end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
    valid_answers = []
    for start_index in start_indexes:
        for end_index in end_indexes:
            # 不考虑超出范围的答案，原因是索引超出范围或对应于输入ID的部分不在上下文中。
            if (
                start_index >= len(offset_mapping)
                or end_index >= len(offset_mapping)
                or offset_mapping[start_index] is None
                or offset_mapping[end_index] is None
            ):
                continue
            # 不考虑长度小于0或大于max_answer_length的答案。
            if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                continue
            if start_index <= end_index: # 我们需要细化这个测试，以检查答案是否在上下文中
                start_char = offset_mapping[start_index][0]
                end_char = offset_mapping[end_index][1]
                valid_answers.append(
                    {
                        "score": start_logits[start_index] + end_logits[end_index],
                        "text": context[start_char: end_char]
                    }
                )

    valid_answers = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[:n_best_size]
    return valid_answers

In [53]:
valid_answers= getAnswer(trainer)
valid_answers[0]

{'score': 0.80216366, 'text': 'in the first half'}

In [49]:
datasets["validation"][0]["answers"]

{'text': ['France', 'France', 'France', 'France'],
 'answer_start': [159, 159, 159, 159]}

#### 输出f1

In [41]:
import collections

examples = datasets["validation"]
features = validation_features

example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
features_per_example = collections.defaultdict(list)
for i, feature in enumerate(features):
    features_per_example[example_id_to_index[feature["example_id"]]].append(i)

# 在原始结果上应用后处理问答结果
final_predictions = postprocess_qa_predictions(datasets["validation"], validation_features, raw_predictions.predictions)

正在后处理 11873 个示例的预测，这些预测分散在 12134 个特征中。


  0%|          | 0/11873 [00:00<?, ?it/s]

使用 `datasets.load_metric` 中加载 `SQuAD v2` 的评估指标

In [42]:
from datasets import load_metric

metric = load_metric("squad_v2" if squad_v2 else "squad",trust_remote_code=True)

if squad_v2:
    formatted_predictions = [{"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in final_predictions.items()]
else:
    formatted_predictions = [{"id": k, "prediction_text": v} for k, v in final_predictions.items()]
references = [{"id": ex["id"], "answers": ex["answers"]} for ex in datasets["validation"]]
metric.compute(predictions=formatted_predictions, references=references)

Using the latest cached version of the module from /root/.cache/huggingface/modules/datasets_modules/metrics/squad_v2/2331e6999295d76b19484c2e2bec2c45e44361978b4622449ee990f0bb47ba5e (last modified on Wed Mar 20 09:57:23 2024) since it couldn't be found locally at squad_v2, or remotely on the Hugging Face Hub.


{'exact': 0.8927819422218479,
 'f1': 4.671281408230188,
 'total': 11873,
 'HasAns_exact': 0.20242914979757085,
 'HasAns_f1': 7.770263859635135,
 'HasAns_total': 5928,
 'NoAns_exact': 1.5811606391925987,
 'NoAns_f1': 1.5811606391925987,
 'NoAns_total': 5945,
 'best_exact': 50.07159100480081,
 'best_exact_thresh': 0.0,
 'best_f1': 50.08135341454637,
 'best_f1_thresh': 0.0}

## 6.Homework：加载本地保存的模型，进行评估和再训练更高的 F1 Score

In [14]:
# 再训练,使用删词训练完的模型
trained_model = AutoModelForQuestionAnswering.from_pretrained(model_distilbert_base_uncased_output_dir)

In [15]:
# 实例化训练器（Trainer）
trained_trainer = Trainer(
    trained_model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

Detected kernel version 4.19.24, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [140]:
# 清理GPU
# del trainer
torch.cuda.empty_cache()

In [16]:
# 开始训练  watch -n 1 nvidia-smi
# - SQUAD v2
# - model_checkpoint = "distilbert-base-uncased-fine-tune-squad"
# - batch_size = 64
trained_trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mdouspeng[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112331244415448, max=1.0…

Epoch,Training Loss,Validation Loss
1,0.858,1.449725
2,0.8167,1.466305
3,0.804,1.549576


wandb: Network error (TransientError), entering retry loop.
Detected kernel version 4.19.24, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


TrainOutput(global_step=6177, training_loss=0.8256300647841199, metrics={'train_runtime': 4302.6013, 'train_samples_per_second': 91.866, 'train_steps_per_second': 1.436, 'total_flos': 3.873165421863629e+16, 'train_loss': 0.8256300647841199, 'epoch': 3.0})

In [17]:
# 训练完成后，第一时间保存模型权重文件。
model_distilbert_base_uncased_fine_tune_2_output_dir = f"{model_dir}/distilbert/distilbert-base-uncased-fine-tune-squad-2"
model2_to_save = trained_trainer.save_model(model_distilbert_base_uncased_fine_tune_2_output_dir)

## 7.再次评估

### 处理

In [38]:
validation_features = datasets["validation"].map(
    prepare_validation_features,
    batched=True,
    remove_columns=datasets["validation"].column_names
)

In [39]:
raw_predictions = trained_trainer.predict(validation_features)

In [40]:
validation_features.set_format(type=validation_features.format["type"], columns=list(validation_features.features.keys()))

In [41]:
final_predictions = postprocess_qa_predictions(datasets["validation"], validation_features, raw_predictions.predictions)

正在后处理 11873 个示例的预测，这些预测分散在 12134 个特征中。


  0%|          | 0/11873 [00:00<?, ?it/s]

In [29]:
metric = load_metric("squad_v2" if squad_v2 else "squad",trust_remote_code=True)

  metric = load_metric("squad_v2" if squad_v2 else "squad",trust_remote_code=True)


In [42]:
if squad_v2:
    formatted_predictions = [{"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in final_predictions.items()]
else:
    formatted_predictions = [{"id": k, "prediction_text": v} for k, v in final_predictions.items()]
references = [{"id": ex["id"], "answers": ex["answers"]} for ex in datasets["validation"]]
metric.compute(predictions=formatted_predictions, references=references)

{'exact': 61.91358544596985,
 'f1': 65.4797797475251,
 'total': 11873,
 'HasAns_exact': 66.19433198380567,
 'HasAns_f1': 73.33694752738931,
 'HasAns_total': 5928,
 'NoAns_exact': 57.64507989907485,
 'NoAns_f1': 57.64507989907485,
 'NoAns_total': 5945,
 'best_exact': 61.938852859428955,
 'best_exact_thresh': 0.0,
 'best_f1': 65.49662468983149,
 'best_f1_thresh': 0.0}