# 第5章 大規模言語モデルのファインチューニング

## 5.4 自然言語推論・意味的類似度計算・多肢選択式質問応答モデルの実装

### 5.4.3 多肢選択式質問応答

In [None]:
!pip install datasets transformers[ja,torch] matplotlib scikit-learn

Collecting datasets
  Downloading datasets-2.13.1-py3-none-any.whl (486 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.2/486.2 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers[ja,torch]
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m85.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0 (from datasets)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)
[2K   

In [None]:
from transformers import BatchEncoding

def preprocess_multiple_choice(
    example: dict[str, str]
) -> BatchEncoding:
    """Tokenize one multiple-choice QA example into model input IDs.

    The question is paired with every answer choice, so the returned
    encoding holds one tokenized (question, choice) pair per choice.
    Relies on the module-level ``tokenizer``.

    Args:
        example: A single example with a "question" entry, its answer
            choices stored under "choice0", "choice1", ..., and
            optionally a "label" entry.

    Returns:
        The tokenizer output for all (question, choice) pairs, with the
        example's label copied to "labels" when one is present.
    """
    # The number of choices is the number of keys starting with "choice".
    num_choices = sum(key.startswith("choice") for key in example)

    # Pair the same question with each choice.
    choice_list = [example[f"choice{i}"] for i in range(num_choices)]
    repeated_question_list = [example["question"]] * num_choices

    # Request truncation explicitly: passing max_length on its own makes
    # the tokenizer emit a warning and pick a truncation strategy
    # implicitly.
    encoded_example = tokenizer(
        repeated_question_list,
        choice_list,
        max_length=64,
        truncation=True,
    )

    # Carry the label over to the output when the input has one.
    if "label" in example:
        encoded_example["labels"] = example["label"]
    return encoded_example

In [None]:
import torch
from transformers import BatchEncoding

def collate_fn_multiple_choice(
    features: list[BatchEncoding],
) -> dict[str, torch.Tensor]:
    """Build a mini-batch from encoded multiple-choice QA examples.

    Relies on the module-level ``tokenizer`` for padding.

    Args:
        features: Encoded examples, each holding one sequence per answer
            choice for every tokenizer field and optionally a "labels"
            entry.

    Returns:
        A dict of tensors viewed as (batch size, number of choices, -1),
        plus an int64 "labels" tensor when the first example has a label.
    """
    # Label key, matching what preprocess_multiple_choice writes.
    label_name = "labels"

    num_examples = len(features)
    # The first example determines the number of choices for the batch.
    choices_per_example = len(features[0]["input_ids"])

    # Flatten to one entry per (example, choice) pair, dropping the label.
    flat_features = [
        {
            key: values[choice_idx]
            for key, values in feature.items()
            if key != label_name
        }
        for feature in features
        for choice_idx in range(choices_per_example)
    ]

    # Pad all (example, choice) sequences together.
    flat_batch = tokenizer.pad(flat_features, return_tensors="pt")

    # Regroup the flat (batch size * choices, length) tensors so that the
    # choices of each example sit along the second dimension.
    batch = {}
    for key, tensor in flat_batch.items():
        batch[key] = tensor.view(num_examples, choices_per_example, -1)

    # Collect the labels into a single tensor when they are available.
    if label_name in features[0]:
        batch["labels"] = torch.tensor(
            [feature[label_name] for feature in features],
            dtype=torch.int64,
        )
    return batch

#### モデルの予測結果の取得

In [None]:
from transformers import AutoModelForMultipleChoice, AutoTokenizer

# Checkpoint identifier; the tokenizer and the multiple-choice model are
# both loaded from this same name.
model_name = "llm-book/bert-base-japanese-v3-jcommonsenseqa"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMultipleChoice.from_pretrained(model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/617 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/231k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/689 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

In [None]:
from transformers import AutoModelForMultipleChoice, AutoTokenizer

def pipeline_multiple_choice(
    examples: dict[str, str] | list[dict[str, str]]
) -> list[dict[str, str]]:
    """多肢選択式質問応答の事例について予測"""
    # 単一のdict入力が与えられたときに、listに格納する
    if isinstance(examples, dict):
        examples = [examples]

    # 事例をモデルの入力形式に変換
    encoded_examples = [
        preprocess_multiple_choice(e) for e in examples
    ]
    batch = collate_fn_multiple_choice(encoded_examples)

    # モデルが使用するデバイス上（CPU/GPU）にデータを移動
    batch = {k: v.to(model.device) for k, v in batch.items()}

    # モデルの前向き計算処理
    model_output = model.forward(**batch)

    # モデルの出力から、選択肢の文字列と予測確率を得る
    predicted_ids = model_output.logits.argmax(dim=-1).tolist()
    probs = torch.softmax(model_output.logits, dim=-1)
    predicted_probs = [ps[i].item() for ps, i in zip(probs, predicted_ids)]
    predictions = [
        {"prediction": e[f"choice{i}"], "pred_prob": p}
        for e, i, p in zip(examples, predicted_ids, predicted_probs)
    ]

    return predictions

In [None]:
from datasets import load_dataset

# Load the validation split of the "JCommonsenseQA" configuration of the
# llm-book/JGLUE dataset.
valid_dataset = load_dataset(
    "llm-book/JGLUE", name="JCommonsenseQA", split="validation"
)

Downloading builder script:   0%|          | 0.00/13.8k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/2.90k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/9.00k [00:00<?, ?B/s]

Downloading and preparing dataset jglue/JCommonsenseQA to /root/.cache/huggingface/datasets/llm-book___jglue/JCommonsenseQA/1.1.0/afed02e914319785e72f3ea981b4bd3e00089f2361b1137820c183c6b8173edd...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/488k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/62.3k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset jglue downloaded and prepared to /root/.cache/huggingface/datasets/llm-book___jglue/JCommonsenseQA/1.1.0/afed02e914319785e72f3ea981b4bd3e00089f2361b1137820c183c6b8173edd. Subsequent calls will reuse this data.


In [None]:
# Run inference on the first GPU when one is available; otherwise stay on
# the CPU instead of failing on a machine without CUDA.
model = model.to("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
from tqdm import tqdm

# ClassLabel instance used to look up label-name information.
class_label = valid_dataset.features["label"]

results: list[dict[str, float | str]] = []
# Wrap the dataset itself (not the enumerate object) so that tqdm can read
# its length and display a total and an ETA.
for i, example in enumerate(tqdm(valid_dataset)):
    # Get the model's prediction for this example.
    model_prediction = pipeline_multiple_choice(example)[0]
    # Look up the text of the correct choice.
    true_label = example["label"]
    correct_answer = example[f"choice{true_label}"]
    # Store what the later analysis needs.
    results.append(
        {
            "example_id": i,
            "pred_prob": model_prediction["pred_prob"],
            "prediction": model_prediction["prediction"],
            "correct_answer": correct_answer,
        }
    )

0it [00:00, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
471it [00:09, 44.29it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
1119it [00:20, 53.57it/s]


#### 全体的な傾向の分析

#### エラー分析

In [None]:
# Keep only the examples the model answered incorrectly.
failed_results = [
    result
    for result in results
    if result["correct_answer"] != result["prediction"]
]
# Order the failures from the most to the least confident prediction.
sorted_failed_results = sorted(
    failed_results, key=lambda x: x["pred_prob"], reverse=True
)
# Show the top confidently-wrong examples, one field per line.
for top_result in sorted_failed_results[:5]:
    question = valid_dataset[top_result["example_id"]]["question"]
    print(
        f"問題：{question}",
        f"予測：{top_result['prediction']}",
        f"正解：{top_result['correct_answer']}",
        f"予測確率: {top_result['pred_prob']:.4f}",
        "----------------",
        sep="\n",
    )

問題：何かに通るためにチャレンジする事とは？
予測：合格する
正解：受験
予測確率: 0.9998
----------------
問題：朝に飲む汁物を何と呼ぶ？
予測：おかゆ
正解：スープ
予測確率: 0.9997
----------------
問題：夜空に見える一番大きいものは？
予測：星
正解：お月様
予測確率: 0.9989
----------------
問題：芝居を行う建物を何という？
予測：スタジオ
正解：演芸場
予測確率: 0.9984
----------------
問題：場所取りする宴会とは？
予測：パーティ
正解：花見
予測確率: 0.9971
----------------
