# Chapter 5. Fine-tuning Large Language Models

## 5.4 Implementation of natural language inference, semantic similarity calculation, and multiple choice question answering models

### 5.4.3 Multiple-Choice Question Answering

In [1]:
!pip -q install datasets transformers[ja,torch] matplotlib scikit-learn

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m600.9/600.9 kB[0m [31m47.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.4/13.4 MB[0m [31m94.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.4/47.4 MB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m72.0 MB/s[

In [2]:
from transformers import BatchEncoding

def preprocess_multiple_choice(
    example: dict[str, str | int],
    *,
    max_length: int = 64,
) -> BatchEncoding:
    """Convert one multiple-choice question answering example to token IDs.

    The question is paired with every choice and all pairs are encoded by the
    module-level ``tokenizer`` in a single call.

    Args:
        example: A single example with a "question" entry, consecutively
            numbered "choice0", "choice1", ... entries and, optionally, a
            "label" entry.
        max_length: Maximum number of tokens per (question, choice) pair.
            Longer pairs are truncated. Keyword-only so that positional
            arguments supplied by a caller are never mistaken for it.

    Returns:
        The tokenizer output holding one encoded sequence per choice. When
        the example contains "label", its value is stored under "labels".
    """
    # The number of choices is the number of keys that start with "choice"
    num_choices = sum(key.startswith("choice") for key in example)

    # Pair the (repeated) question with each choice
    choice_list = [example[f"choice{i}"] for i in range(num_choices)]
    repeated_question_list = [example["question"]] * num_choices

    # Request truncation explicitly: passing max_length alone makes the
    # tokenizer emit a warning and fall back to this same strategy anyway.
    encoded_example = tokenizer(
        repeated_question_list,
        choice_list,
        max_length=max_length,
        truncation=True,
    )

    # If a label is included in the input, add it to the output
    if "label" in example:
        encoded_example["labels"] = example["label"]
    return encoded_example

In [3]:
import torch
from transformers import BatchEncoding

def collate_fn_multiple_choice(
    features: list[BatchEncoding],
) -> dict[str, torch.Tensor]:
    """Assemble preprocessed multiple-choice examples into one mini-batch.

    Each element of ``features`` holds, for every key except the label, one
    entry per choice. The per-choice entries of all examples are flattened,
    padded together by the module-level ``tokenizer`` and then regrouped so
    that every tensor has a leading (batch_size, num_choices) shape.

    Args:
        features: Preprocessed examples; the number of choices is read from
            the first one.

    Returns:
        A dict of tensors reshaped to (batch_size, num_choices, -1), plus an
        int64 "labels" tensor when the first example carries labels.
    """
    # Same key that preprocess_multiple_choice uses to store the label
    label_name = "labels"

    batch_size = len(features)
    num_choices = len(features[0]["input_ids"])

    # One flat entry per (example, choice) pair, in example-major order
    per_choice_inputs = [
        {
            key: values[choice_idx]
            for key, values in example.items()
            if key != label_name
        }
        for example in features
        for choice_idx in range(num_choices)
    ]

    # Pad all (example, choice) sequences together
    padded = tokenizer.pad(per_choice_inputs, return_tensors="pt")

    # (batch_size * num_choices, seq_len) -> (batch_size, num_choices, seq_len)
    batch = {}
    for key, tensor in padded.items():
        batch[key] = tensor.view(batch_size, num_choices, -1)

    # Without labels (e.g. at prediction time) the inputs alone are the batch
    if label_name not in features[0]:
        return batch

    batch["labels"] = torch.tensor(
        [example[label_name] for example in features], dtype=torch.int64
    )
    return batch

#### Obtaining Model Prediction Results

In [4]:
from transformers import AutoModelForMultipleChoice, AutoTokenizer

# Identifier of the fine-tuned checkpoint to load
model_name = "llm-book/bert-base-japanese-v3-jcommonsenseqa"
# The tokenizer and the multiple-choice model are loaded from the same
# checkpoint; the functions in this notebook read them as module-level globals
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMultipleChoice.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/617 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/231k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/689 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

In [5]:
from transformers import AutoModelForMultipleChoice, AutoTokenizer

def pipeline_multiple_choice(
    examples: dict[str, str] | list[dict[str, str]]
) -> list[dict[str, str]]:
    """Predict for multiple-choice question answering examples"""
    # If a single dict input is given, wrap it in a list
    if isinstance(examples, dict):
        examples = [examples]

    # Convert examples to the input format of the model
    encoded_examples = [
        preprocess_multiple_choice(e) for e in examples
    ]
    batch = collate_fn_multiple_choice(encoded_examples)

    # Move data to the device (CPU/GPU) used by the model
    batch = {k: v.to(model.device) for k, v in batch.items()}

    # Forward computation of the model
    model_output = model.forward(**batch)

    # Obtain the choice strings and prediction probabilities from the model output
    predicted_ids = model_output.logits.argmax(dim=-1).tolist()
    probs = torch.softmax(model_output.logits, dim=-1)
    predicted_probs = [ps[i].item() for ps, i in zip(probs, predicted_ids)]
    predictions = [
        {"prediction": e[f"choice{i}"], "pred_prob": p}
        for e, i, p in zip(examples, predicted_ids, predicted_probs)
    ]

    return predictions

In [6]:
from datasets import load_dataset

# Load the validation split of the JCommonsenseQA configuration of JGLUE;
# the cells below iterate over it and index into it by example position
valid_dataset = load_dataset(
    "llm-book/JGLUE", name="JCommonsenseQA", split="validation"
)

Downloading builder script:   0%|          | 0.00/13.8k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/3.08k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/9.03k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/488k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/62.3k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [7]:
# Use the first GPU when CUDA is available; otherwise stay on the CPU so the
# notebook still runs on machines without a GPU
model = model.to("cuda:0" if torch.cuda.is_available() else "cpu")

In [8]:
from tqdm import tqdm

# ClassLabel instance to retrieve label name information
# (not used in this cell; kept in case other cells rely on it)
class_label = valid_dataset.features["label"]

# One record per validation example: position in the dataset, predicted choice,
# its probability, and the correct choice
results: list[dict[str, int | float | str]] = []
# Wrap the dataset itself (not the enumerate object) so tqdm can read its
# length and display a real progress bar with a total and an ETA
for i, example in enumerate(tqdm(valid_dataset)):
    # Obtain the model's prediction results
    model_prediction = pipeline_multiple_choice(example)[0]
    # Retrieve the correct answer string
    true_label = example["label"]
    correct_answer = example[f"choice{true_label}"]
    # Store necessary information for analysis in results
    results.append(
        {
            "example_id": i,
            "pred_prob": model_prediction["pred_prob"],
            "prediction": model_prediction["prediction"],
            "correct_answer": correct_answer,
        }
    )

0it [00:00, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
470it [00:09, 60.36it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
1119it [00:22, 49.89it/s]


#### Analyzing Overall Trends

#### Error Analysis

In [9]:
# Keep only the records whose predicted choice differs from the correct answer
failed_results = list(
    filter(
        lambda res: res["prediction"] != res["correct_answer"],
        results,
    )
)
# Most confident mistakes first (descending prediction probability)
sorted_failed_results = sorted(
    failed_results, key=lambda res: -res["pred_prob"]
)
# Show the five wrong predictions the model was most confident about
for top_result in sorted_failed_results[:5]:
    question = valid_dataset[top_result["example_id"]]["question"]

    # One print call; sep="\n" puts every field on its own line
    print(
        f"問題 (Question): {question}",
        f"予測 (Prediction): {top_result['prediction']}",
        f"正解 (Correct Answer): {top_result['correct_answer']}",
        f"予測確率 (Prediction Probability): {top_result['pred_prob']:.4f}",
        "----------------",
        sep="\n",
    )

問題 (Question): 何かに通るためにチャレンジする事とは？
予測 (Prediction): 合格する
正解 (Correct Answer): 受験
予測確率 (Prediction Probability): 0.9998
----------------
問題 (Question): 朝に飲む汁物を何と呼ぶ？
予測 (Prediction): おかゆ
正解 (Correct Answer): スープ
予測確率 (Prediction Probability): 0.9997
----------------
問題 (Question): 夜空に見える一番大きいものは？
予測 (Prediction): 星
正解 (Correct Answer): お月様
予測確率 (Prediction Probability): 0.9989
----------------
問題 (Question): 芝居を行う建物を何という？
予測 (Prediction): スタジオ
正解 (Correct Answer): 演芸場
予測確率 (Prediction Probability): 0.9984
----------------
問題 (Question): 場所取りする宴会とは？
予測 (Prediction): パーティ
正解 (Correct Answer): 花見
予測確率 (Prediction Probability): 0.9971
----------------
