# 第9章 質問応答

## 9.4 文書検索モデルの実装

### 9.4.3 BPR の実装

#### 準備

In [None]:
# Install the libraries used by this notebook. No versions are pinned here;
# the recorded output below shows datasets 2.13.1 and transformers 4.30.2
# being downloaded for the original run.
# NOTE(review): newer releases may behave differently -- consider pinning.
!pip install datasets torch transformers[ja,torch]

Collecting datasets
  Downloading datasets-2.13.1-py3-none-any.whl (486 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/486.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m481.3/486.2 kB[0m [31m17.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.2/486.2 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
Collecting transformers[ja,torch]
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m72.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0 (from datasets)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB

In [None]:
from transformers.trainer_utils import set_seed

# Set the random seed
set_seed(42)

#### データセットの読み込みと前処理

In [None]:
from datasets import load_dataset

# Load the training split of the AIO (AI王) dataset from the
# llm-book/aio-retriever repository on the Hugging Face Hub
train_dataset = load_dataset("llm-book/aio-retriever", split="train")

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


Downloading builder script:   0%|          | 0.00/3.45k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/2.36k [00:00<?, ?B/s]

Downloading and preparing dataset aio/default to /root/.cache/huggingface/datasets/llm-book___aio/default/1.0.0/cac17d698929ff1c91bb05aa042dc0452dbe353625c90e29071828a6e3775f9c...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/637M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/28.6M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/22335 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset aio downloaded and prepared to /root/.cache/huggingface/datasets/llm-book___aio/default/1.0.0/cac17d698929ff1c91bb05aa042dc0452dbe353625c90e29071828a6e3775f9c. Subsequent calls will reuse this data.


In [None]:
# Check the format and the number of examples of the loaded training set
print(train_dataset)

Dataset({
    features: ['qid', 'competition', 'timestamp', 'section', 'number', 'original_question', 'original_answer', 'original_additional_info', 'question', 'answers', 'passages', 'positive_passage_indices', 'negative_passage_indices'],
    num_rows: 22335
})


In [None]:
from pprint import pprint

# Inspect the contents of the first example of the loaded training set
pprint(train_dataset[0])

{'answers': ['26文字'],
 'competition': 'abc ～the first～',
 'negative_passage_indices': [1,
                              2,
                              3,
                              4,
                              5,
                              6,
                              7,
                              8,
                              9,
                              10,
                              11,
                              12,
                              13,
                              14,
                              15,
                              16,
                              17,
                              18,
                              19,
                              20,
                              21,
                              22,
                              23,
                              24,
                              25,
                              26,
                              27,
                              28,


In [None]:
# Drop training examples that lack a positive passage or a hard-negative
# passage: both index lists must be non-empty for an example to be kept
train_dataset = train_dataset.filter(
    lambda x: all(
        len(x[key]) > 0
        for key in ("positive_passage_indices", "negative_passage_indices")
    )
)

Filter:   0%|          | 0/22335 [00:00<?, ? examples/s]

In [None]:
def filter_passages(example: dict) -> dict:
    """Keep only the first positive passage index of a training example.

    The example dict is updated in place and the same object is returned.
    Only ``"positive_passage_indices"`` is touched; the hard-negative
    indices are left as they are.

    Raises:
        IndexError: if ``"positive_passage_indices"`` is empty.
    """
    first_positive = example["positive_passage_indices"][0]
    example["positive_passage_indices"] = [first_positive]
    return example

# Apply filter_passages to every example of the training set
train_dataset = train_dataset.map(filter_passages)

Map:   0%|          | 0/19596 [00:00<?, ? examples/s]

In [None]:
# Check the format and the number of examples of the training data after preprocessing
print(train_dataset)

Dataset({
    features: ['qid', 'competition', 'timestamp', 'section', 'number', 'original_question', 'original_answer', 'original_additional_info', 'question', 'answers', 'passages', 'positive_passage_indices', 'negative_passage_indices'],
    num_rows: 19596
})


In [None]:
# Load the validation split of the AIO (AI王) dataset from the
# llm-book/aio-retriever repository on the Hugging Face Hub
valid_dataset = load_dataset(
    "llm-book/aio-retriever", split="validation"
)



In [None]:
# Check the format and the number of examples of the loaded validation data
print(valid_dataset)

Dataset({
    features: ['qid', 'competition', 'timestamp', 'section', 'number', 'original_question', 'original_answer', 'original_additional_info', 'question', 'answers', 'passages', 'positive_passage_indices', 'negative_passage_indices'],
    num_rows: 1000
})


In [None]:
# Drop validation examples that lack a positive passage or a hard-negative
# passage: both index lists must be non-empty for an example to be kept
valid_dataset = valid_dataset.filter(
    lambda x: all(
        len(x[key]) > 0
        for key in ("positive_passage_indices", "negative_passage_indices")
    )
)

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
def filter_passages(example: dict) -> dict:
    """Keep only the first positive and first hard-negative index of a validation example.

    The example dict is updated in place and the same object is returned.

    NOTE: this rebinds the name ``filter_passages`` that an earlier cell
    defines for the training set (that version trims only the positives).
    Re-running the training-set ``map`` cell after this cell would therefore
    also trim the hard negatives of the training data.

    Raises:
        IndexError: if either index list is empty.
    """
    # Positives are processed before negatives, one field at a time
    for field in ("positive_passage_indices", "negative_passage_indices"):
        example[field] = [example[field][0]]
    return example

# Apply filter_passages to every example of the validation set
valid_dataset = valid_dataset.map(filter_passages)

Map:   0%|          | 0/864 [00:00<?, ? examples/s]

In [None]:
# Check the format and the number of examples of the validation data after preprocessing
print(valid_dataset)

Dataset({
    features: ['qid', 'competition', 'timestamp', 'section', 'number', 'original_question', 'original_answer', 'original_additional_info', 'question', 'answers', 'passages', 'positive_passage_indices', 'negative_passage_indices'],
    num_rows: 864
})


#### トークナイザと collate 関数の準備

In [None]:
from transformers import AutoTokenizer

# Name of the base model on the Hugging Face Hub
base_model_name = "cl-tohoku/bert-base-japanese-v3"
# Initialize the tokenizer from the model name
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/251 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/231k [00:00<?, ?B/s]

In [None]:
import random
import torch
from torch import Tensor
from transformers import BatchEncoding

def collate_fn(
    examples: list[dict],
) -> dict[str, BatchEncoding | Tensor]:
    """Turn a list of dataset examples into one BPR mini-batch.

    For each example, one positive and one hard-negative passage index are
    drawn with ``random.choice`` (positive first). The two passages are
    appended to the passage batch in that order, so passage ``2 * i`` is the
    positive for question ``i`` and passage ``2 * i + 1`` is its hard negative.

    Args:
        examples: Dataset rows. Each row is read for ``"question"``,
            ``"positive_passage_indices"``, ``"negative_passage_indices"``
            and ``"passages"`` (indexable; items provide ``"title"`` and
            ``"text"``).

    Returns:
        A dict with ``"tokenized_questions"``, ``"tokenized_passages"``
        (title/text pairs) and ``"labels"`` (column of the positive passage
        for each question).
    """
    questions: list[str] = []
    titles: list[str] = []
    texts: list[str] = []

    for example in examples:
        questions.append(example["question"])

        # Draw the positive first, then the hard negative
        pos_idx = random.choice(example["positive_passage_indices"])
        neg_idx = random.choice(example["negative_passage_indices"])

        candidates = example["passages"]
        titles += [candidates[pos_idx]["title"], candidates[neg_idx]["title"]]
        texts += [candidates[pos_idx]["text"], candidates[neg_idx]["text"]]

    # Options shared by both tokenizer calls
    shared_options = {"padding": True, "return_tensors": "pt"}

    # Questions are encoded on their own
    tokenized_questions = tokenizer(
        questions, truncation=True, max_length=128, **shared_options
    )
    # Passages are encoded as (title, text) pairs; only the text is truncated
    tokenized_passages = tokenizer(
        titles,
        texts,
        truncation="only_second",
        max_length=256,
        **shared_options,
    )

    # In the question-by-passage score matrix, row i (question i) has its
    # positive passage in column 2 * i: labels = [0, 2, 4, ...]
    labels = torch.arange(len(questions)) * 2

    return {
        "tokenized_questions": tokenized_questions,
        "tokenized_passages": tokenized_passages,
        "labels": labels,
    }

#### モデルの準備

In [None]:
import math
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModel
from transformers.utils import ModelOutput

class BPRModel(nn.Module):
    """BPR (Binary Passage Retriever) model with a question and a passage encoder.

    Each encoder output is reduced to the hidden state at sequence position 0.
    While training, embeddings are squashed with ``tanh(beta * x)`` where
    ``beta = sqrt(1 + 0.1 * global_step)``; in evaluation mode they are
    binarized to +1 / -1.
    """

    def __init__(self, base_model_name: str):
        """Load both encoders from the same pretrained checkpoint.

        Args:
            base_model_name: Name or path handed to
                ``AutoModel.from_pretrained`` for each encoder.
        """
        super().__init__()

        # The two encoders are loaded separately from the same checkpoint
        self.question_encoder = AutoModel.from_pretrained(base_model_name)
        self.passage_encoder = AutoModel.from_pretrained(base_model_name)

        # Number of training-mode forward passes so far; controls the
        # sharpness of the tanh approximation in binary_encode
        self.global_step = 0

    def binary_encode(self, x: Tensor) -> Tensor:
        """Map real-valued embeddings to (approximately) binary embeddings.

        Args:
            x: Real-valued embeddings.

        Returns:
            In evaluation mode, a tensor holding 1.0 where ``x >= 0`` and
            -1.0 elsewhere. In training mode, ``tanh(beta * x)`` with
            ``beta = (1 + 0.1 * global_step) ** 0.5``.
        """
        if not self.training:
            # Evaluation: hard binarization (zero maps to +1)
            return torch.where(x >= 0, 1.0, -1.0).to(x.device)

        # Training: tanh stand-in for the sign function; beta grows with
        # the number of training steps
        beta = math.pow((1.0 + self.global_step * 0.1), 0.5)
        return torch.tanh(x * beta)

    def encode_questions(
        self, tokenized_questions: BatchEncoding
    ) -> tuple[Tensor, Tensor]:
        """Encode questions into real-valued and binary embeddings.

        Returns:
            ``(real_embeddings, binary_embeddings)``; the binary embeddings
            are ``binary_encode`` applied to the real ones.
        """
        question_outputs = self.question_encoder(**tokenized_questions)
        # Hidden state at position 0 of every sequence
        real_embeddings = question_outputs.last_hidden_state[:, 0]
        return real_embeddings, self.binary_encode(real_embeddings)

    def encode_passages(
        self, tokenized_passages: BatchEncoding
    ) -> Tensor:
        """Encode passages into binary embeddings."""
        passage_outputs = self.passage_encoder(**tokenized_passages)
        # Hidden state at position 0 of every sequence
        real_embeddings = passage_outputs.last_hidden_state[:, 0]
        return self.binary_encode(real_embeddings)

    def compute_loss(
        self,
        encoded_questions: Tensor,
        binary_encoded_questions: Tensor,
        binary_encoded_passages: Tensor,
        labels: Tensor,
    ) -> Tensor:
        """Compute the BPR loss: candidate-generation loss plus reranking loss.

        Args:
            encoded_questions: Real-valued question embeddings.
            binary_encoded_questions: Binary question embeddings.
            binary_encoded_passages: Binary passage embeddings.
            labels: For each question, the row index of its positive passage
                in ``binary_encoded_passages``.

        Returns:
            Scalar tensor: margin ranking loss on binary-binary scores plus
            cross-entropy loss on real-binary scores.
        """
        num_passages = binary_encoded_passages.size(0)
        passages_t = binary_encoded_passages.transpose(0, 1)

        # Candidate generation: inner products of binary question and binary
        # passage embeddings. Each positive score is ranked against every
        # other passage score of the same question.
        hash_scores = binary_encoded_questions @ passages_t
        is_positive = F.one_hot(labels, num_classes=num_passages).bool()
        # Repeat each positive once per non-positive passage in its row so
        # it lines up with the flattened negatives
        positives = hash_scores[is_positive].repeat_interleave(
            num_passages - 1
        )
        negatives = hash_scores[~is_positive]
        ranking_target = torch.ones_like(positives, dtype=torch.long)
        loss_cand = F.margin_ranking_loss(
            positives, negatives, ranking_target, margin=0.1
        )

        # Reranking: inner products of real-valued question and binary
        # passage embeddings, trained with cross-entropy toward the positive
        rerank_scores = encoded_questions @ passages_t
        loss_rerank = F.cross_entropy(rerank_scores, labels)

        return loss_cand + loss_rerank

    def forward(
        self,
        tokenized_questions: BatchEncoding,
        tokenized_passages: BatchEncoding,
        labels: Tensor,
    ) -> ModelOutput:
        """Run both encoders and return the BPR loss.

        Returns:
            ``ModelOutput`` whose only field is ``loss``.
        """
        real_questions, binary_questions = self.encode_questions(
            tokenized_questions
        )
        binary_passages = self.encode_passages(tokenized_passages)

        loss = self.compute_loss(
            real_questions, binary_questions, binary_passages, labels
        )

        # Count this step only in training mode, after the loss was computed
        # with the current value of global_step
        if self.training:
            self.global_step += 1

        return ModelOutput(loss=loss)

# Initialize the BPR model
model = BPRModel(base_model_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/447M [00:00<?, ?B/s]

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-v3 were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-v3 were not used when initializing BertM

#### `Trainer` の準備

In [None]:
from transformers import TrainingArguments

# Hyperparameters for BPR training
training_args = TrainingArguments(
    output_dir="outputs_bpr",  # directory where results are saved
    per_device_train_batch_size=32,  # batch size for training
    per_device_eval_batch_size=32,  # batch size for evaluation
    learning_rate=1e-5,  # learning rate
    max_grad_norm=2.0,  # maximum norm for gradient clipping
    num_train_epochs=20,  # number of training epochs
    warmup_ratio=0.1,  # length of the learning-rate warmup
    lr_scheduler_type="linear",  # type of learning-rate scheduler
    evaluation_strategy="epoch",  # when to evaluate on the validation set
    logging_strategy="epoch",  # when to log
    save_strategy="epoch",  # when to save checkpoints
    save_total_limit=1,  # maximum number of checkpoints to keep
    fp16=True,  # enable automatic mixed precision
    remove_unused_columns=False,  # keep all dataset fields; collate_fn reads them from the raw examples
)

In [None]:
from transformers import Trainer

# Initialize the Trainer for BPR
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

#### 訓練の実行

In [None]:
# Train the BPR model
trainer.train()



Epoch,Training Loss,Validation Loss
1,4.1877,1.985377
2,0.5772,1.368297
3,0.3962,1.394388
4,0.2943,1.274348
5,0.2318,1.488862
6,0.1992,1.384777
7,0.1719,1.386523
8,0.1484,1.442091
9,0.1379,1.557213
10,0.1278,1.51152


TrainOutput(global_step=12260, training_loss=0.36365611525966524, metrics={'train_runtime': 6121.0281, 'train_samples_per_second': 64.028, 'train_steps_per_second': 2.003, 'total_flos': 0.0, 'train_loss': 0.36365611525966524, 'epoch': 20.0})

#### トークナイザとモデルの保存

In [None]:
# Save the question encoder, with the tokenizer in the same directory
question_encoder_path = "outputs_bpr/question_encoder"
model.question_encoder.save_pretrained(question_encoder_path)
tokenizer.save_pretrained(question_encoder_path)

# Save the passage encoder, with the tokenizer in the same directory
passage_encoder_path = "outputs_bpr/passage_encoder"
model.passage_encoder.save_pretrained(passage_encoder_path)
tokenizer.save_pretrained(passage_encoder_path)

('outputs_bpr/passage_encoder/tokenizer_config.json',
 'outputs_bpr/passage_encoder/special_tokens_map.json',
 'outputs_bpr/passage_encoder/vocab.txt',
 'outputs_bpr/passage_encoder/added_tokens.json')

#### Google ドライブへの保存

In [None]:
from google.colab import drive

# Mount Google Drive
drive.mount("drive")

Mounted at drive


In [None]:
# Copy the saved model into a folder on Google Drive
!mkdir -p drive/MyDrive/llm-book
!cp -r outputs_bpr drive/MyDrive/llm-book