# Note

ensembleの多様性のため、Gemma2の訓練方法と異なりLlama3.1は生成AIとしての出力を調整することでFine-tuningを実施。  

Gemma2・・・分類用に出力層を追加、3値分類用にカスタム  
Llama3.1・・・モデルの構造変更なし、予測する単語を調整

In [None]:
# Mount Google Drive at /content/drive so the Drive paths used later in this
# notebook (training CSV, checkpoints) are reachable. Requires google.colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Upgrade the libraries used below to their latest releases (IPython shell commands).
# NOTE(review): versions are not pinned, so reruns may pick up different releases.
!pip install -U transformers
!pip install -U tokenizers
!pip install -U peft
!pip install -U bitsandbytes
!pip install -U datasets

# Import

In [None]:
import os
import copy
from dataclasses import dataclass

import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
from datasets import Dataset
from scipy.special import softmax
from sklearn.preprocessing import LabelEncoder
from transformers import (
    BitsAndBytesConfig,
    LlamaPreTrainedModel,
    LlamaModel,
    AutoTokenizer,
    PreTrainedTokenizerBase,
    EvalPrediction,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq,
)
from transformers.modeling_outputs import CausalLMOutputWithPast
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType, PeftModel
from sklearn.metrics import log_loss, accuracy_score

import transformers
print(f"transformers: {transformers.__version__}")


transformers: 4.43.3


In [None]:
# Logging in to Hugging Face is required to load the pretrained model.
# NOTE(review): the original comment named "Gemma2", but model_path in this
# notebook is meta-llama/Meta-Llama-3.1-8B-Instruct — presumably a copy-paste leftover.

!pip install huggingface_hub
from huggingface_hub import notebook_login

notebook_login()



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# CFG

In [None]:
TRAIN_CSV = '/content/drive/Shared drives/BLM_AI開発室共有/05_Kaggle/共有/train.csv'    # path to the training data
RESUME_TRAIN = False      # flag for continuing training from a checkpoint (works around Colab's 24-hour limit)
model_path = "meta-llama/Meta-Llama-3.1-8B-Instruct"     # base model to load
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')    # use the GPU when available
MAX_LENGTH = 2048     # maximum token length
target_columns = ['winner_model_a', 'winner_model_b', 'winner_tie']
columns_to_vectorize = ["prompt", "response_a", "response_b"]

train = pd.read_csv(TRAIN_CSV)
# train = train.head(100)

# Build the target variable

# idxmax picks, per row, the name of the target column holding the largest value;
# LabelEncoder then turns those column names into integer class ids.
# NOTE(review): presumably the ids come out as a=0, b=1, tie=2 (sorted names),
# which is the order LABEL_IDS relies on — confirm.
train['label'] = train[target_columns].idxmax(axis=1)
label_encoder = LabelEncoder()
train['label'] = label_encoder.fit_transform(train['label'])
# Keep only the text columns and the encoded label.
train = train[columns_to_vectorize + ['label']]

# Tokenize

In [None]:
# Load the tokenizer and adjust its settings
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.add_eos_token = True
# Reuse the EOS token as the padding token and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'

# Token ids of the target classes: the first token id produced for each of
# the answer strings 'a', 'b' and 'tie' (no special tokens added).
LABEL_IDS = [tokenizer(i, add_special_tokens=False)["input_ids"][0] for i in ['a', 'b', 'tie']]

# Build one training sample, trimming the texts so it fits the token budget
def tokenize(example, tokenizer, max_length=None, label_ids=None):
    """Turn one dataset row into ``input_ids`` / ``attention_mask`` / ``labels``.

    The sample is laid out as::

        BOS + <prompt> + <response_a> + <response_b> + question + label + EOS

    Only the label token and the EOS token are supervised; every earlier
    position gets the label ``-100``.

    When the three text segments do not fit, only the LAST turn of the prompt
    and of each response is used and each is cut to its share of the budget:
    the prompt gets at most half, the responses split the rest in proportion
    to their full lengths.

    Args:
        example: Row with ``prompt``, ``response_a``, ``response_b`` (each a
            string holding a list literal of turns, where ``null`` stands for
            an empty turn) and ``label`` (index into ``label_ids``).
        tokenizer: Callable returning ``{"input_ids": [...]}`` for a string,
            exposing ``bos_token_id`` and ``eos_token_id``.
        max_length: Upper bound on ``len(input_ids)``. Defaults to the
            module-level ``MAX_LENGTH``.
        label_ids: Token id per class. Defaults to the module-level
            ``LABEL_IDS``.

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels`` lists of
        equal length.
    """
    if max_length is None:
        max_length = MAX_LENGTH
    if label_ids is None:
        label_ids = LABEL_IDS

    def encode(text):
        return tokenizer(text, add_special_tokens=False)["input_ids"]

    def parse_turns(field):
        # SECURITY NOTE: eval() executes arbitrary code contained in the CSV
        # cell. It is kept so parsing stays identical to the existing pipeline;
        # only run this on trusted data. `null` is mapped to an empty string.
        return eval(example[field], {"null": ""})

    # Parse every field once and reuse the turns for both the full and the
    # last-turn-only encodings.
    prompt_turns = parse_turns('prompt')
    a_turns = parse_turns('response_a')
    b_turns = parse_turns('response_b')

    prompt = encode('<prompt>: ' + " ".join(prompt_turns))
    response_a = encode('\n\n<response_a>: ' + " ".join(a_turns))
    response_b = encode('\n\n<response_b>: ' + " ".join(b_turns))

    extra_prompt = encode('\n\n---------\nWhich is the better response for the prompt ? a or b or tie ?\n\nAnswer: ')

    # Tokens left for prompt + responses once the fixed parts are reserved:
    # the question text plus BOS, the label token and EOS. Previously the
    # question was not subtracted from the response budget and the three
    # special tokens were never accounted for, so truncated samples could
    # exceed the maximum length.
    budget = max(max_length - len(extra_prompt) - 3, 0)

    if len(prompt) + len(response_a) + len(response_b) > budget:
        p_max = budget // 2
        remaining_length = budget - min(p_max, len(prompt))

        a_max = int(remaining_length * (len(response_a) / (len(response_a) + len(response_b))))
        b_max = remaining_length - min(a_max, len(response_a))
        # Keep only the final turn of each field, cut to its share.
        prompt = encode('<prompt>: ' + prompt_turns[-1])[:p_max]
        response_a = encode('\n\n<response_a>: ' + a_turns[-1])[:a_max]
        response_b = encode('\n\n<response_b>: ' + b_turns[-1])[:b_max]

    label_token_id = label_ids[int(example['label'])]
    context = [tokenizer.bos_token_id] + prompt + response_a + response_b + extra_prompt
    answer = [label_token_id, tokenizer.eos_token_id]

    input_ids = context + answer
    attention_mask = len(input_ids) * [1]
    # -100 is ignored by the loss, so only the answer tokens are supervised.
    labels = [-100] * len(context) + answer
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }


In [None]:
# Convert a DataFrame into a tokenized dataset for the model
def load_data(df, tokenizer):
    """Wrap ``df`` in a ``Dataset`` and tokenize every row with ``tokenize``.

    All original columns are removed, so the result holds only the fields
    returned by ``tokenize``.
    """
    dataset = Dataset.from_pandas(df)
    source_columns = dataset.column_names
    return dataset.map(
        tokenize,
        fn_kwargs={'tokenizer': tokenizer},
        remove_columns=source_columns,
    )

# Evaluation metrics
def compute_metrics(pred):
    """Compute accuracy and log loss over the answer classes.

    Args:
        pred: ``(logits, labels)`` pair. ``logits`` holds one row per sample
            with one column per entry of ``LABEL_IDS``; ``labels`` holds the
            label token ids, of which only entries found in ``LABEL_IDS`` are
            used.

    Returns:
        dict with ``accuracy`` and ``log_loss``.
    """
    logits, labels = pred
    label_tokens_ids = np.array(LABEL_IDS)
    token_to_class = {int(token_id): idx for idx, token_id in enumerate(label_tokens_ids)}

    # Keep only the answer tokens, then translate token ids into class indices.
    answer_tokens = labels[np.isin(labels, label_tokens_ids)]
    y_true = np.array([token_to_class[int(token_id)] for token_id in answer_tokens])

    preds = logits.argmax(axis=-1)
    probs = softmax(logits, axis=-1)

    acc = accuracy_score(y_true, preds)
    # Pass the full class list explicitly: without it log_loss infers the
    # classes from y_true and raises when an evaluation set happens to miss
    # one of them.
    log_loss_ = log_loss(y_true, probs, labels=list(range(len(label_tokens_ids))))
    return {'accuracy': acc, 'log_loss': log_loss_}

# Fold settings; in the code shown here they are referenced only by the
# commented-out fold split below.
n_splits = 5
fold_idx = 0
# Tokenize the full training frame.
ds = load_data(train, tokenizer)

# folds = [
#     (
#         [i for i in range(len(ds)) if i % n_splits != fold_idx],
#         [i for i in range(len(ds)) if i % n_splits == fold_idx]
#     )
#     for fold_idx in range(n_splits)
# ]

# train_idx, eval_idx = folds[fold_idx]

Map:   0%|          | 0/57477 [00:00<?, ? examples/s]

# Model

In [None]:
class Llama3ForSFT(LlamaPreTrainedModel):
    """Llama causal LM whose training loss covers only the answer token.

    The network is a ``LlamaModel`` decoder followed by a vocabulary
    projection (``lm_head``). When ``labels`` are given, ``forward`` keeps
    only the positions whose next-token target is one of the module-level
    ``LABEL_IDS`` and only the ``LABEL_IDS`` columns of the logits, so the
    loss is a cross-entropy over ``len(LABEL_IDS)`` classes.
    """

    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.model = LlamaModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.post_init()

    def forward(
        self,
        input_ids= None,
        attention_mask= None,
        position_ids = None,
        past_key_values= None,
        inputs_embeds= None,
        labels= None,
        use_cache= None,
        output_attentions= None,
        output_hidden_states = None,
        return_dict= None,
        cache_position = None,
    ):
        """Run the decoder and, when ``labels`` are given, the class loss.

        Returns:
            ``CausalLMOutputWithPast``. With ``labels``: ``loss`` is the
            cross-entropy over the ``LABEL_IDS`` classes and ``logits`` holds
            one row per answer position with one column per class. Without
            ``labels``: ``loss`` is ``None`` and ``logits`` is the full
            vocabulary logits tensor.
        """
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )
        hidden_states = outputs[0]
        if self.config.pretraining_tp > 1:
            lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0)
            logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)]
            logits = torch.cat(logits, dim=-1)
        else:
            logits = self.lm_head(hidden_states)
        logits = logits.float()

        if labels is None:
            # Previously this path hit an unbound `true_logits` and crashed;
            # without labels there is nothing to select, so return the full
            # vocabulary logits.
            return CausalLMOutputWithPast(
                loss=None,
                logits=logits,
            )

        # Shift so that tokens < n predict n, then flatten the tokens
        shift_logits = logits[..., :-1, :].contiguous().view(-1, self.config.vocab_size)
        shift_labels = labels[..., 1:].contiguous().view(-1)
        # Enable model parallelism
        shift_labels = shift_labels.to(shift_logits.device)

        label_tokens_ids = torch.tensor(LABEL_IDS, device=shift_labels.device)
        # Positions whose target is one of the answer tokens (computed once).
        answer_mask = torch.isin(shift_labels, label_tokens_ids)
        # Map each answer token id to its index in LABEL_IDS without a
        # per-element Python loop: compare against all ids and take the
        # matching column.
        true_labels = (shift_labels[answer_mask].unsqueeze(-1) == label_tokens_ids).long().argmax(dim=-1)
        # Keep only the answer rows and the LABEL_IDS vocabulary columns.
        true_logits = shift_logits[answer_mask][:, label_tokens_ids]

        loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(true_logits, true_labels)

        return CausalLMOutputWithPast(
            loss=loss,
            logits=true_logits,
        )

# 8bit量子化・Peft設定  

    量子化・・・モデルのparameter軽量化  
    PEFT・・・LoRAを追加し、訓練時のパラメーター更新を効率化（Full fine-tuningよりは精度が下がる傾向）  

In [None]:
# First run only: load the base model in 8-bit and attach LoRA adapters.
if not RESUME_TRAIN:
    # bitsandbytes 8-bit loading of the base weights
    bnb_config =  BitsAndBytesConfig(
        load_in_8bit=True,
    )

    # LoRA on the q/k/v/o and gate/up/down projection modules
    peft_config = LoraConfig(
        r=32,
        lora_alpha=64,
        lora_dropout=0.05,
        bias='none',
        inference_mode=False,
        task_type=TaskType.CAUSAL_LM,
        target_modules=['o_proj', "v_proj",
                    "q_proj", "k_proj",
                    'down_proj', 'gate_proj', 'up_proj'],
    )

    model = Llama3ForSFT.from_pretrained(
        model_path,
        quantization_config=bnb_config,
        torch_dtype=torch.float32,
    )
    # Disable the KV cache for training
    model.config.use_cache = False
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, peft_config)
    print(model)
    model.print_trainable_parameters()

# Training arguments  

    訓練時のメモリ効率化、学習率、保存するモデルの設定

In [None]:
# Trainer configuration: checkpoint every 100 steps (only the newest is kept),
# log every 10 steps, one epoch, per-device batch 4 with 2 accumulation steps.
# NOTE(review): evaluation is commented out, so metric_for_best_model /
# greater_is_better presumably have no effect in this run — confirm.
args = TrainingArguments(
    output_dir='/content/drive/Shared drives/BLM_AI開発室共有/05_Kaggle/00_GoogleColab/高橋/Llama3.1-8b-ver6',
    overwrite_output_dir = True,
#     evaluation_strategy = "epoch",
    save_strategy = "steps",
    save_steps=100,
    save_total_limit=1,
    logging_strategy="steps",
    logging_steps=10,
    warmup_steps=20,
    optim="adamw_8bit",
    learning_rate=1e-4,
    per_device_train_batch_size=4,
#     per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    num_train_epochs=1,
    fp16=True,
    metric_for_best_model="log_loss",
    greater_is_better = False,
    report_to="none",
)


# 二回目以降の訓練

In [None]:
# Second and later runs: rebuild the model from a saved checkpoint directory
# and train again on the full dataset.
if RESUME_TRAIN:
    checkpoint_dir = '/content/drive/Shared drives/BLM_AI開発室共有/05_Kaggle/00_GoogleColab/高橋/Llama3.1-8b-ver6/checkpoint-7000'

    bnb_config =  BitsAndBytesConfig(
        load_in_8bit=True,
    )

    # Same LoRA settings as the first-run cell
    peft_config = LoraConfig(
        r=32,
        lora_alpha=64,
        lora_dropout=0.05,
        bias='none',
        inference_mode=False,
        task_type=TaskType.CAUSAL_LM,
        target_modules=['o_proj', "v_proj",
                    "q_proj", "k_proj",
                    'down_proj', 'gate_proj', 'up_proj'],
    )

    # NOTE(review): the weights are loaded from the Trainer checkpoint directory
    # and a NEW LoRA adapter is then added by get_peft_model below. Verify that
    # the previously trained adapter weights are actually the ones being trained.
    model = Llama3ForSFT.from_pretrained(
        checkpoint_dir,
        quantization_config=bnb_config,
        torch_dtype=torch.float32,
    )
    model.config.use_cache = False
    model.config.pad_token_id = tokenizer.pad_token_id
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, peft_config)

    # model = PeftModel.from_pretrained(model, checkpoint_dir)
    model.train()
    model.to(device)

    trainer = Trainer(
        args=args,
        model=model,
    #     train_dataset=ds.select(train_idx),
    #     eval_dataset=ds.select(eval_idx),
        train_dataset = ds,
        data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
        compute_metrics=compute_metrics,
    )

    # NOTE(review): resume_from_checkpoint=False starts a new training run
    # (presumably optimizer, scheduler and step counter are not restored, and
    # the whole dataset is seen again) — confirm this is intended rather than
    # resuming from checkpoint_dir.
    train_result = trainer.train(resume_from_checkpoint=False)

    # NOTE(review): this path contains '05_Kaggle//高橋' (no '00_GoogleColab'
    # segment), unlike the other paths in this notebook — confirm the destination.
    trainer.save_model('/content/drive/Shared drives/BLM_AI開発室共有/05_Kaggle//高橋/Llama3.1-8b-0729')

# 初回訓練  

    訓練時のOOMが解決できないので、妥協策で検証なしでの訓練を実行  
    ※スモールデータで検証済み

In [None]:
# First run: train on the full dataset without an evaluation split, then save
# the resulting model to Drive.
if not RESUME_TRAIN:
    trainer = Trainer(
        args=args,
        model=model,
    #     train_dataset=ds.select(train_idx),
    #     eval_dataset=ds.select(eval_idx),
        train_dataset = ds,
        # Pads input_ids / attention_mask / labels per batch
        data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
        # NOTE(review): no eval_dataset is passed, so compute_metrics is
        # presumably never called in this run — confirm.
        compute_metrics=compute_metrics,
    )
    trainer.train(resume_from_checkpoint=False)
    trainer.save_model('/content/drive/Shared drives/BLM_AI開発室共有/05_Kaggle/00_GoogleColab/高橋/Llama3.1-base2-0731')