In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# --- Basic settings ---------------------------------------------------------
# The preferred (larger) base model is kept here for reference only:
#   "rinna/japanese-gpt-neox-3.6b-instruction-sft"
MODEL_NAME = "rinna/japanese-gpt2-medium"
LORA_MODEL_PATH = "models/rinna-lora-finetuned"

# --- Tokenizer --------------------------------------------------------------
# Load the slow (non-fast) tokenizer, register the three prompt markers as
# additional special tokens, then write the edited tokenizer to disk.
# NOTE(review): the save directory below starts with "model/" whereas
# LORA_MODEL_PATH starts with "models/" -- confirm this difference is intended.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=False)
prompt_markers = ["###指示:", "###ユーザー:", "###キャラ:"]
tokenizer.add_special_tokens({"additional_special_tokens": prompt_markers})
tokenizer.save_pretrained("model/rinna-finetuned")

# --- Base model -------------------------------------------------------------
# Load the pre-trained weights, then resize the token embeddings so they match
# the tokenizer length after the special tokens were added.
base_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
base_model.resize_token_embeddings(len(tokenizer))

tokenizer_config.json:   0%|          | 0.00/284 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/786k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/534 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/7.37G [00:00<?, ?B/s]

# Task 4: Train GPT on dataset 1 and dataset 2

## Step 1: Change the data format

In [None]:
import pandas as pd
from datasets import Dataset

In [None]:
# create a function to change the data format
def prepare_dataset(df, tone_col=None, prev_col="prev_line", response_col=None):
    """
    Combine the previous line and the reply line (and the tone, if given) into
    prompt fields and wrap them in a HuggingFace Dataset (used for rinna).

    Args:
        df: dataframe with the previous line, the reply line (and the tone)
        tone_col: name of the column with the tone. When None, the instruction
            field contains only the "###指示: " marker.
        prev_col: name of the column with the previous (user) line
        response_col: name of the column with the reply line. When None it
            falls back to "current_line" if tone_col is None and to
            "chigiri_line" otherwise.

    Return:
        dataset: Dataset with the string columns "instruction", "input"
            and "output", one row per row of df

    Raises:
        KeyError: if one of the required columns is missing from df
    """
    # keep the historical per-branch default for the reply column
    if response_col is None:
        response_col = "current_line" if tone_col is None else "chigiri_line"

    # fail early with a readable message instead of a bare pandas KeyError
    required = [prev_col, response_col]
    if tone_col is not None:
        required.append(tone_col)
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(
            f"prepare_dataset: missing column(s) {missing}; "
            f"available columns: {list(df.columns)}"
        )

    # the instruction is the only field that depends on the tone
    if tone_col is None:
        instructions = ["###指示: "] * len(df)
    else:
        instructions = [f"###指示: {tone}の口調で返事をください。" for tone in df[tone_col]]

    # change the format
    df_formatted = pd.DataFrame({
        "instruction": instructions,
        "input": [f"###ユーザー: {line}" for line in df[prev_col]],
        "output": [f"###キャラ: {line}" for line in df[response_col]],
    })

    # create HuggingFace Dataset
    return Dataset.from_pandas(df_formatted)

In [None]:
# Dataset 2: the full Blue Lock line pairs, formatted without a tone column.
df_bluelock = pd.read_csv("data/bluelock_paired.csv")
bl_dataset = prepare_dataset(df_bluelock, tone_col=None)
# show the first formatted example
bl_dataset[0]

{'instruction': '###指示: ',
 'input': '###ユーザー: 勝ったら全国！',
 'output': '###千切: いけ 潔！'}

In [None]:
def tokenize_function(data, max_length=512):
    """
    Tokenize a batch of examples for causal-LM fine-tuning.

    Each example is joined as
    instruction + "<NL>" + input + "<NL>" + output + EOS
    and then truncated / padded to max_length by the module-level tokenizer.

    Args:
        data: a batch from the dataset; a mapping whose "instruction",
            "input" and "output" entries are equally long lists of strings
        max_length: maximum (and padded) length of every returned sequence

    Return:
        tokenized: the tokenizer output with an added "labels" entry. Labels
            equal input_ids on real tokens and are -100 on padding positions
            (attention_mask == 0), so padding does not contribute to the loss.
    """
    # label value that the cross-entropy loss skips
    ignore_index = -100

    # reformat the prompts into a full sentence
    prompts = [
        instruction + "<NL>" + user + "<NL>" + output + tokenizer.eos_token
        for instruction, user, output in zip(data["instruction"], data["input"], data["output"])
    ]

    # tokenize using the tokenizer
    tokenized = tokenizer(prompts, truncation=True, padding="max_length", max_length=max_length)

    # Mask by attention_mask rather than by pad token id: a real EOS token
    # stays a training target even if the tokenizer reuses EOS as padding.
    tokenized["labels"] = [
        [token_id if keep else ignore_index for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(tokenized["input_ids"], tokenized["attention_mask"])
    ]

    return tokenized

## Step 2: Load the model and configure the training settings

In [None]:
from transformers import TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig, TaskType, PeftModel

In [None]:
def load_trained_LoRA(base_model, lora_path):
    """
    Attach previously trained LoRA layers to the base model.

    The adapter is loaded with is_trainable=True.

    Args:
       base_model: base gpt model. Should be the same one the LoRA layers
           were trained on
       lora_path: location of the saved LoRA layers (e.g. LORA_MODEL_PATH)

    Return:
        combined_full_model: the base model wrapped with the loaded LoRA layers
    """
    combined_full_model = PeftModel.from_pretrained(
        base_model,
        lora_path,
        is_trainable=True,
    )
    return combined_full_model

def train_model(base_model, train_data, adapter_path=None, r=8,
                lora_alpha=16, batch_size=4, learning_rate=2e-4, epoch=1,
                target_modules=None):
    """
    Use the LoRA technique to train the gpt base model and save the adapter.

    The adapter is saved to adapter_path when it is given, otherwise to
    LORA_MODEL_PATH.

    Args:
        base_model: the base pre-trained model
        train_data (dataset): un-tokenized dataset to train on
        adapter_path (str): path of previously trained LoRA layers to continue
            training from; None creates new LoRA layers
        r: rank for the LoRA setting (only used for new LoRA layers)
        lora_alpha: scaling factor for the LoRA setting (only used for new
            LoRA layers)
        batch_size: per-device batch size (the system could crash if
            batch_size > 4)
        learning_rate: learning rate
        epoch: number of training epochs
        target_modules: names of the modules to wrap with LoRA when new LoRA
            layers are created. None uses ["c_attn", "c_proj"]. Ignored when
            adapter_path is given.

    Return:
        model: the trained PEFT-wrapped model
    """
    # NOTE(review): get_peft_model is documented to modify the base model in
    # place, so calling this function repeatedly with the same base_model
    # object may stack adapters -- confirm against the PEFT version in use.
    if adapter_path is None:
        # create a new LoRA
        if target_modules is None:
            target_modules = ["c_attn", "c_proj"]
        lora_config = LoraConfig(
            r=r,
            lora_alpha=lora_alpha,
            target_modules=list(target_modules),
            lora_dropout=0.05,
            task_type=TaskType.CAUSAL_LM
        )

        # add LoRA wrapper to the base model
        model = get_peft_model(base_model, lora_config)
    else:
        # use old LoRA
        model = load_trained_LoRA(base_model, adapter_path)

    # tokenize dataset
    tokenized_dataset = train_data.map(tokenize_function, batched=True)

    # model setting
    training_args = TrainingArguments(
        # where to save the checkpoints
        output_dir="./checkpoints",
        per_device_train_batch_size=batch_size,
        gradient_accumulation_steps=8,
        num_train_epochs=epoch,
        logging_steps=10,
        learning_rate=learning_rate,
        save_steps=200,
        save_total_limit=2,
        report_to="none",
        # Trainer cannot infer the label column through the PeftModel wrapper
        # (it logs "No label_names provided ..."), so name it explicitly.
        label_names=["labels"],
    )

    # train the model
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
    )
    trainer.train()
    print("Finish Training.")

    # save the model
    save_path = adapter_path if adapter_path else LORA_MODEL_PATH
    model.save_pretrained(save_path)

    # hand the trained model back so callers can keep using it
    return model

## Step 3: Train with full Blue Lock data

In [None]:
# Step 3 run: train new LoRA layers on the full Blue Lock dataset.
# The trailing semicolon keeps the notebook from echoing the call's result.
train_model(base_model, train_data=bl_dataset);

trainable params: 2,162,688 || all params: 338,293,760 || trainable%: 0.6393




Map:   0%|          | 0/9425 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


## Step 4: Train with Character-only data

In [None]:
# Load the character-only training data and format it with the "tone" column
# as the instruction.
df_character = pd.read_csv("data/chigiri_train_w_tone.csv")
character_dataset = prepare_dataset(df=df_character, tone_col="tone")
# show the first formatted example
character_dataset[0]

Saving chigiri_train_w_tone_1.csv to chigiri_train_w_tone_1 (4).csv


{'instruction': '###指示: 冷静に説明・論理的に助言・分析するの口調で返事をください。',
 'input': '###ユーザー: 一次セレクションはお前らのいる伍号棟 55名全５チームによる、総当たりリーグ戦 上位２チームのみが二次セレクションへと勝ち上がるサバイバルマッチだ',
 'output': '###キャラ: じゃあ ここにいるチームＺ 11人が１つのチームってこと？ 全員フォワードなのに？'}

In [None]:
# Step 4 run: continue training the LoRA layers stored at LORA_MODEL_PATH on
# the character-only dataset, with a higher learning rate and more epochs.
char_model = train_model(
    base_model,
    character_dataset,
    adapter_path=LORA_MODEL_PATH,
    learning_rate=5e-4,
    epoch=5,
)

Map:   0%|          | 0/217 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
10,3.1308
20,3.3432
30,3.0371




Finish Training.




# Citation

@misc{rinna-japanese-gpt2-medium,
    title = {rinna/japanese-gpt2-medium},
    author = {Zhao, Tianyu and Sawada, Kei},
    url = {https://huggingface.co/rinna/japanese-gpt2-medium}
}

@inproceedings{sawada2024release,
    title = {Release of Pre-Trained Models for the {J}apanese Language},
    author = {Sawada, Kei and Zhao, Tianyu and Shing, Makoto and Mitsui, Kentaro and Kaga, Akio and Hono, Yukiya and Wakatsuki, Toshiaki and Mitsuda, Koh},
    booktitle = {Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)},
    month = {5},
    year = {2024},
    pages = {13898--13905},
    url = {https://aclanthology.org/2024.lrec-main.1213},
    note = {\url{https://arxiv.org/abs/2404.01657}}
}
