In [1]:
import torch
# Check whether a CUDA GPU is usable and abort early if it is not.
# NOTE(review): the training configuration later in this notebook sets fp16=True,
# which presumably needs a GPU — confirm before relaxing this check.
if not torch.cuda.is_available():
    raise RuntimeError('gpu is NOT available')
print('gpu is available')

# Reaching this line means CUDA is available, so the device is always "cuda";
# the previous `else "cpu"` fallback could never be taken.
device = torch.device("cuda")
device

gpu is available


device(type='cuda')

In [2]:
# !pip install transformers[torch] datasets
# !pip install fugashi
# !pip install ipadic
# !pip install sentencepiece

In [3]:
# !pip install git+https://github.com/huggingface/accelerate
# !pip install --upgrade transformers

In [4]:
# !pip install optuna

In [2]:
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import pandas as pd
import torch
import random

In [3]:
from transformers.trainer_utils import set_seed

# Fix the random seed at 42 so that runs are repeatable.
# NOTE(review): the later `random.sample` subsampling presumably relies on this
# call seeding Python's `random` module as well — confirm in the transformers docs.
set_seed(42)

In [4]:
from pprint import pprint
from datasets import load_dataset

# Load the "train" and "validation" splits from the llm-book/wrime-sentiment
# repository on the Hugging Face Hub. Both splits are requested with
# remove_neutral=False (presumably this keeps the neutral class — confirm
# against the dataset card).
train_dataset, valid_dataset = (
    load_dataset("llm-book/wrime-sentiment", split=split_name, remove_neutral=False)
    for split_name in ("train", "validation")
)
# Pretty-print the summary of each split
pprint(train_dataset)
pprint(valid_dataset)

Dataset({
    features: ['sentence', 'label', 'user_id', 'datetime'],
    num_rows: 30000
})
Dataset({
    features: ['sentence', 'label', 'user_id', 'datetime'],
    num_rows: 2500
})


In [5]:
# Shrink both splits to 500 randomly chosen rows for quick experiments.
# Disable this cell to train and evaluate on the full datasets.
# The train split is sampled first, then the validation split, so the order in
# which the `random` generator is consumed stays fixed.
train_dataset, valid_dataset = (
    split.select(random.sample(range(split.num_rows), k=500))
    for split in (train_dataset, valid_dataset)
)
pprint(train_dataset)
pprint(valid_dataset)

Dataset({
    features: ['sentence', 'label', 'user_id', 'datetime'],
    num_rows: 500
})
Dataset({
    features: ['sentence', 'label', 'user_id', 'datetime'],
    num_rows: 500
})


In [6]:
# Load the tokenizer.
# `model_name` is the checkpoint identifier; it is reused by `model_init` so the
# tokenizer and the classification model come from the same checkpoint.
model_name = "cl-tohoku/bert-base-japanese-whole-word-masking"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [7]:
# Tokenization
def preprocess_text(batch):
    """Tokenize the text of one `Dataset.map` call and attach its label.

    In this notebook `Dataset.map` is invoked without `batched=True`, so despite
    its name `batch` is a single example here; the same code also accepts a
    batched mapping because the tokenizer is simply handed `batch['sentence']`.

    Args:
        batch: Mapping with a 'sentence' entry (text to tokenize) and a
            'label' entry (class id).

    Returns:
        The tokenizer output with an extra 'labels' entry copied from
        `batch['label']`.
    """
    # Ask for truncation explicitly. Passing only `max_length` relies on an
    # implicit fallback inside the tokenizer (which also emits a warning);
    # `truncation=True` makes the 512-token cap intentional and unambiguous.
    encoded_batch = tokenizer(batch['sentence'], max_length=512, truncation=True)
    encoded_batch['labels'] = batch['label']
    return encoded_batch

In [8]:
# Tokenize both splits with `preprocess_text`, dropping every original column
# so that only the tokenizer output and the 'labels' entry remain.
encoded_train_dataset, encoded_valid_dataset = (
    split.map(preprocess_text, remove_columns=split.column_names)
    for split in (train_dataset, valid_dataset)
)

In [9]:
# Build mini-batches.
# NOTE(review): DataCollatorWithPadding presumably pads each batch to its longest
# sequence, which is why `preprocess_text` does no padding itself — confirm in the
# transformers docs.
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [10]:
def optuna_hp_space(trial):
    """Sample one hyperparameter configuration from an Optuna trial.

    Args:
        trial: Object exposing `suggest_categorical` and `suggest_float`
            (an Optuna trial when used through `Trainer.hyperparameter_search`).

    Returns:
        Dict with the sampled 'lr_scheduler_type', 'learning_rate',
        'per_device_train_batch_size' and 'weight_decay'.
    """
    # The suggestions are drawn in a fixed order so sampling stays reproducible.
    scheduler_kind = trial.suggest_categorical(
        "lr_scheduler_type", ["constant", "linear", "cosine"]
    )
    # Log-uniform between 1e-6 and 1e-4.
    step_size = trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True)
    train_batch = trial.suggest_categorical(
        "per_device_train_batch_size", [16, 32, 64, 128, 256]
    )
    # Log-uniform between 1e-6 and 1e-1.
    decay = trial.suggest_float("weight_decay", 1e-6, 1e-1, log=True)

    sampled = {}
    sampled["lr_scheduler_type"] = scheduler_kind
    sampled["learning_rate"] = step_size
    sampled["per_device_train_batch_size"] = train_batch
    sampled["weight_decay"] = decay
    return sampled

In [11]:
# モデルの準備
from transformers import AutoModelForSequenceClassification

def model_init(trial):
    """Create a fresh sequence-classification model for one training run.

    Args:
        trial: Hyperparameter-search trial handed over by the Trainer; it is
            not used when building the model.

    Returns:
        The model loaded from `model_name`, configured with the label names of
        `train_dataset`'s "label" feature.
    """
    label_feature = train_dataset.features["label"]
    # Two-way mapping between class ids and label names.
    id2label = dict(enumerate(label_feature.names))
    label2id = {name: idx for idx, name in id2label.items()}
    return AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=label_feature.num_classes,
        label2id=label2id,  # label name -> id
        id2label=id2label,  # id -> label name
    )

In [12]:
# 訓練の実行
from transformers import TrainingArguments

# learning_rate, lr_scheduler_type, per_device_train_batch_size and weight_decay
# are deliberately not set here: the hyperparameter search samples them per trial
# (see optuna_hp_space). per_device_eval_batch_size is left at the library default.
# Earlier fixed values, for reference: batch size 32, learning rate 2e-5,
# "constant" scheduler.
# NOTE(review): the best checkpoint is chosen by accuracy, while the search
# objective (compute_objective) uses eval_f1 — confirm this is intended.
training_args = TrainingArguments(
    # Where checkpoints and results are written
    output_dir="output_wrime",
    # Schedule
    num_train_epochs=3,  # number of epochs
    warmup_ratio=0.1,  # length of the learning-rate warm-up
    # Save, log and evaluate on the validation set once per epoch
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    # After training, reload the checkpoint with the best validation accuracy
    metric_for_best_model="accuracy",
    load_best_model_at_end=True,
    # Automatic mixed-precision training
    fp16=True,
)

In [13]:
# Metric definition
def compute_metrics(pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        pred: Object with `label_ids` (gold labels) and `predictions`
            (scores whose argmax over the last axis is the predicted class).

    Returns:
        Dict with the keys "accuracy" and "f1".
    """
    gold = pred.label_ids
    predicted = pred.predictions.argmax(-1)
    weighted_f1 = f1_score(gold, predicted, average="weighted")
    return {"accuracy": accuracy_score(gold, predicted), "f1": weighted_f1}

In [14]:
from transformers import Trainer

# No ready-made model is passed (model=None): the Trainer builds one through
# `model_init`, which lets the hyperparameter search start every trial from a
# freshly initialized model.
trainer = Trainer(
    args=training_args,
    model_init=model_init,
    model=None,
    data_collator=data_collator,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_valid_dataset,
    compute_metrics=compute_metrics,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
def compute_objective(metrics):
    """Return the value the hyperparameter search optimizes.

    Args:
        metrics: Mapping of evaluation metrics; must contain "eval_f1".

    Returns:
        The value stored under "eval_f1".

    Raises:
        KeyError: If "eval_f1" is missing from `metrics`.
    """
    objective_value = metrics["eval_f1"]
    return objective_value

In [16]:
# Run the Optuna-backed hyperparameter search: 2 trials, each sampled from
# `optuna_hp_space`, keeping the run that maximizes `compute_objective`.
best_trial = trainer.hyperparameter_search(
    hp_space=optuna_hp_space,
    compute_objective=compute_objective,
    n_trials=2,
    direction="maximize",
    backend="optuna",
)

[I 2023-09-26 22:46:51,762] A new study created in memory with name: no-name-1d77028b-3f95-45d0-b438-571feaa2d313
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.0882,1.071015,0.41,0.329907
2,0.9569,1.011538,0.486,0.459388
3,0.8172,0.977785,0.534,0.506148


[I 2023-09-26 22:46:59,916] Trial 0 finished with value: 0.5061482764976959 and parameters: {'lr_scheduler_type': 'cosine', 'learning_rate': 3.1180320531879484e-05, 'per_device_train_batch_size': 32, 'weight_decay': 0.007762469763525453}. Best is trial 0 with value: 0.5061482764976959.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.098,1.09108,0.392,0.315927
2,1.056,1.082356,0.408,0.338301
3,1.0324,1.0776,0.406,0.345394


[I 2023-09-26 22:48:15,983] Trial 1 finished with value: 0.3453943678058103 and parameters: {'lr_scheduler_type': 'linear', 'learning_rate': 1.0575280860102293e-05, 'per_device_train_batch_size': 32, 'weight_decay': 0.006507967965682745}. Best is trial 0 with value: 0.5061482764976959.


In [19]:
# Show the best run found by the search: its run id, objective value and hyperparameters.
print(best_trial)

BestRun(run_id='0', objective=0.5061482764976959, hyperparameters={'lr_scheduler_type': 'cosine', 'learning_rate': 3.1180320531879484e-05, 'per_device_train_batch_size': 32, 'weight_decay': 0.007762469763525453}, run_summary=None)


In [21]:
# # epoch: 100, early stopping
# # 訓練の実行
# from transformers import TrainingArguments

# training_args = TrainingArguments(
#     output_dir="output_wrime",  # 結果の保存フォルダ
#     per_device_train_batch_size=32,  # 訓練時のバッチサイズ
#     per_device_eval_batch_size=32,  # 評価時のバッチサイズ
#     learning_rate=2e-5,  # 学習率
#     lr_scheduler_type="constant",  # 学習率スケジューラの種類
#     warmup_ratio=0.1,  # 学習率のウォームアップの長さを指定
#     num_train_epochs=100,  # エポック数
#     save_strategy="epoch",  # チェックポイントの保存タイミング
#     logging_strategy="epoch",  # ロギングのタイミング
#     evaluation_strategy="epoch",  # 検証セットによる評価のタイミング
#     load_best_model_at_end=True,  # 訓練後に開発セットで最良のモデルをロード
#     metric_for_best_model="eval_loss",  # 最良のモデルを決定する評価指標
#     greater_is_better=False,            # eval_lossは小さいほどよい
#     fp16=True,  # 自動混合精度演算の有効化
# )

In [22]:
# from transformers import Trainer
# from transformers import EarlyStoppingCallback

# trainer = Trainer(
#     model=model,
#     train_dataset=encoded_train_dataset,
#     eval_dataset=encoded_valid_dataset,
#     data_collator=data_collator,
#     args=training_args,
#     compute_metrics=compute_metrics,
#     callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
# )
# trainer.train()