In [17]:
# 学習済みモデルとtokenizerのimport
from transformers import BertForSequenceClassification, BertTokenizer
# データセットの作成
from datasets import load_dataset

# finetuning
import peft

# 訓練時のハイパーパラメータと学習の設定
from transformers import Trainer, TrainingArguments

# 評価用の関数の設定
from sklearn.metrics import accuracy_score, f1_score

In [2]:
# 混同行列の作成
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# モデルの保存
import torch

In [15]:
import os

In [3]:
# Load the pretrained BERT checkpoint with a 3-label sequence-classification head.
# The classifier weights are not part of the checkpoint and are newly initialized.
checkpoint_name = "bert-base-uncased"
base_model = BertForSequenceClassification.from_pretrained(checkpoint_name, num_labels=3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Move the base model onto the GPU; the returned module is displayed as the cell output.
base_model.to("cuda")

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [18]:
# LoRA fine-tuning configuration
peft_config = peft.LoraConfig(
    # Task type: sequence classification
    task_type=peft.TaskType.SEQ_CLS,
    # Keep False while training; switch to True when the adapter is used for inference only
    inference_mode=False,
    # Layers that receive LoRA adapters
    target_modules="all-linear",
    # Rank of the low-rank update matrices
    r=100,
    lora_alpha=32,
    lora_dropout=0.1,
)

In [19]:
# Wrap the base model with the LoRA adapters described by peft_config
sc_model = peft.get_peft_model(base_model, peft_config)
# Move the wrapped model onto the GPU; the returned module is displayed as the cell output
sc_model.to("cuda")

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): BertForSequenceClassification(
      (bert): BertModel(
        (embeddings): BertEmbeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (token_type_embeddings): Embedding(2, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): BertEncoder(
          (layer): ModuleList(
            (0-11): 12 x BertLayer(
              (attention): BertAttention(
                (self): BertSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): L

In [20]:
# Print the number of trainable parameters, the total parameter count,
# and the trainable percentage of the LoRA-wrapped model
sc_model.print_trainable_parameters()

trainable params: 16,821,807 || all params: 126,383,454 || trainable%: 13.310133935728643


In [5]:
# Load the tokenizer belonging to the same checkpoint as the model
tokenizer_checkpoint = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(tokenizer_checkpoint)

In [6]:
# Convert the CSV files into datasets
def _load_csv_split(csv_path):
    """Read one CSV file as a dataset with the columns ``text`` and ``label``.

    NOTE(review): explicit ``column_names`` presumably means the files have no
    header row -- confirm against the CSV files.
    """
    return load_dataset(
        "csv",
        data_files=csv_path,
        column_names=["text", "label"],
        split="train",
    )


train_data = _load_csv_split("train_data.csv")
valid_data = _load_csv_split("valid_data.csv")
test_data = _load_csv_split("test_data.csv")

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [7]:
# Convert the "label" column of every split from string to ClassLabel
train_data = train_data.class_encode_column("label")
valid_data = valid_data.class_encode_column("label")
test_data = test_data.class_encode_column("label")

# Each split is encoded on its own, so a split that lacks one of the classes
# would silently receive a different label -> id mapping. Fail loudly instead.
_train_label_names = train_data.features["label"].names
for _split_name, _split in (("valid", valid_data), ("test", test_data)):
    _split_label_names = _split.features["label"].names
    assert _split_label_names == _train_label_names, (
        f"{_split_name} label names {_split_label_names} differ from "
        f"train label names {_train_label_names}"
    )

Casting to class labels:   0%|          | 0/8208 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/1026 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/1027 [00:00<?, ? examples/s]

In [9]:
# Tokenization function
def tokenize(batch, max_length=128):
    """Tokenize the ``"text"`` entry of a dataset example or batch.

    Parameters
    ----------
    batch : mapping
        Must provide a ``"text"`` entry that is either a single string or a
        list/tuple of values. Elements of a list/tuple are converted with
        ``str()`` before being tokenized.
    max_length : int, optional
        Maximum length forwarded to the tokenizer; longer inputs are truncated.
        Defaults to 128, the value that was previously hard-coded.

    Returns
    -------
    The result of calling the module-level ``tokenizer`` on the text(s) with
    ``padding=True`` and ``truncation=True``.

    Raises
    ------
    ValueError
        If ``batch["text"]`` is neither a string nor a list/tuple.
    """
    texts = batch["text"]
    if isinstance(texts, str):
        # A single example: hand the string to the tokenizer unchanged
        inputs = texts
    elif isinstance(texts, (list, tuple)):
        # A batch: coerce every element to str so non-string cells do not break the tokenizer
        inputs = [str(text) for text in texts]
    else:
        # Unsupported input type
        raise ValueError("Invalid input format. Should be a string, a list/tuple of strings.")
    # padding=True pads to the longest sequence of this call (see the tokenizer documentation)
    return tokenizer(inputs, padding=True, truncation=True, max_length=max_length)

In [10]:
# Apply the tokenization function to every split.
# batch_size=len(split) passes the whole split to tokenize in a single call.
train_data, valid_data, test_data = (
    split.map(tokenize, batched=True, batch_size=len(split))
    for split in (train_data, valid_data, test_data)
)

Map:   0%|          | 0/8208 [00:00<?, ? examples/s]

Map:   0%|          | 0/1026 [00:00<?, ? examples/s]

Map:   0%|          | 0/1027 [00:00<?, ? examples/s]

In [12]:
# Switch every split to the "torch" format, exposing only the listed columns
torch_columns = ("input_ids", "attention_mask", "label")
for split in (train_data, valid_data, test_data):
    split.set_format("torch", columns=list(torch_columns))

In [14]:
# 評価用の関数の設定
from sklearn.metrics import accuracy_score, f1_score
def compute_metrics(result):
    """Compute accuracy and weighted F1 for one evaluation result.

    ``result`` must expose ``label_ids`` (the reference labels) and
    ``predictions`` (an object whose ``argmax(-1)`` yields the predicted
    class per example). Returns a dict with the keys ``"accuracy score"``
    and ``"f1 score"``.
    """
    y_true = result.label_ids
    # The predicted class is the index of the largest score along the last axis
    y_pred = result.predictions.argmax(-1)
    return {
        "accuracy score": accuracy_score(y_true, y_pred),
        "f1 score": f1_score(y_true, y_pred, average="weighted"),
    }

In [23]:
# Training hyperparameters
training_args = TrainingArguments(
    # Output locations
    output_dir="outputs/results/",
    logging_dir="outputs/logs/",
    # Schedule: 5 epochs, evaluating at the end of each one
    num_train_epochs=5,
    evaluation_strategy="epoch",
    # Batch sizes per device
    per_device_train_batch_size=8,
    per_device_eval_batch_size=32,
    # Optimizer settings
    learning_rate=1e-4,
    weight_decay=0.01,
    warmup_steps=500,
)

In [24]:
# Assemble the Trainer
trainer = Trainer(
    # LoRA-wrapped model and the hyperparameters defined above
    model=sc_model,
    args=training_args,
    # Training and validation splits
    train_dataset=train_data,
    eval_dataset=valid_data,
    # Tokenizer and the metric function used during evaluation
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)