85

In [3]:
!pip install pandas transformers



In [4]:
import pandas as pd
#テクストのtokenizerを取得するクラス、from_pretrained("model_name")メソッドでモデルに対応したtokenizerを取得可能
from transformers import AutoTokenizer

"""
重要
tokenizer.tokenize(text)ではサブワード単位の文字列配列に変換
ex)sentenceとsubwordの分割例
    hide new secretions from the parental units
    ['hide', 'Ġnew', 'Ġsecret', 'ions', 'Ġfrom', 'Ġthe', 'Ġparental', 'Ġunits', 'Ġ']
tokenizer(text)はモデルに入力する整数IDを返す
ex)sentenceと整数IDの分割例
    hide new secretions from the parental units
    'input_ids': [50281, 21179, 747, 4279, 621, 432, 253, 17087, 5085, 209, 50282]
    50281は[CLS];文全体の意味を表す特殊トークン,50282は[SEP];文の区切りを表す特殊トークン
"""

# データの読み込み
def load_data(file_path):
    #header=0で最初の行を列名として扱う
    df = pd.read_csv(file_path, sep="\t", header=0)
    return df["sentence"].tolist(), df["label"].tolist()


# テキストをトークン列に変換
def tokenize_texts(texts):
    tokenized_texts = []
    for text in texts:
        # トークン化（特殊トークンを追加）
        tokens = tokenizer.tokenize(text)
        tokenized_texts.append(tokens)
    return tokenized_texts


# モデルとトークナイザーの読み込み
model_id = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# データファイルのパス
train_path = "train.tsv"
dev_path = "dev.tsv"

# 訓練データと開発データの読み込み
train_texts, train_labels = load_data(train_path)
dev_texts, dev_labels = load_data(dev_path)

# トークン化の実行
train_tokenized = tokenize_texts(train_texts)
dev_tokenized = tokenize_texts(dev_texts)

In [5]:
print(train_tokenized)

Output hidden; open in https://colab.research.google.com to view.

86

In [6]:
!pip install torch



In [7]:
import torch

#最初の4事例の選択
sample_texts = train_texts[:4]
sample_labels = train_labels[:4]
sample_tokenized = train_tokenized[:4]

#パディングとトークンIDへの変換
encoded = tokenizer(sample_texts, padding=True)

#結果の表示
print("元のテキスト:")
for text in sample_texts:
    print(f"-{text}")

print("\nトークン列:")
for tokens in sample_tokenized:
    print(f"-{tokens}")

print("\nパディング後のトークンID:")
print(encoded["input_ids"])

print("\nアテンションマスク:")
print(encoded["attention_mask"])

print("\nラベル:")
print(torch.tensor(sample_labels))


元のテキスト:
-hide new secretions from the parental units 
-contains no wit , only labored gags 
-that loves its characters and communicates something rather beautiful about human nature 
-remains utterly satisfied to remain the same throughout 

トークン列:
-['hide', 'Ġnew', 'Ġsecret', 'ions', 'Ġfrom', 'Ġthe', 'Ġparental', 'Ġunits', 'Ġ']
-['contains', 'Ġno', 'Ġwit', 'Ġ,', 'Ġonly', 'Ġlab', 'ored', 'Ġg', 'ags', 'Ġ']
-['that', 'Ġloves', 'Ġits', 'Ġcharacters', 'Ġand', 'Ġcommunic', 'ates', 'Ġsomething', 'Ġrather', 'Ġbeautiful', 'Ġabout', 'Ġhuman', 'Ġnature', 'Ġ']
-['rem', 'ains', 'Ġutterly', 'Ġsatisfied', 'Ġto', 'Ġremain', 'Ġthe', 'Ġsame', 'Ġthroughout', 'Ġ']

パディング後のトークンID:
[[50281, 21179, 747, 4279, 621, 432, 253, 17087, 5085, 209, 50282, 50283, 50283, 50283, 50283, 50283], [50281, 24634, 642, 19311, 1157, 760, 5188, 2149, 305, 3544, 209, 50282, 50283, 50283, 50283, 50283], [50281, 3529, 14528, 697, 5810, 285, 3461, 684, 1633, 2581, 5389, 670, 1966, 3753, 209, 50282], [50281, 2013, 1550, 23228, 10

87

In [8]:
!pip install datasets evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [10]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)
from datasets import Dataset
import pandas as pd
import evaluate


def compute_metrics(eval_pred):
    metric = evaluate.load("accuracy")
    predictions, labels = eval_pred
    predictions = predictions.argmax(axis=1)
    return metric.compute(predictions=predictions, references=labels)


def main():
    # モデルとトークナイザーの読み込み
    model_name = "llm-jp/llm-jp-3-150m-instruct3"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # パディングトークンの設定
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
    # モデルのパディングトークンIDも設定
    model.config.pad_token_id = tokenizer.pad_token_id

    # データの読み込み
    train_path = "train.tsv"
    dev_path = "dev.tsv"

    train_df = pd.read_csv(train_path, sep="\t", header=0)
    dev_df = pd.read_csv(dev_path, sep="\t", header=0)

    # データセットの作成
    def tokenize_function(examples):
        return tokenizer(
            examples["sentence"],
            padding="max_length",
            truncation=True,
            max_length=512,
        )

    train_dataset = Dataset.from_pandas(train_df)
    val_dataset = Dataset.from_pandas(dev_df)

    train_dataset = train_dataset.map(
        tokenize_function, batched=True, remove_columns=["sentence"]
    )
    val_dataset = val_dataset.map(
        tokenize_function, batched=True, remove_columns=["sentence"]
    )

    # トレーニング引数の設定
    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=3,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        logging_dir="./logs",
        report_to="none",
    )

    # トレーナーの作成
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )

    # 学習の実行
    trainer.train()

    # 最終評価
    eval_results = trainer.evaluate()
    print(f"最終評価結果: {eval_results}")


if __name__ == "__main__":
    main()

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at llm-jp/llm-jp-3-150m-instruct3 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Step,Training Loss
500,0.6278
1000,0.3429
1500,0.3039
2000,0.2774
2500,0.2206
3000,0.1903
3500,0.1902
4000,0.1772
4500,0.1398
5000,0.122


Downloading builder script: 0.00B [00:00, ?B/s]

最終評価結果: {'eval_loss': 0.6531379818916321, 'eval_accuracy': 0.8291284403669725, 'eval_runtime': 18.4023, 'eval_samples_per_second': 47.385, 'eval_steps_per_second': 1.522, 'epoch': 3.0}


88

In [14]:
import os
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

class PredictionDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=128):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length
    def __len__(self): return len(self.texts)
    def __getitem__(self, idx):
        enc = self.tokenizer(
            self.texts[idx],
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return {"input_ids": enc["input_ids"].squeeze(),
                "attention_mask": enc["attention_mask"].squeeze()}

def main():
    base = "./results"
    ckpts = [d for d in os.listdir(base) if d.startswith("checkpoint-")]
    ckpt = os.path.join(base, sorted(ckpts, key=lambda s: int(s.split("-")[-1]))[-1])
    print("使用するモデル:", ckpt)

    # ★ 学習時に使った“元モデルID”のトークナイザを使う
    # 例）llm-jpで学習しているなら ↓ を使う
    tokenizer = AutoTokenizer.from_pretrained("llm-jp/llm-jp-3-150m-instruct3")
    # ModernBERTで学習したチェックポイントならこちらに切替：
    # tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token or tokenizer.unk_token

    model = AutoModelForSequenceClassification.from_pretrained(ckpt)

    sentences = [
        "The movie was full of incomprehensibilities.",
        "The movie was full of fun.",
        "The movie was full of excitement.",
        "The movie was full of crap.",
        "The movie was full of rubbish.",
    ]
    ds = PredictionDataset(sentences, tokenizer)

    args = TrainingArguments(output_dir="./results", per_device_eval_batch_size=32, report_to="none")
    trainer = Trainer(model=model, args=args)
    out = trainer.predict(ds)
    probs = torch.softmax(torch.tensor(out.predictions), dim=1)

    for s, p in zip(sentences, probs):
        print("文:", s)
        print(f"肯定的: {float(p[1]):.4f} / 否定的: {float(p[0]):.4f}")
        print("予測ラベル:", "肯定的" if p[1] > p[0] else "否定的")
        print("-"*40)

main()


使用するモデル: ./results/checkpoint-6315


文: The movie was full of incomprehensibilities.
肯定的: 0.0088 / 否定的: 0.9912
予測ラベル: 否定的
----------------------------------------
文: The movie was full of fun.
肯定的: 0.9940 / 否定的: 0.0060
予測ラベル: 肯定的
----------------------------------------
文: The movie was full of excitement.
肯定的: 0.9902 / 否定的: 0.0098
予測ラベル: 肯定的
----------------------------------------
文: The movie was full of crap.
肯定的: 0.0014 / 否定的: 0.9986
予測ラベル: 否定的
----------------------------------------
文: The movie was full of rubbish.
肯定的: 0.0455 / 否定的: 0.9545
予測ラベル: 否定的
----------------------------------------


89

In [20]:
import torch.nn as nn
from transformers import AutoModel

class MaxPoolingClassifier(nn.Module):
    def __init__(self, model_name, num_labels=2):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, labels=None):
        # BERTの出力を取得
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # 各トークンの最大値プーリング
        pooled_output = outputs.last_hidden_state.max(dim=1)[0]

        # ドロップアウトと分類
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, 2), labels.view(-1))

        return {"loss": loss, "logits": logits}

def load_data(file_path):
    df = pd.read_csv(file_path, sep="\t", header=0)
    return df["sentence"].tolist(), df["label"].tolist()

# データセットクラスの定義
class SSTDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(),
            "attention_mask": encoding["attention_mask"].squeeze(),
            "labels": torch.tensor(label, dtype=torch.long),
        }

# 評価用の関数
def compute_metrics(eval_pred):
    metric = evaluate.load("accuracy")
    predictions, labels = eval_pred
    predictions = predictions.argmax(axis=1)
    return metric.compute(predictions=predictions, references=labels)

def main():
    # モデルとトークナイザーの読み込み
    model_id = "answerdotai/ModernBERT-base"
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = MaxPoolingClassifier(model_id)

    # データの読み込み
    train_path = "train.tsv"
    dev_path = "dev.tsv"
    train_texts, train_labels = load_data(train_path)
    dev_texts, dev_labels = load_data(dev_path)

    # データセットの作成
    train_dataset = SSTDataset(train_texts, train_labels, tokenizer)
    dev_dataset = SSTDataset(dev_texts, dev_labels, tokenizer)

    # トレーニング引数の設定
    training_args = TrainingArguments(
        output_dir="./results_maxpool",
        num_train_epochs=3,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_dir="./logs",
        report_to="none",
    )

    # トレーナーの作成
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=dev_dataset,
        compute_metrics=compute_metrics,
    )

    # 訓練の実行
    trainer.train()

    # 最終的な評価
    eval_results = trainer.evaluate()
    print(f"最終的な評価結果: {eval_results}")

main()

"""
考察
モデルが、文章中の強調、否定などに関する語(exciding, sad)のある次元(152次元など)は絶対値が大きい。
Max =oolingは各次元において、文中のトークンで最も大きい活性を拾う。つまり、強調、肯定、否定の意味を持つトークンによって、
モデルが意味を拾うことができ、正解率が高まった。
"""

W1108 07:50:48.608000 783 torch/_inductor/utils.py:1436] [1/0_1] Not enough SMs to use max_autotune_gemm mode


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1714,0.204961,0.919725
2,0.1081,0.203412,0.934633
3,0.0514,0.213135,0.940367


最終的な評価結果: {'eval_loss': 0.21313463151454926, 'eval_accuracy': 0.9403669724770642, 'eval_runtime': 9.6629, 'eval_samples_per_second': 90.242, 'eval_steps_per_second': 2.898, 'epoch': 3.0}


In [None]:
"""
わかりやすかった、chatgptのｓ悦明

outputs.last_hidden_state =

[
  # 文1 (Batch 0):
  [
    [0.1, 0.3, 0.9, 0.2],   # Token 1
    [0.4, 0.1, 0.2, 0.8],   # Token 2
    [0.6, 0.4, 0.5, 0.1],   # Token 3
    [0.0, 0.7, 0.3, 0.4],   # Token 4
    [0.2, 0.1, 0.6, 0.9],   # Token 5
  ],

  # 文2 (Batch 1):
  [
    [0.9, 0.3, 0.4, 0.2],
    [0.5, 0.8, 0.1, 0.4],
    [0.7, 0.2, 0.6, 0.3],
    [0.1, 0.9, 0.7, 0.5],
    [0.2, 0.4, 0.3, 0.6],
  ]
]

torch.Size([2, 5, 4])

ここでMax-Poolingを行う(dim=1)と、

文1の場合
次元	値	最大値
第1成分	0.1, 0.4, 0.6, 0.0, 0.2	→ 0.6
第2成分	0.3, 0.1, 0.4, 0.7, 0.1	→ 0.7
第3成分	0.9, 0.2, 0.5, 0.3, 0.6	→ 0.9
第4成分	0.2, 0.8, 0.1, 0.4, 0.9	→ 0.9

Max-Pooling の結果の形状
pooled_output.shape = [B, H] = [2, 4]

"""