# トークンを特徴量にした機械学習による文書分類

In [3]:
!pip install transformers[ja,torch] datasets matplotlib japanize-matplotlib

Collecting transformers[ja,torch]
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m
Collecting japanize-matplotlib
  Downloading japanize-matplotlib-1.1.3.tar.gz (4.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.1/4.1 MB[0m [31m34.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers[ja,torch])
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m26.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers[ja,torch

In [4]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier  # Random Forestのクラス
from transformers import AutoTokenizer
from typing import Tuple
import pandas as pd
from datasets import load_dataset, Dataset, ClassLabel
from sklearn.metrics import precision_score, recall_score


# データセットの読み込み
# train_dataset = load_dataset("llm-book/wrime-sentiment", split="train")
# valid_dataset = load_dataset("llm-book/wrime-sentiment", split="validation")
# train_dataset = load_dataset("shunk031/JGLUE", name="MARC-ja",split="train")
# valid_dataset = load_dataset("shunk031/JGLUE", name="MARC-ja",split="validation")
# CSVファイルからデータを読み込む
train_df = pd.read_csv('train.csv')
valid_df = pd.read_csv('validation.csv')
train_dataset = Dataset.from_pandas(train_df)
valid_dataset = Dataset.from_pandas(valid_df)

# データセットを結合
all_sentences = train_dataset['sentence'] + valid_dataset['sentence']
all_labels = train_dataset['label'] + valid_dataset['label']

# トークナイズと特徴量化
tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-v3")
tokenized_sentences = [tokenizer.tokenize(sentence) for sentence in all_sentences]
tokenized_sentences = [' '.join(tokens) for tokens in tokenized_sentences]

# 特徴量の作成
vectorizer = TfidfVectorizer(max_features=10000, stop_words="english")
X = vectorizer.fit_transform(tokenized_sentences)

# トレーニングデータとバリデーションデータの分割
num_train_samples = len(train_dataset)
X_train = X[:num_train_samples]
X_valid = X[num_train_samples:]
train_labels = all_labels[:num_train_samples]
valid_labels = all_labels[num_train_samples:]

# Random Forestモデルの訓練
clf = RandomForestClassifier()
clf.fit(X_train, train_labels)

# バリデーションデータで予測
valid_predictions = clf.predict_proba(X_valid)

# 正解率の計算

def compute_metrics(eval_pred: Tuple[np.ndarray, np.ndarray]) -> dict[str, float]:
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    # Calculate Precision
    precision = precision_score(labels, predictions, average='macro')  # または average='micro' など適切なオプションを選択してください

    # Calculate Recall
    recall = recall_score(labels, predictions, average='macro')  # または average='micro' など適切なオプションを選択してください

    # Calculate Accuracy
    accuracy = (predictions == labels).mean()

    return {"accuracy": accuracy, "precision": precision, "recall": recall}

metrics_dict = compute_metrics((valid_predictions, valid_labels))
accuracy = metrics_dict["accuracy"]
precision = metrics_dict["precision"]
recall = metrics_dict["recall"]

print("Validation Accuracy:", accuracy)
print("Validation Precision:", precision)
print("Validation Recall:", recall)

Downloading (…)okenizer_config.json:   0%|          | 0.00/251 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/231k [00:00<?, ?B/s]

Validation Accuracy: 0.5898021308980214
Validation Precision: 0.5899037837637419
Validation Recall: 0.5898860785008793


### （参考） XGBoost

In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from transformers import AutoTokenizer
from datasets import load_dataset
from typing import Tuple
import pandas as pd
from datasets import Dataset, DatasetDict

def compute_accuracy(
    eval_pred: Tuple[np.ndarray, np.ndarray]
) -> dict[str, float]:
    """予測ラベルと正解ラベルから正解率を計算"""
    predictions, labels = eval_pred
    # predictionsは各ラベルについてのスコア
    # 最もスコアの高いインデックスを予測ラベルとする
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": (predictions == labels).mean()}

# データセットの読み込み
# train_dataset = load_dataset("llm-book/wrime-sentiment", split="train")
# valid_dataset = load_dataset("llm-book/wrime-sentiment", split="validation")
# train_dataset = load_dataset("shunk031/JGLUE", name="MARC-ja",split="train")
# valid_dataset = load_dataset("shunk031/JGLUE", name="MARC-ja",split="validation")

# # ローカルファイルから読み込み
# CSVファイルからデータを読み込む
train_df = pd.read_csv('train.csv')
valid_df = pd.read_csv('validation.csv')
train_dataset = Dataset.from_pandas(train_df)
valid_dataset = Dataset.from_pandas(valid_df)

# train_dataset = load_dataset('csv', data_files='train.csv', header=0)
# valid_dataset =load_dataset('csv', data_files='validation.csv',header=0)



# データセットを結合
all_sentences = train_dataset['sentence'] + valid_dataset['sentence']
all_labels = train_dataset['label'] + valid_dataset['label']

# Tokenizerのロード
tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-v3")

# トークナイズと特徴量化
tokenized_sentences = [tokenizer.tokenize(sentence) for sentence in all_sentences]
tokenized_sentences = [' '.join(tokens) for tokens in tokenized_sentences]

# 特徴量の作成
vectorizer = TfidfVectorizer(max_features=10000, stop_words="english")
X = vectorizer.fit_transform(tokenized_sentences)

# トレーニングデータとバリデーションデータの分割
num_train_samples = len(train_dataset)
X_train = X[:num_train_samples]
X_valid = X[num_train_samples:]
train_labels = all_labels[:num_train_samples]
valid_labels = all_labels[num_train_samples:]

# XGBoostモデルの訓練
clf = XGBClassifier()
clf.fit(X_train, train_labels)

# バリデーションデータで予測
valid_predictions = clf.predict_proba(X_valid)

# 正解率の計算
accuracy_dict = compute_accuracy((valid_predictions, valid_labels))
accuracy = accuracy_dict["accuracy"]
print("Validation Accuracy:", accuracy)


Validation Accuracy: 0.5928462709284628


###  （参考）  LightGBM

In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from lightgbm import LGBMClassifier  # LightGBMのクラス
from transformers import AutoTokenizer
from datasets import load_dataset
from typing import Tuple

def compute_accuracy(
    eval_pred: Tuple[np.ndarray, np.ndarray]
) -> dict[str, float]:
    """予測ラベルと正解ラベルから正解率を計算"""
    predictions, labels = eval_pred
    # predictionsは各ラベルについてのスコア
    # 最もスコアの高いインデックスを予測ラベルとする
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": (predictions == labels).mean()}

# データセットの読み込み
# train_dataset = load_dataset("llm-book/wrime-sentiment", split="train")
# valid_dataset = load_dataset("llm-book/wrime-sentiment", split="validation")
# train_dataset = load_dataset("shunk031/JGLUE", name="MARC-ja",split="train")
# valid_dataset = load_dataset("shunk031/JGLUE", name="MARC-ja",split="validation")

# データを読み込む
# CSVファイルからデータを読み込む
train_df = pd.read_csv('train.csv')
valid_df = pd.read_csv('validation.csv')
train_dataset = Dataset.from_pandas(train_df)
valid_dataset = Dataset.from_pandas(valid_df)


# データセットを結合
all_sentences = train_dataset['sentence'] + valid_dataset['sentence']
all_labels = train_dataset['label'] + valid_dataset['label']

# Tokenizerのロード
tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-v3")

# トークナイズと特徴量化
tokenized_sentences = [tokenizer.tokenize(sentence) for sentence in all_sentences]
tokenized_sentences = [' '.join(tokens) for tokens in tokenized_sentences]

# 特徴量の作成
vectorizer = TfidfVectorizer(max_features=10000, stop_words="english")
X = vectorizer.fit_transform(tokenized_sentences)

# トレーニングデータとバリデーションデータの分割
num_train_samples = len(train_dataset)
X_train = X[:num_train_samples]
X_valid = X[num_train_samples:]
train_labels = all_labels[:num_train_samples]
valid_labels = all_labels[num_train_samples:]

# LightGBMモデルの訓練
clf = LGBMClassifier()
clf.fit(X_train, train_labels)

# バリデーションデータで予測
valid_predictions = clf.predict_proba(X_valid)

# 正解率の計算
accuracy_dict = compute_accuracy((valid_predictions, valid_labels))
accuracy = accuracy_dict["accuracy"]
print("Validation Accuracy:", accuracy)


[LightGBM] [Info] Number of positive: 743, number of negative: 755
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7526
[LightGBM] [Info] Number of data points in the train set: 1498, number of used features: 64
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.495995 -> initscore=-0.016022
[LightGBM] [Info] Start training from score -0.016022
Validation Accuracy: 0.5852359208523592
