<a href="https://colab.research.google.com/github/eel-eel-eel/ric1340/blob/main/ch05_03_ner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 環境構築

Google Driveをマウント
（データセットや学習済みモデルを格納する）

パスワードを求められた場合はリンクをクリックし、Googleアカウントにログインして表示された文字列を入力する。

In [None]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


データセットのダウンロード

In [None]:
!mkdir -p /content/drive/MyDrive/bert/5_2_wikipedia_ner

In [None]:
cd /content/drive/MyDrive/bert/5_2_wikipedia_ner

/content/drive/MyDrive/bert/5_2_wikipedia_ner


In [None]:
import os

if not os.path.exists("ner-wikipedia-dataset/"):
  !git clone https://github.com/stockmarkteam/ner-wikipedia-dataset.git

## データセットの前処理

In [None]:
import json

file_path = "ner-wikipedia-dataset/ner.json"

with open(file_path) as f:
  list_data = json.load(f)

BIO形式のデータセットに変換

In [None]:
!pip install transformers[ja]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers[ja]
  Downloading transformers-4.21.1-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 5.0 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 13.5 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 3.1 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 56.6 MB/s 
Collecting unidic>=1.0.2
  Downloading unidic-1.1.0.tar.gz (7.7 kB)
Collecting fugashi>=1.0
  Downloading fugashi-1.1.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014

In [None]:
from transformers import BertJapaneseTokenizer
tokenizer = BertJapaneseTokenizer.from_pretrained(
    "cl-tohoku/bert-base-japanese-whole-word-masking",
    )

Downloading vocab.txt:   0%|          | 0.00/252k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/110 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/479 [00:00<?, ?B/s]

In [None]:
list_text = [data["text"] for data in list_data]
list_tokens = [tokenizer.tokenize(text) for text in list_text]
list_entities = [data["entities"] for data in list_data]

In [None]:
# cl-tohoku/bert-base-japanese-whole-word-maskingのモデルは最大512トークンまで対応しているが、
# 学習時のGPUメモリ消費を抑えるため256としている
n_token = 256

# encode後のトークンは、特殊トークン（{CLS], {SEP]など）や特殊文字（##）が挿入されることに注意
# spanで表されているラベルと文字数ベースで位置ずれを起こすため、BIO形式に変換する際に補正する必要がある
list_text_id =  [tokenizer.encode(text, truncation=True, padding='max_length', max_length=n_token) for text in list_text]
list_tokens = [tokenizer.convert_ids_to_tokens(encode) for encode in list_text_id]

In [None]:
def is_in_span(idx, span):
  return span[0] <= idx and idx < span[1]

In [None]:
from collections import defaultdict
list_bio =[]

label2id = defaultdict(lambda :len(label2id))
_ = label2id["O"]

for text, tokens, entities in zip(list_text, list_tokens, list_entities):
  bio = ["O"] * len(tokens)

  for entity in entities:
    cnt = 0
    begin_flg = True
    label = entity["type"]

    for i, tok in enumerate(tokens):
      if tok == "[CLS]" or tok == "[SEP]" or tok == "[PAD]":
        continue
      elif is_in_span(cnt, entity["span"]):
        if begin_flg:
          bio[i] = f"B-{label}"
          begin_flg = False
          _ = label2id[f"B-{label}"]
          _ = label2id[f"I-{label}"]
        else:
          bio[i] = f"I-{label}"

      cnt += len(tok.replace("##", ""))

  list_bio.append(bio)

id2label = {v:k for k,v in label2id.items()}

In [None]:
list_bio_id = [[label2id[label] for label in bio] for bio in list_bio]

In [None]:
# BIO形式に変換できていることの確認
for dec, bio, _ in zip(list_tokens[0], list_bio[0], range(15)):
  print(f"{dec}\t{bio}")

[CLS]	O
SP	B-その他の組織名
##R	I-その他の組織名
##i	I-その他の組織名
##N	I-その他の組織名
##GS	I-その他の組織名
と	O
最も	O
仲	O
の	O
良い	O
ライバル	O
グループ	O
。	O
[SEP]	O


In [None]:
len(list_bio)

5343

In [None]:
import torch

# 使用デバイスにGPUを設定
# 以下のような出力が出ていれば正常に設定ができている
# device(type='cuda', index=0)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [None]:
from torch.utils.data import Dataset, DataLoader

class NERDataset(Dataset):
  def __init__(self, texts_id, bios_id, is_test=False):
    self.texts_id = texts_id
    self.bios_id = bios_id
    self.is_test = is_test

  def __getitem__(self, idx):
    data = {'input_ids': torch.tensor(self.texts_id[idx], device=device)}
    if not self.is_test:
      data['label'] = torch.tensor(self.bios_id[idx], device=device)
    return data

  def __len__(self):
    return len(self.bios_id)

In [None]:
from sklearn.model_selection import train_test_split
n_test, n_valid = int(len(list_bio) * 0.2), int(len(list_bio) * 0.1)

list_text_id_train, list_text_id_test, list_bio_id_train, list_bio_id_test = \
    train_test_split(list_text_id, list_bio_id, test_size=n_test, random_state=0)
list_text_id_train, list_text_id_valid, list_bio_id_train, list_bio_id_valid = \
    train_test_split(list_text_id_train, list_bio_id_train, test_size=n_valid, random_state=0)

ds_train = NERDataset(list_text_id_train, list_bio_id_train)
ds_valid = NERDataset(list_text_id_valid, list_bio_id_valid)
ds_test = NERDataset(list_text_id_test, list_bio_id_test, is_test=True)

In [None]:
print(len(ds_train), len(ds_valid), len(ds_test))

3741 534 1068


In [None]:
def padding_fn(batch):
    x = torch.nn.utils.rnn.pad_sequence(batch, batch_first=True)
    return x

##固有表現抽出モデルの作成

In [None]:
from transformers import BertForTokenClassification
model = BertForTokenClassification.from_pretrained(
    'cl-tohoku/bert-base-japanese-whole-word-masking',
    id2label=id2label,
    label2id=label2id
    )

model.to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/424M [00:00<?, ?B/s]

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForTokenClassification: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the m

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [None]:
from transformers import Trainer, TrainingArguments
!mkdir -p /content/drive/MyDrive/bert/5_2_wikipedia_ner/results

In [None]:
training_config = TrainingArguments(
  output_dir = './results',
  num_train_epochs = 1,
  per_device_train_batch_size = 8,
  per_device_eval_batch_size = 8,
  warmup_steps = 500,
  weight_decay = 0.1,
  save_steps = 500,
  do_eval = True,
  eval_steps = 500
)

trainer = Trainer(
    model = model,
    args = training_config,
    tokenizer = tokenizer,
    train_dataset = ds_train,
    eval_dataset = ds_valid
)

In [None]:
!rm -r ./results/*

In [None]:
trainer.train()

***** Running training *****
  Num examples = 3741
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 468


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=468, training_loss=0.17922264490372095, metrics={'train_runtime': 182.5195, 'train_samples_per_second': 20.496, 'train_steps_per_second': 2.564, 'total_flos': 488821865614848.0, 'train_loss': 0.17922264490372095, 'epoch': 1.0})

## テストデータの推論

In [None]:
result = trainer.predict(ds_test)

***** Running Prediction *****
  Num examples = 1068
  Batch size = 8


In [None]:
# 固有表現抽出の評価用ライブラリをインストール
!pip install seqeval

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 1.7 MB/s 
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16180 sha256=c554046529d7f091624c2090ab70bbcae8435e54b50d50e316c003b91fede017
  Stored in directory: /root/.cache/pip/wheels/05/96/ee/7cac4e74f3b19e3158dce26a20a1c86b3533c43ec72a549fd7
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [None]:
import numpy as np

trues =  np.vectorize(lambda x:id2label[x])(ds_test.bios_id).tolist()

preds_id = np.argmax(result.predictions, axis=2)
preds = np.vectorize(lambda x:id2label[x])(preds_id).tolist()

In [None]:
from seqeval.metrics import classification_report
from seqeval.metrics import f1_score

print(f1_score(trues, preds))

print(classification_report(trues, preds))

0.7588136541689983
              precision    recall  f1-score   support

     その他の組織名       0.72      0.66      0.69       237
       イベント名       0.74      0.81      0.78       215
          人名       0.85      0.85      0.85       549
          地名       0.83      0.78      0.81       447
      政治的組織名       0.66      0.70      0.68       263
         施設名       0.77      0.80      0.79       241
         法人名       0.83      0.79      0.81       487
         製品名       0.48      0.48      0.48       252

   micro avg       0.76      0.76      0.76      2691
   macro avg       0.74      0.74      0.74      2691
weighted avg       0.76      0.76      0.76      2691

