<a href="https://colab.research.google.com/github/ftnext/practice-dl-nlp/blob/master/bert_exercise/transformers_examples/20220518_legacy_ner_ner_wikipedia_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

ref: https://github.com/stockmarkteam/bert-book/blob/master/Chapter8.ipynb

In [1]:
# Clone the transformers repository, check out tag v4.19.2 and install the library from
# that checkout with the "ja" extra. Later cells run scripts from
# transformers/examples/legacy/token-classification inside this checkout.
!git clone https://github.com/huggingface/transformers.git
!cd transformers && git checkout v4.19.2&& pip install .[ja]

Cloning into 'transformers'...
remote: Enumerating objects: 95610, done.[K
remote: Counting objects: 100% (48/48), done.[K
remote: Compressing objects: 100% (42/42), done.[K
remote: Total 95610 (delta 17), reused 4 (delta 2), pack-reused 95562[K
Receiving objects: 100% (95610/95610), 88.71 MiB | 24.39 MiB/s, done.
Resolving deltas: 100% (70293/70293), done.
Note: checking out 'v4.19.2'.

You are in 'detached HEAD' state. You can look around, make experimental
changes and commit them, and you can discard any commits you make in this
state without impacting any branches by performing another checkout.

If you want to create a new branch to retain commits you create, you may
do so (now or later) by using -b with the checkout command again. Example:

  git checkout -b <new-branch-name>

HEAD is now at 6e535425f Release: v4.19.2
Processing /content/transformers
[33m  DEPRECATION: A future pip version will change local packages to be built in-place without first copying to a temporary d

In [2]:
!pip install seqeval filelock conllu

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l[K     |███████▌                        | 10 kB 24.5 MB/s eta 0:00:01[K     |███████████████                 | 20 kB 8.0 MB/s eta 0:00:01[K     |██████████████████████▌         | 30 kB 3.9 MB/s eta 0:00:01[K     |██████████████████████████████  | 40 kB 5.0 MB/s eta 0:00:01[K     |████████████████████████████████| 43 kB 934 kB/s 
Collecting conllu
  Downloading conllu-4.4.2-py2.py3-none-any.whl (15 kB)
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16180 sha256=53f94305dcfbd872ae9249d374fb9e9e25497cf404244e1ccf19b85173460232
  Stored in directory: /root/.cache/pip/wheels/05/96/ee/7cac4e74f3b19e3158dce26a20a1c86b3533c43ec72a549fd7
Successfully built seqeval
Installing collected packages: seqeval, conllu
Successfully installed conllu-4.4.2 seqeval-1.2.2


In [3]:
import json
import random

from transformers import BertJapaneseTokenizer

In [4]:
# Tokenizer of the pretrained Japanese BERT checkpoint; convert_germeval_format uses it
# to split text into tokens.
_PRETRAINED_NAME = "cl-tohoku/bert-base-japanese-whole-word-masking"
TOKENIZER = BertJapaneseTokenizer.from_pretrained(_PRETRAINED_NAME)

Downloading:   0%|          | 0.00/252k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/110 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/479 [00:00<?, ?B/s]

In [5]:
def distinguish_entities(text, entities):
    """Split ``text`` at named-entity boundaries (no morphological analysis).

    Entities are processed in order of their span start, so they may be
    passed in any order.

    Args:
        text: The sentence to split.
        entities: Iterable of dicts, each with a ``"span"`` entry
            ``[start, end]`` (character offsets, ``end`` exclusive) and a
            ``"type"`` entry holding the entity label.

    Returns:
        A list of ``{"text": ..., "label": ...}`` dicts in text order.
        Segments outside any entity get the label ``"O"``; empty segments
        are dropped.

    >>> distinguish_entities("ロレム・イプサム", [])
    [{'text': 'ロレム・イプサム', 'label': 'O'}]
    >>> entities1 = [{"name": "B大学", "span": [0, 3], "type": "法人名"}]
    >>> distinguish_entities("B大学に入学した。", entities1)
    [{'text': 'B大学', 'label': '法人名'}, {'text': 'に入学した。', 'label': 'O'}]
    >>> entities2 = [{"name": "A", "span": [0, 1], "type": "人名"}, {"name": "B大学", "span": [4, 7], "type": "法人名"}]
    >>> distinguish_entities("AさんはB大学を卒業した。", entities2)
    [{'text': 'A', 'label': '人名'}, {'text': 'さんは', 'label': 'O'}, {'text': 'B大学', 'label': '法人名'}, {'text': 'を卒業した。', 'label': 'O'}]
    >>> distinguish_entities("AさんはB大学を卒業した。", list(reversed(entities2)))
    [{'text': 'A', 'label': '人名'}, {'text': 'さんは', 'label': 'O'}, {'text': 'B大学', 'label': '法人名'}, {'text': 'を卒業した。', 'label': 'O'}]
    """
    splitted = []
    position = 0
    # Walk the entities left to right; the slicing below relies on `position`
    # never moving backwards, which only holds for span-ordered entities.
    for entity in sorted(entities, key=lambda e: e["span"][0]):
        start = entity["span"][0]
        end = entity["span"][1]
        label = entity["type"]
        # Text between the previous entity (or the beginning) and this one.
        splitted.append({"text": text[position:start], "label": "O"})
        splitted.append({"text": text[start:end], "label": label})
        position = end
    # Remaining text after the last entity.
    splitted.append({"text": text[position:], "label": "O"})
    # Adjacent entities or an entity at either end produce empty segments.
    return [s for s in splitted if s["text"]]

In [6]:
def tokenize_splitted(splitted, tokenizer):
    """Tokenize each segment and return the tokens with their BIO tags.

    Args:
        splitted: List of ``{"text": ..., "label": ...}`` segments, where the
            label ``"O"`` marks text outside any named entity.
        tokenizer: Object with a ``tokenize(text)`` method returning a list
            of token strings.

    Returns:
        A ``(tokens, labels)`` tuple of equally long lists. The first token of
        an entity segment is tagged ``B-<label>``, its remaining tokens
        ``I-<label>``, and all other tokens ``O``. Segments for which the
        tokenizer returns no tokens contribute nothing.

    >>> splitted = [{'text': 'A', 'label': '人名'}, {'text': 'さんは', 'label': 'O'}, {'text': 'B大学', 'label': '法人名'}, {'text': 'を卒業した。', 'label': 'O'}]
    >>> tokenize_splitted(splitted, TOKENIZER)
    (['A', 'さん', 'は', 'B', '大学', 'を', '卒業', 'し', 'た', '。'], ['B-人名', 'O', 'O', 'B-法人名', 'I-法人名', 'O', 'O', 'O', 'O', 'O'])
    """
    tokens, labels = [], []
    for s in splitted:
        part_tokens = tokenizer.tokenize(s["text"])
        if not part_tokens:
            # Nothing to tag (e.g. a whitespace-only segment). Without this
            # guard, assigning the B- tag to index 0 would raise IndexError.
            continue
        label = s["label"]
        if label == "O":
            part_labels = [label] * len(part_tokens)
        else:  # named entity: B- on the first token, I- on the rest
            part_labels = [f"B-{label}"] + [f"I-{label}"] * (len(part_tokens) - 1)
        tokens.extend(part_tokens)
        labels.extend(part_labels)
    return tokens, labels

In [7]:
def convert_germeval_format(obj):
    """Convert one ner-wikipedia-dataset record into GermEval 2014 style lines.

    Yields one ``"<token> <tag>"`` string per token of ``obj["text"]``,
    followed by an empty string that terminates the sentence.

    >>> list(convert_germeval_format({"curid": "345", "text": "AさんはB大学を卒業した。", "entities": [{"name": "A", "span": [0, 1], "type": "人名"}, {"name": "B大学", "span": [4, 7], "type": "法人名"}]}))
    ['A B-人名', 'さん O', 'は O', 'B B-法人名', '大学 I-法人名', 'を O', '卒業 O', 'し O', 'た O', '。 O', '']
    """
    segments = distinguish_entities(obj["text"], obj["entities"])
    # tokenize_splitted returns (tokens, labels); pair them up position by position.
    token_tag_pairs = zip(*tokenize_splitted(segments, TOKENIZER))
    yield from (f"{tok} {tag}" for tok, tag in token_tag_pairs)
    # Sentence separator.
    yield ""

In [8]:
def convert_dataset(dataset):
    """Lazily yield the GermEval-format rows of every record in ``dataset``, in order."""
    for record in dataset:
        for row in convert_germeval_format(record):
            yield row

In [9]:
def save_dataset(dataset, path):
    """Write every row of ``dataset`` to ``path``, one row per line.

    Args:
        dataset: Iterable of rows; each row is written followed by a newline.
        path: Destination file path. An existing file is overwritten.
    """
    # Rows contain Japanese text, so encode as UTF-8 explicitly instead of
    # relying on the platform's default encoding.
    with open(path, "w", encoding="utf8") as f:
        f.writelines(f"{row}\n" for row in dataset)

In [10]:
!mkdir -p /tmp/ner-wikipedia-dataset/{source,preprocessed}

In [11]:
!curl --output /tmp/ner-wikipedia-dataset/ner.json \
  https://raw.githubusercontent.com/stockmarkteam/ner-wikipedia-dataset/v2.0/ner.json

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 3947k  100 3947k    0     0  11.6M      0 --:--:-- --:--:-- --:--:-- 11.6M


In [12]:
# Parse the downloaded JSON file into `dataset`.
with open("/tmp/ner-wikipedia-dataset/ner.json", encoding="utf8") as ner_file:
    dataset = json.loads(ner_file.read())

In [13]:
len(dataset)

5343

In [14]:
# Shuffle a copy rather than `dataset` itself: shuffling in place would reshuffle the
# already-shuffled list when this cell is re-run and silently change the split.
# Seeding right before the shuffle keeps the permutation reproducible.
random.seed(42)
dataset_shuffled = list(dataset)
random.shuffle(dataset_shuffled)

# 60% train / 20% validation / the remainder (about 20%) test.
n = len(dataset_shuffled)
n_train = int(n * 0.6)
n_val = int(n * 0.2)
dataset_train = dataset_shuffled[:n_train]
dataset_val = dataset_shuffled[n_train : n_train + n_val]
dataset_test = dataset_shuffled[n_train + n_val :]
print(len(dataset_train), len(dataset_val), len(dataset_test))

3205 1068 1070


In [15]:
# Convert each split to GermEval format and write it under source/ as
# train.txt / dev.txt / test.txt, the file names the preprocessing cells read.
for split_name, split_dataset in (
    ("train", dataset_train),
    ("dev", dataset_val),
    ("test", dataset_test),
):
    save_dataset(
        convert_dataset(split_dataset),
        f"/tmp/ner-wikipedia-dataset/source/{split_name}.txt",
    )

In [16]:
!wc -l /tmp/ner-wikipedia-dataset/source/*.txt

  38947 /tmp/ner-wikipedia-dataset/source/dev.txt
  38093 /tmp/ner-wikipedia-dataset/source/test.txt
 114565 /tmp/ner-wikipedia-dataset/source/train.txt
 191605 total


In [17]:
# Run the example's scripts/preprocess.py on the train split (arguments: input file,
# model name, 128) and capture its stdout as preprocessed/train.txt.
# NOTE(review): 128 is presumably the maximum sequence length, matching
# --max_seq_length in the run_ner.py cell -- confirm in preprocess.py.
!cd transformers/examples/legacy/token-classification && python scripts/preprocess.py \
  /tmp/ner-wikipedia-dataset/source/train.txt cl-tohoku/bert-base-japanese-whole-word-masking 128 > /tmp/ner-wikipedia-dataset/preprocessed/train.txt

In [18]:
# Run scripts/preprocess.py on the dev split with the same model name and 128 argument;
# its stdout is written to preprocessed/dev.txt.
!cd transformers/examples/legacy/token-classification && python scripts/preprocess.py \
  /tmp/ner-wikipedia-dataset/source/dev.txt cl-tohoku/bert-base-japanese-whole-word-masking 128 > /tmp/ner-wikipedia-dataset/preprocessed/dev.txt

In [19]:
# Run scripts/preprocess.py on the test split with the same model name and 128 argument;
# its stdout is written to preprocessed/test.txt.
!cd transformers/examples/legacy/token-classification && python scripts/preprocess.py \
  /tmp/ner-wikipedia-dataset/source/test.txt cl-tohoku/bert-base-japanese-whole-word-masking 128 > /tmp/ner-wikipedia-dataset/preprocessed/test.txt

In [20]:
# My understanding is that preprocess.py is meant to drop certain vocabulary, yet the
# counts went up instead of down (TODO).
# NOTE(review): `wc -l` counts lines, not vocabulary; the extra lines are presumably
# blank separators that preprocess.py inserts when it splits sentences exceeding the
# length limit -- confirm against scripts/preprocess.py.
!wc -l /tmp/ner-wikipedia-dataset/preprocessed/*.txt

  38957 /tmp/ner-wikipedia-dataset/preprocessed/dev.txt
  38099 /tmp/ner-wikipedia-dataset/preprocessed/test.txt
 114591 /tmp/ner-wikipedia-dataset/preprocessed/train.txt
 191647 total


In [21]:
# Build labels.txt: take the 2nd space-separated field (the tag) from all three
# preprocessed splits, drop the blank lines, and keep the sorted unique values.
!cd /tmp/ner-wikipedia-dataset/preprocessed && cat train.txt dev.txt test.txt | cut -d " " -f 2 | grep -v "^$"| sort | uniq > labels.txt

In [22]:
!cat /tmp/ner-wikipedia-dataset/preprocessed/labels.txt

B-人名
B-その他の組織名
B-イベント名
B-地名
B-政治的組織名
B-施設名
B-法人名
B-製品名
I-人名
I-その他の組織名
I-イベント名
I-地名
I-政治的組織名
I-施設名
I-法人名
I-製品名
O


In [23]:
!wc -l /tmp/ner-wikipedia-dataset/preprocessed/labels.txt

17 /tmp/ner-wikipedia-dataset/preprocessed/labels.txt


In [24]:
# Fine-tune the pretrained Japanese BERT with the legacy run_ner.py on the preprocessed
# data for 5 epochs (--do_train), then evaluate (--do_eval) and predict (--do_predict).
# Results are written to /tmp/ner-wikipedia. A checkpoint is saved every 100 steps and
# --save_total_limit 2 limits how many checkpoints are kept.
!cd transformers/examples/legacy/token-classification && python run_ner.py \
--data_dir /tmp/ner-wikipedia-dataset/preprocessed \
--labels /tmp/ner-wikipedia-dataset/preprocessed/labels.txt \
--model_name_or_path cl-tohoku/bert-base-japanese-whole-word-masking \
--output_dir /tmp/ner-wikipedia \
--max_seq_length 128 \
--num_train_epochs 5 \
--per_device_train_batch_size 32 \
--logging_steps 50 \
--save_steps 100 \
--save_total_limit 2 \
--per_device_eval_batch_size 256 \
--seed 1 \
--do_train \
--do_eval \
--do_predict

05/17/2022 23:39:56 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=True,
do_train=True,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=IntervalStrategy.NO,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_min_num_params=0,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=HubStrategy.EVERY_SAVE,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
include_inputs_for_metrics

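気づき: 上の出力中に「鉄 # ### 腕 アトム」というトークン列がある 🤔（サブワード `##腕` を1単語として train.txt に書き出したため、学習スクリプト側で再度トークナイズされ `#` `###` `腕` に分かれたものと思われる。要確認）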

In [25]:
!ls -l /tmp/ner-wikipedia

total 430584
drwxr-xr-x 2 root root      4096 May 17 23:45 checkpoint-400
drwxr-xr-x 2 root root      4096 May 17 23:46 checkpoint-500
-rw-r--r-- 1 root root      1988 May 17 23:46 config.json
-rw-r--r-- 1 root root       269 May 17 23:46 eval_results.txt
-rw-r--r-- 1 root root 440229745 May 17 23:46 pytorch_model.bin
drwxr-xr-x 3 root root      4096 May 17 23:40 runs
-rw-r--r-- 1 root root       112 May 17 23:46 special_tokens_map.json
-rw-r--r-- 1 root root    388470 May 17 23:47 test_predictions.txt
-rw-r--r-- 1 root root       255 May 17 23:47 test_results.txt
-rw-r--r-- 1 root root       474 May 17 23:46 tokenizer_config.json
-rw-r--r-- 1 root root      3183 May 17 23:46 training_args.bin
-rw-r--r-- 1 root root    257706 May 17 23:46 vocab.txt


In [26]:
!cat /tmp/ner-wikipedia/eval_results.txt

eval_loss = 0.1069055050611496
eval_accuracy_score = 0.9752633385253042
eval_precision = 0.8606060606060606
eval_recall = 0.8994038748137109
eval_f1 = 0.8795773364911641
eval_runtime = 10.3723
eval_samples_per_second = 103.931
eval_steps_per_second = 0.482
epoch = 5.0


In [27]:
!cat /tmp/ner-wikipedia/test_results.txt

test_loss = 0.1237124502658844
test_accuracy_score = 0.9729627528833428
test_precision = 0.845782237086585
test_recall = 0.8821705426356589
test_f1 = 0.8635932460633656
test_runtime = 10.4221
test_samples_per_second = 103.242
test_steps_per_second = 0.48


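ここで Google ドライブをマウントする（例: `from google.colab import drive; drive.mount("/content/drive")`）。以降のセルは `drive/MyDrive` が存在することを前提とする。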

In [28]:
!mkdir -p drive/MyDrive/nlp/20220518/ner-wikipedia/

In [29]:
# Copy the result files to the mounted Drive folder. The `*.*` glob only matches names
# containing a dot, so the checkpoint-* and runs directories are not copied.
!cp /tmp/ner-wikipedia/*.* drive/MyDrive/nlp/20220518/ner-wikipedia/