In [1]:
# Directory where the pretrained model files are stored. Replace the
# placeholder with your local path; the cells below read config.json,
# pytorch_model.bin and vocab.txt from this directory.
BERT_BASE_DIR = '/path/to/mecab-ipadic-bpe-32k/do-whole-word-mask'

In [2]:
import torch
from transformers import BertForMaskedLM
from tokenization import MecabBertTokenizer, MecabCharacterBertTokenizer

I1106 11:10:32.632133 4675833280 file_utils.py:32] TensorFlow version 2.0.0 available.
I1106 11:10:32.632836 4675833280 file_utils.py:39] PyTorch version 1.3.0.post2 available.
I1106 11:10:32.935988 4675833280 modeling_xlnet.py:194] Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .


In [3]:
# Load the masked-language-model configuration and weights from BERT_BASE_DIR.
model = BertForMaskedLM.from_pretrained(BERT_BASE_DIR)

I1106 11:10:35.046508 4675833280 configuration_utils.py:148] loading configuration file /Users/m-suzuki/work/japanese-bert/jawiki-20190901/mecab-ipadic-bpe-32k/do-whole-word-mask/config.json
I1106 11:10:35.047818 4675833280 configuration_utils.py:168] Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 32000
}

I1106 11:10:35.049305 4675833280 modeling_utils.py:334] loading weights file /Users/m-suzuki/work/japanese-bert/jawiki-20190901/mecab-ipadic-bpe-32k/do-whole-word-mask/pytorch_model.bin
I1106

In [4]:
# Build the tokenizer from the vocabulary file stored in BERT_BASE_DIR.
tokenizer = MecabBertTokenizer(vocab_file=f'{BERT_BASE_DIR}/vocab.txt')
# Use MecabCharacterBertTokenizer instead for char-4k models

In [5]:
# Example sentence with a single [MASK] slot for the model to fill in
# (roughly: "For breakfast I baked/grilled [MASK] and ate it.").
text = '朝食に[MASK]を焼いて食べました。'

In [6]:
# Tokenize the sentence and convert it to vocabulary ids. With
# add_special_tokens=True the sequence is wrapped in [CLS] ... [SEP].
token_ids = tokenizer.encode(text, add_special_tokens=True)

In [7]:
# Inspect the encoded ids (a plain Python list at this point).
token_ids

[2, 25965, 7, 4, 11, 16878, 16, 2949, 3913, 10, 8, 3]

In [8]:
# Map the ids back to token strings to check how the sentence was split.
tokens = tokenizer.convert_ids_to_tokens(token_ids)

In [9]:
# Inspect the tokens; [MASK] should appear as a single token.
tokens

['[CLS]', '朝食', 'に', '[MASK]', 'を', '焼い', 'て', '食べ', 'まし', 'た', '。', '[SEP]']

In [10]:
# Convert the id list into an integer tensor with a leading batch dimension
# of 1, i.e. shape (1, sequence_length), which is what the model call expects.
# as_tensor + reshape keeps this cell idempotent: re-running it on the
# already-converted tensor returns the same (1, sequence_length) tensor
# instead of wrapping it in yet another dimension.
token_ids = torch.as_tensor(token_ids).reshape(1, -1)

In [11]:
# Inspect the batched tensor of ids that will be fed to the model.
token_ids

tensor([[    2, 25965,     7,     4,    11, 16878,    16,  2949,  3913,    10,
             8,     3]])

In [12]:
# Run the masked-LM forward pass. This is inference only, so disable autograd
# to avoid building a graph and holding on to activations.
with torch.no_grad():
    # Take element [0] (the per-position vocabulary scores) by index rather
    # than by single-element tuple unpacking, so the cell keeps working when
    # the model returns additional outputs alongside the scores.
    predictions = model(token_ids)[0]

# Ids of the 10 highest-scoring vocabulary entries at every position.
# `.indices` is used instead of unpacking into `_`, which would shadow
# IPython's "last output" variable for the rest of the session.
top10_pred_ids = torch.topk(predictions, k=10, dim=2).indices

# One line per input position: the token actually present in the input,
# followed by the model's top-10 predicted tokens for that position.
for correct_id, pred_ids in zip(token_ids[0], top10_pred_ids[0]):
    correct_token = tokenizer.convert_ids_to_tokens([correct_id.item()])
    pred_tokens = tokenizer.convert_ids_to_tokens(pred_ids.tolist())
    print(correct_token, pred_tokens)

['[CLS]'] ['た', '」', '朝食', 'て', 'お', '、', 'まし', 'です', '朝', '。']
['朝食'] ['朝食', '朝', '夕食', '早朝', '最初', '午後', '昼', '代わり', '食事', '夕方']
['に'] ['に', 'は', 'として', 'の', '用', 'を', '中', '後', 'で', 'にかけて']
['[MASK]'] ['パン', '肉', 'ご飯', '豚肉', 'ハム', '野菜', '牛肉', '[UNK]', 'ケーキ', 'バター']
['を'] ['を', '##パン', 'は', '##焼き', 'に', '##肉', 'パン', '##を', '##ト', 'で']
['焼い'] ['焼い', '焼く', '焼き', '作っ', 'し', '燃やし', '巻い', '使っ', '揚げ', '買っ']
['て'] ['て', 'で', '##て', 'ながら', 'たら', 'を', 'から', 'って', 'た', 'に']
['食べ'] ['食べ', 'い', '飲み', 'おり', '待ち', '食べる', '食', '食事', '見', 'もらい']
['まし'] ['まし', 'でし', 'ましょ', 'ませ', 'ます', 'です', 'し', 'られ', 'て', 'でしょ']
['た'] ['た', 'て', '」', 'ます', 'まし', 'し', 'たい', 'たら', 'ん', 'たり']
['。'] ['。', '」', '!', 'が', '...。', '、', 'から', ':', '......', 'よ']
['[SEP]'] ['。', 'て', '、', 'に', '(', 'た', 'は', '」', 'し', ')。']
