In [1]:
import os
import json
import torch
import torch.nn as nn
import numpy as np
from transformers import BertModel, BertConfig, BertTokenizer
from gluonnlp.vocab import BERTVocab

In [2]:
bert_config = {
    'attention_probs_dropout_prob': 0.1,
    'hidden_act': 'gelu',
    'hidden_dropout_prob': 0.1,
    'hidden_size': 768,
    'initializer_range': 0.02,
    'intermediate_size': 3072,
    'max_position_embeddings': 512,
    'num_attention_heads': 12,
    'num_hidden_layers': 12,
    'type_vocab_size': 2,
    'vocab_size': 8002
}

In [4]:
with open('../.kobert/kobert-8002-config.json', 'w') as f:
    json.dump(bert_config, f, indent=4)

In [3]:
bert_model = BertModel(config=BertConfig.from_dict(bert_config))
assert isinstance(bert_model, nn.Module)

In [4]:
# Load weights
model_file = '../.kobert/pytorch_kobert_2439f391a6.params'
assert os.path.exists(model_file)

bert_model.load_state_dict(torch.load(model_file))

<All keys matched successfully>

In [5]:
# load dictionary
vocab_file = '../.kobert/kobertvocab_f38b8a4d6d.json'
assert os.path.exists(vocab_file)

with open(vocab_file, 'rt') as f:
    vocab = json.load(f)
    print('Available keys of vocab dictionary:\n- ', end='')
    print(*list(vocab.keys()), sep='\n- ')

Available keys of vocab dictionary:
- idx_to_token
- token_to_idx
- reserved_tokens
- unknown_token
- padding_token
- bos_token
- eos_token
- mask_token
- sep_token
- cls_token


In [6]:
# gluonnlp tokenizer
gluon_tokenizer = BERTVocab.from_json(open(vocab_file, 'rt').read())
print(f"Vocab size (gluonnlp, BERT): {len(gluon_tokenizer.idx_to_token)}")

Vocab size (gluonnlp, BERT): 8002


In [7]:
for k, v in vocab.items():
    if k in ['idx_to_token', 'token_to_idx']:
        print(f"{k:<16}: {len(v)}")
        continue
    print(f"{k:<16}: {v}")

idx_to_token    : 8002
token_to_idx    : 8002
reserved_tokens : ['[MASK]', '[SEP]', '[CLS]']
unknown_token   : [UNK]
padding_token   : [PAD]
bos_token       : None
eos_token       : None
mask_token      : [MASK]
sep_token       : [SEP]
cls_token       : [CLS]


In [8]:
# Write to vocab to file (one token per line; huggingface format)
vocab_file_huggingface = '../.kobert/kobert_vocab_huggingface_format.txt'
with open(vocab_file_huggingface, 'wt', encoding='utf-8') as f:
    f.write('\n'.join(vocab.get('idx_to_token')))

In [9]:
# Instantiate BertTokenizer
tokenizer_configs = {
    'vocab_file': vocab_file_huggingface,
    'do_lower_case': False,
    'do_basic_tokenize': True,
    'never_split': None,
    'unk_token': '[UNK]',
    'sep_token': '[SEP]',
    'pad_token': '[PAD]',
    'cls_token': '[CLS]',
    'mask_token': '[MASK]',
    'tokenize_chinese_chars': False,
}

bert_tokenizer = BertTokenizer(**tokenizer_configs)
print(f'Vocab size (huggingface, BERT): {bert_tokenizer.vocab_size}')

Vocab size (huggingface, BERT): 8002


In [10]:
# Gluonnlp vs. Huggingface
for i in np.random.randint(0, 8002, 10):
    gluon_ver = gluon_tokenizer.idx_to_token[i]
    huggingface_ver = bert_tokenizer.ids_to_tokens[i]
    print(f"Index {i:>04}: {gluon_ver} vs. {huggingface_ver}")

Index 4638: ▁카리스마 vs. ▁카리스마
Index 0876: ▁거짓말 vs. ▁거짓말
Index 1661: ▁대중 vs. ▁대중
Index 0635: ▁A vs. ▁A
Index 3642: ▁의한 vs. ▁의한
Index 7047: 위는 vs. 위는
Index 1680: ▁대학 vs. ▁대학
Index 4375: ▁진행될 vs. ▁진행될
Index 1464: ▁넘는 vs. ▁넘는
Index 2972: ▁시리아 vs. ▁시리아


In [11]:
# Replace '_' with '#' ('__' with '##')
tokens = []
for token in vocab['idx_to_token']:
    if token == '▁':
        tokens.append(token)
    else:
        token = token.replace('▁', '##')
        tokens.append(token)

In [12]:
new_vocab_file_huggingface = '../.kobert/new_kobert_vocab_huggingface_format.txt'
with open(new_vocab_file_huggingface, 'wt', encoding='utf-8') as f:
    f.write('\n'.join(tokens))

In [13]:
# Reload BERT tokenizer
tokenizer_configs['vocab_file'] = new_vocab_file_huggingface
bert_tokenizer = BertTokenizer(**tokenizer_configs)

In [14]:
# Gluonnlp vs. Huggingface (AGAIN!)
for i in np.random.randint(0, 8002, 10):
    gluon_ver = gluon_tokenizer.idx_to_token[i]
    huggingface_ver = bert_tokenizer.ids_to_tokens[i]
    print(f"Index {i:>04}: {gluon_ver} vs. {huggingface_ver}")

Index 7499: 칸 vs. 칸
Index 1318: ▁김동 vs. ##김동
Index 6006: 라도 vs. 라도
Index 7399: 참여 vs. 참여
Index 4439: ▁창단 vs. ##창단
Index 2098: ▁무대 vs. ##무대
Index 0854: ▁개최 vs. ##개최
Index 4454: ▁채무 vs. ##채무
Index 7080: 으로서 vs. 으로서
Index 1810: ▁들어오 vs. ##들어오


In [15]:
# Test
text = "그런데 왜 결과는 이런 식이야?"
tokenized_text = bert_tokenizer.tokenize(text)
print(f"Original text:\n-> {text}")
print(f"Tokenized text:\n-> {tokenized_text}")

Original text:
-> 그런데 왜 결과는 이런 식이야?
Tokenized text:
-> ['그런', '##데', '왜', '[UNK]', '[UNK]', '식', '##이', '##야', '?']


In [16]:
# Drill down
for token in bert_tokenizer.basic_tokenizer.tokenize(text):
    for sub_token in bert_tokenizer.wordpiece_tokenizer.tokenize(token):
        print(sub_token, end=' ')

그런 ##데 왜 [UNK] [UNK] 식 ##이 ##야 ? 

In [17]:
# Write tokenizer configs to json file
with open('../.kobert/kobert_tokenizer_config.json', 'w') as f:
    json.dump(tokenizer_configs, f, indent=4)

In [18]:
# Save BERT model using .save_pretrained -> for later usage
save_dir = '../.kobert/pretrained/'
os.makedirs(save_dir, exist_ok=True)
bert_model.save_pretrained(save_directory=save_dir)