<a href="https://colab.research.google.com/github/ejpark78/codelab/blob/master/bert/huggingface_konlpy/00_huggingface_tokenizers_usage.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!echo "172.19.153.41 nlp-utils" >> /etc/hosts

출처: https://github.com/lovit/huggingface_konlpy/blob/master/tutorials/00_huggingface_tokenizers_usage.ipynb

In [None]:
# git clone
!git clone https://github.com/lovit/huggingface_konlpy.git

In [None]:
!pip install -r huggingface_konlpy/requirements.txt
!pip freeze | grep transformers

In [None]:
!pip install transformers==3.0.2 
!pip install tokenizers==0.8.1

In [1]:
import tokenizers
# Display the installed version; the outputs saved in this notebook were produced with 0.8.1.
tokenizers.__version__

'0.8.1'

In [2]:
from tokenizers import (ByteLevelBPETokenizer,
                        CharBPETokenizer,
                        SentencePieceBPETokenizer,
                        BertWordPieceTokenizer)

# Tiny training corpus shared by the toy examples below.
# The path is relative to the kernel's working directory.
# NOTE(review): no visible cell changes directory after the clone — confirm that
# ./data/ exists where the notebook is executed.
small_corpus = './data/very_small_corpus.txt'

## Bert WordPiece Tokenizer

In [3]:
from tokenizers import BertWordPieceTokenizer

# Case-preserving WordPiece tokenizer trained on the toy corpus.
bert_wordpiece_tokenizer = BertWordPieceTokenizer(lowercase=False)
bert_wordpiece_tokenizer.train(
    files=[small_corpus],
    vocab_size=10,
)

encoding = bert_wordpiece_tokenizer.encode('ABCDE')

# First line: the sub-word pieces; second line: their vocabulary ids.
print(encoding.tokens, encoding.ids, sep='\n')

['A', '##B', '##C', '##D', '##E']
[5, 11, 13, 12, 14]


In [4]:
from tokenizers import BertWordPieceTokenizer

# Tokenizer built with the constructor defaults; every training option is
# passed explicitly so each knob is visible in one place.
bert_wordpiece_tokenizer = BertWordPieceTokenizer()
bert_wordpiece_tokenizer.train(
    files=[small_corpus],
    vocab_size=10,
    min_frequency=1,
    limit_alphabet=1000,
    initial_alphabet=[],
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    show_progress=True,
    wordpieces_prefix="##",
)

vocab = bert_wordpiece_tokenizer.get_vocab()

# List the learned tokens in ascending id order.
print(sorted(vocab, key=vocab.get))

['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', 'a', 'b', 'c', 'd', 'e', 'f', '##c', '##b', '##d', '##e', '##f']


In [5]:
# Encode an upper-case string with the tokenizer built from constructor defaults;
# in the saved output the pieces come back lower-cased.
encoding = bert_wordpiece_tokenizer.encode('ABCDE')

print(encoding.tokens)
print(encoding.ids)

['a', '##b', '##c', '##d', '##e']
[5, 12, 11, 13, 14]


In [6]:
# Retrain with a larger vocabulary budget and force 'g' into the alphabet
# through initial_alphabet (it appears in the vocab printed below).
bert_wordpiece_tokenizer = BertWordPieceTokenizer()
bert_wordpiece_tokenizer.train(
    files=[small_corpus],
    vocab_size=20,
    min_frequency=1,
    initial_alphabet=['g'],
)

vocab = bert_wordpiece_tokenizer.get_vocab()

# List the learned tokens in ascending id order.
print(sorted(vocab, key=vocab.get))

['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', 'a', 'b', 'c', 'd', 'e', 'f', 'g', '##f', '##c', '##b', '##d', '##e', 'ab', 'abc', 'af']


In [7]:
# Same input as before, now encoded with the 20-token vocabulary;
# the saved output uses the merged piece 'abc'.
encoding = bert_wordpiece_tokenizer.encode('ABCDE')

print(encoding.tokens)
print(encoding.ids)

['abc', '##d', '##e']
[18, 15, 16]


In [8]:
# encode_batch() takes a list of strings; the result is indexed per input below.
encodings = bert_wordpiece_tokenizer.encode_batch(['ABCDE', 'abcd'])

print(encodings[0].tokens)
print(encodings[1].tokens)

['abc', '##d', '##e']
['abc', '##d']


In [10]:
import os

# Later cells create their output directories before calling save_model(), but
# nothing creates './model/' itself before this first save. Create it here so the
# cell also works on a fresh checkout or runtime.
os.makedirs('./model/', exist_ok=True)

# Writes '<name>-vocab.txt' into the directory and returns the written paths.
bert_wordpiece_tokenizer.save_model(
    directory = './model/',
    name = 'very_small_bertwordpiece'
)

['./model/very_small_bertwordpiece-vocab.txt']

In [11]:
# Rebuild the tokenizer from the vocab file saved above.
# In the saved output the encoding is now wrapped in [CLS] ... [SEP].
bert_wordpiece_tokenizer = BertWordPieceTokenizer(
    vocab_file = './model/very_small_bertwordpiece-vocab.txt'
)
bert_wordpiece_tokenizer.encode('ABCDE').tokens

['[CLS]', 'abc', '##d', '##e', '[SEP]']

In [12]:
# add_special_tokens=False: the saved output has no [CLS]/[SEP] around the pieces.
bert_wordpiece_tokenizer.encode('ABCDE', add_special_tokens=False).tokens

['abc', '##d', '##e']

In [13]:
# Encode a sentence pair; in the saved output the two segments are joined as
# [CLS] <sequence> [SEP] <pair> [SEP].
bert_wordpiece_tokenizer.encode(
    sequence = 'abcde',
    pair = 'abcd'
).tokens

['[CLS]', 'abc', '##d', '##e', '[SEP]', 'abc', '##d', '[SEP]']

In [14]:
# Add 'lovit' to the live tokenizer, save the model, then reload it from the
# written vocab file. In the saved output the reloaded vocab does NOT contain
# 'lovit', i.e. the token added with add_tokens() was not persisted by save_model().
bert_wordpiece_tokenizer.add_tokens(['lovit'])

bert_wordpiece_tokenizer.save_model(
    directory = './model/',
    name = 'very_small_bertwordpiece_lovit')

bert_wordpiece_tokenizer = BertWordPieceTokenizer(
    vocab_file = './model/very_small_bertwordpiece_lovit-vocab.txt'
)

vocab = bert_wordpiece_tokenizer.get_vocab()

# Tokens in ascending id order.
print(sorted(vocab, key=lambda x: vocab[x]))

['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', 'a', 'b', 'c', 'd', 'e', 'f', 'g', '##f', '##c', '##b', '##d', '##e', 'ab', 'abc', 'af']


In [15]:
import os

# No cell in this notebook writes the "_lovit2" vocab file, so loading it fails on
# a fresh kernel. Build it here, only when it is missing, by appending 'lovit' to
# the vocab file saved by the previous cell (one token per line, id = line index).
lovit_vocab_path = './model/very_small_bertwordpiece_lovit-vocab.txt'
lovit2_vocab_path = './model/very_small_bertwordpiece_lovit2-vocab.txt'

if not os.path.exists(lovit2_vocab_path):
    with open(lovit_vocab_path, encoding='utf-8') as src:
        vocab_tokens = src.read().splitlines()
    if 'lovit' not in vocab_tokens:
        vocab_tokens.append('lovit')
    with open(lovit2_vocab_path, 'w', encoding='utf-8') as dst:
        dst.write('\n'.join(vocab_tokens) + '\n')

bert_wordpiece_tokenizer = BertWordPieceTokenizer(vocab_file=lovit2_vocab_path)

# In the saved output 'abg' maps to [UNK] while 'lovit' is kept as one token.
bert_wordpiece_tokenizer.encode('ABCDE abg lovit').tokens

['[CLS]', 'abc', '##d', '##e', '[UNK]', 'lovit', '[SEP]']

## SentencePiece BPE Tokenizer

In [16]:
# SentencePiece-style BPE with add_prefix_space enabled.
sentencepiece_tokenizer = SentencePieceBPETokenizer(add_prefix_space=True)
sentencepiece_tokenizer.train(
    files=[small_corpus],
    vocab_size=20,
    min_frequency=1,
    special_tokens=['<unk>'],
)

vocab = sentencepiece_tokenizer.get_vocab()

# List the learned tokens in ascending id order.
print(sorted(vocab, key=vocab.get))

['<unk>', 'A', 'B', 'C', 'D', 'E', 'F', '▁', '▁A', '▁AB', '▁ABC', 'DE', '▁DE', '▁AC', '▁AF', '▁ABD', '▁ABCDE']


In [17]:
# Same training with add_prefix_space disabled, and with 'lovit' registered as an
# extra special token (it is listed right after '<unk>' in the printed vocab).
sentencepiece_tokenizer = SentencePieceBPETokenizer(add_prefix_space=False)

sentencepiece_tokenizer.train(
    files=[small_corpus],
    vocab_size=20,
    min_frequency=1,
    special_tokens=['<unk>', 'lovit'],
)

vocab = sentencepiece_tokenizer.get_vocab()

# List the learned tokens in ascending id order.
print(sorted(vocab, key=vocab.get))

['<unk>', 'lovit', 'A', 'B', 'C', 'D', 'E', 'F', '▁', '▁A', '▁AB', 'DE', '▁ABC', 'AB', 'CDE', '▁AC', '▁AF', '▁ABD', 'ABCDE']


In [18]:
# Save into ./model/; the displayed return value lists a vocab.json and a merges.txt file.
sentencepiece_tokenizer.save_model('./model/', 'very_small_sentencepiece')

['./model/very_small_sentencepiece-vocab.json',
 './model/very_small_sentencepiece-merges.txt']

In [19]:
# Rebuild the tokenizer from the two files written by save_model().
# add_prefix_space is not passed here, so the constructor default applies.
sentencepiece_tokenizer = SentencePieceBPETokenizer(
    vocab_file = './model/very_small_sentencepiece-vocab.json',
    merges_file = './model/very_small_sentencepiece-merges.txt'
)

sentencepiece_tokenizer.encode('ABCDE').tokens

['▁ABC', 'DE']

## Character BPE Tokenizer

In [20]:
# Character-level BPE; '</w>' is the suffix attached to word-final pieces
# (visible in the output of this cell).
charbpe_tokenizer = CharBPETokenizer(suffix='</w>')
charbpe_tokenizer.train(
    files=[small_corpus],
    vocab_size=15,
    min_frequency=1,
)
charbpe_tokenizer.encode('ABCDE.ABC').tokens

['AB', 'C', 'DE</w>', 'ABC</w>']

In [21]:
# Same training, but with split_on_whitespace_only=True; compare the pieces
# produced for 'ABCDE.ABC' with the previous cell's output.
charbpe_tokenizer = CharBPETokenizer(suffix='</w>', split_on_whitespace_only=True)
charbpe_tokenizer.train(
    files=[small_corpus],
    vocab_size=15,
    min_frequency=1,
)
charbpe_tokenizer.encode('ABCDE.ABC').tokens

['AB', 'C', 'D', 'E', 'ABC</w>']

In [22]:
# IPython-only help lookup (`obj?` shows the docstring); it leaves no saved output.
# NOTE(review): leftover interactive exploration — consider removing from the final notebook.
charbpe_tokenizer.train?

In [23]:
# NOTE(review): this repeats the configuration of the split_on_whitespace_only
# cell two cells above (only a trailing comma differs) and yields the same output.
charbpe_tokenizer = CharBPETokenizer(
    suffix='</w>',
    split_on_whitespace_only = True
)
charbpe_tokenizer.train(
    files = [small_corpus],
    vocab_size = 15,
    min_frequency = 1,
)
charbpe_tokenizer.encode('ABCDE.ABC').tokens

['AB', 'C', 'D', 'E', 'ABC</w>']

In [24]:
# The input contains 'G' and 'H'; in the saved output no piece is produced for
# them (the token list stops at 'F').
charbpe_tokenizer.encode('ABCDEFGH').tokens

['AB', 'C', 'D', 'E', 'F']

## Byte-level BPE Tokenizer

- OpenAI GPT2 tokenizer

In [25]:
# Byte-level BPE on the toy corpus.
bytebpe_tokenizer = ByteLevelBPETokenizer(add_prefix_space=True)
bytebpe_tokenizer.train(
    files=[small_corpus],
    vocab_size=1000,
    min_frequency=1,
)

vocab = bytebpe_tokenizer.get_vocab()

# List the tokens in ascending id order.
print(sorted(vocab, key=vocab.get))

['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', '¡', '¢', '£', '¤', '¥', '¦', '§', '¨', '©', 'ª', '«', '¬', '®', '¯', '°', '±', '²', '³', '´', 'µ', '¶', '·', '¸', '¹', 'º', '»', '¼', '½', '¾', '¿', 'À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç', 'È', 'É', 'Ê', 'Ë', 'Ì', 'Í', 'Î', 'Ï', 'Ð', 'Ñ', 'Ò', 'Ó', 'Ô', 'Õ', 'Ö', '×', 'Ø', 'Ù', 'Ú', 'Û', 'Ü', 'Ý', 'Þ', 'ß', 'à', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï', 'ð', 'ñ', 'ò', 'ó', 'ô', 'õ', 'ö', '÷', 'ø', 'ù', 'ú', 'û', 'ü', 'ý', 'þ', 'ÿ', 'Ā', 'ā', 'Ă', 'ă', 'Ą', 'ą', 'Ć', 'ć', 'Ĉ', 'ĉ', 'Ċ', 'ċ'

In [26]:
# In the saved output each word-initial piece carries a leading 'Ġ' marker.
bytebpe_tokenizer.encode('ABCDE ABC').tokens

['ĠABCDE', 'ĠABC']

## 코로나19 관련 뉴스를 학습해 보자.

In [28]:
# Create the output directory used by save_model() in the next cell
# (-p: also creates parents and does not fail if it already exists).
!mkdir -p ./model/ByteLevelBPETokenizer/

In [29]:
from tokenizers import (
    ByteLevelBPETokenizer,
    CharBPETokenizer,
    SentencePieceBPETokenizer,
    BertWordPieceTokenizer,
)

# Corpus and vocabulary budget shared by the four tokenizers trained below.
# NOTE(review): the corpus path is relative to the working directory — confirm the file is available there.
corpus_path = './data/2020-07-29_covid_news_sents.txt'
vocab_size = 3000

# Train a byte-level BPE tokenizer and write its vocab/merges files.
byte_level_bpe_tokenizer = ByteLevelBPETokenizer()
byte_level_bpe_tokenizer.train(
    files=[corpus_path],
    vocab_size=vocab_size,
)
byte_level_bpe_tokenizer.save_model(
    directory='./model/ByteLevelBPETokenizer/',
    name='covid',
)

['./model/ByteLevelBPETokenizer/covid-vocab.json',
 './model/ByteLevelBPETokenizer/covid-merges.txt']

In [30]:
# Create the output directory used by save_model() in the next cell
# (-p: also creates parents and does not fail if it already exists).
!mkdir -p ./model/CharBPETokenizer/

In [31]:
# Train a character-level BPE tokenizer on the same corpus with the same
# vocab_size, then save it; the displayed value lists the written files.
char_bpe_tokenizer = CharBPETokenizer()
char_bpe_tokenizer.train(files=[corpus_path], vocab_size=vocab_size)
char_bpe_tokenizer.save_model(directory='./model/CharBPETokenizer/', name='covid')

['./model/CharBPETokenizer/covid-vocab.json',
 './model/CharBPETokenizer/covid-merges.txt']

In [32]:
# Create the output directory used by save_model() in the next cell
# (-p: also creates parents and does not fail if it already exists).
!mkdir -p ./model/SentencePieceBPETokenizer/

In [33]:
# Train a SentencePiece-style BPE tokenizer on the same corpus with the same
# vocab_size, then save it; the displayed value lists the written files.
sentencepiece_bpe_tokenizer = SentencePieceBPETokenizer()
sentencepiece_bpe_tokenizer.train(files=[corpus_path], vocab_size=vocab_size)
sentencepiece_bpe_tokenizer.save_model(directory='./model/SentencePieceBPETokenizer/', name='covid')

['./model/SentencePieceBPETokenizer/covid-vocab.json',
 './model/SentencePieceBPETokenizer/covid-merges.txt']

In [34]:
# Create the output directory used by save_model() in the next cell
# (-p: also creates parents and does not fail if it already exists).
!mkdir -p ./model/BertWordPieceTokenizer/

In [35]:
# Train a BERT WordPiece tokenizer on the same corpus with the same vocab_size.
# Note: this rebinds `bert_wordpiece_tokenizer`, replacing the toy tokenizer above.
bert_wordpiece_tokenizer = BertWordPieceTokenizer()

bert_wordpiece_tokenizer.train(
    files=[corpus_path], vocab_size=vocab_size)

# The displayed return value lists the single vocab.txt file that was written.
bert_wordpiece_tokenizer.save_model(
    directory='./model/BertWordPieceTokenizer/', name='covid')

['./model/BertWordPieceTokenizer/covid-vocab.txt']

In [36]:
# Sample sentence reused by the comparison cells that follow.
sent_ko = '신종 코로나바이러스 감염증(코로나19) 사태가 심각합니다'

# Named `trained_tokenizers` (not `tokenizers`) so the imported `tokenizers`
# module is not shadowed by a list in the kernel namespace.
trained_tokenizers = [bert_wordpiece_tokenizer,
                      sentencepiece_bpe_tokenizer,
                      char_bpe_tokenizer,
                      byte_level_bpe_tokenizer]

# Tokenize the same sentence with each trained tokenizer and print the pieces
# under the tokenizer's class name.
for tokenizer in trained_tokenizers:
    encode_single = tokenizer.encode(sent_ko)
    print(f'\n{tokenizer.__class__.__name__}')
    print(f'tokens = {encode_single.tokens}')


BertWordPieceTokenizer
tokens = ['신종', '코로나바이러스', '감염증', '(', '코로나19', ')', '사태', '##가', '심', '##각', '##합니다']

SentencePieceBPETokenizer
tokens = ['▁신종', '▁코로나바이러스', '▁감염증(코로나19)', '▁사태', '가', '▁심', '각', '합', '니다']

CharBPETokenizer
tokens = ['신종</w>', '코로나바이러스</w>', '감염증</w>', '(</w>', '코로나19</w>', ')</w>', '사태', '가</w>', '심', '각', '합니다</w>']

ByteLevelBPETokenizer
tokens = ['ìĭłì¢ħ', 'Ġì½Ķë¡ľëĤĺë°ĶìĿ´ëŁ¬ìĬ¤', 'Ġê°ĲìĹ¼ì¦Ŀ', '(', 'ì½Ķë¡ľëĤĺ', '19', ')', 'ĠìĤ¬íĥľ', 'ê°Ģ', 'Ġìĭ¬', 'ê°ģ', 'íķ©ëĭĪëĭ¤']


## 학습한 토크나이저를 transformers 에서 이용하자

In [37]:
from transformers import BertTokenizer, GPT2Tokenizer

# Build a transformers BertTokenizer from the vocab file written by the
# `tokenizers` WordPiece training above.
transformers_bert_tokenizer = BertTokenizer(
    vocab_file = './model/BertWordPieceTokenizer/covid-vocab.txt'
)

# Print both tokenizations of the same sentence for a side-by-side comparison.
print(f'tokenizers  : {bert_wordpiece_tokenizer.encode(sent_ko).tokens}')
print(f'transformers: {transformers_bert_tokenizer.tokenize(sent_ko)}')

tokenizers  : ['신종', '코로나바이러스', '감염증', '(', '코로나19', ')', '사태', '##가', '심', '##각', '##합니다']
transformers: ['신종', '코로나바이러스', '감염증', '(', '코로나19', ')', '사태', '##가', '심', '##각', '##합니다']


In [38]:
# Print the Python length of the first three tokens. In the saved output the
# lengths exceed the number of visible syllables (the first token reports 6).
# NOTE(review): presumably the tokens hold Hangul in decomposed (jamo) form —
# the NFKD example in the next cell shows the same length effect; confirm.
for token in bert_wordpiece_tokenizer.encode(sent_ko).tokens[:3]:
    print(f'len({token}) = {len(token)}')

len(신종) = 6
len(코로나바이러스) = 14
len(감염증) = 9


In [39]:
from unicodedata import normalize

# NFKD decomposes the Hangul syllables; when printed, the glyphs are drawn
# recombined, so only the length reveals the difference.
decomposed = normalize('NFKD', '가감')
print(decomposed, len(decomposed), sep='\n')

# NFKC composes the decomposed string back into precomposed syllables.
recomposed = normalize('NFKC', decomposed)
print(recomposed, len(recomposed), sep='\n')

가감
5
가감
2


In [40]:
from unicodedata import normalize

def compose(tokens):
    """Return a new list holding the NFKC-normalized form of every token.

    Args:
        tokens: iterable of strings.

    Returns:
        list of str, in the same order as ``tokens``.
    """
    composed = []
    for piece in tokens:
        composed.append(normalize('NFKC', piece))
    return composed

# Repeat the side-by-side comparison, passing each token list through compose()
# (NFKC normalization) before printing.
print(f'tokenizers  : {compose(bert_wordpiece_tokenizer.encode(sent_ko).tokens)}')
print(f'transformers: {compose(transformers_bert_tokenizer.tokenize(sent_ko))}')

tokenizers  : ['신종', '코로나바이러스', '감염증', '(', '코로나19', ')', '사태', '##가', '심', '##각', '##합니다']
transformers: ['신종', '코로나바이러스', '감염증', '(', '코로나19', ')', '사태', '##가', '심', '##각', '##합니다']


In [41]:
# Build a transformers GPT2Tokenizer from the vocab/merges files written by the
# byte-level BPE training above.
transformers_gpt2_tokenizer = GPT2Tokenizer(
    vocab_file = './model/ByteLevelBPETokenizer/covid-vocab.json',
    merges_file = './model/ByteLevelBPETokenizer/covid-merges.txt'
)

# Print both tokenizations of the same sentence; the saved output shows identical pieces.
print(f'tokenizers  : {byte_level_bpe_tokenizer.encode(sent_ko).tokens}')
print(f'transformers: {transformers_gpt2_tokenizer.tokenize(sent_ko)}')

tokenizers  : ['ìĭłì¢ħ', 'Ġì½Ķë¡ľëĤĺë°ĶìĿ´ëŁ¬ìĬ¤', 'Ġê°ĲìĹ¼ì¦Ŀ', '(', 'ì½Ķë¡ľëĤĺ', '19', ')', 'ĠìĤ¬íĥľ', 'ê°Ģ', 'Ġìĭ¬', 'ê°ģ', 'íķ©ëĭĪëĭ¤']
transformers: ['ìĭłì¢ħ', 'Ġì½Ķë¡ľëĤĺë°ĶìĿ´ëŁ¬ìĬ¤', 'Ġê°ĲìĹ¼ì¦Ŀ', '(', 'ì½Ķë¡ľëĤĺ', '19', ')', 'ĠìĤ¬íĥľ', 'ê°Ģ', 'Ġìĭ¬', 'ê°ģ', 'íķ©ëĭĪëĭ¤']
