# SentencePiece VS Huggingface tokenizer

한국어 서브워드 분절 알고리즘 실습&비교

200825

고우영

| 학습시간 비교(s) | 8000   | 16000 | 32000 | 64000 | 128000|
|------|------|------|------|------|------|
|   Sentencepiece  | 11| 22 |48| 110 |282|
|   hugging_face  | 10| 11 |11| 12 |12|

| 추론시간 비교(s) | 8000| 128000|
|------|------|------|
|   Sentencepiece  | 4.5| 4.93 |
|   hugging_face  | 4.9| 4.97 |

# NSMC 데이터셋 로드
## 15만 문장, 113만 word(띄어쓰기 기준), 평균 7.5word/sentence

In [1]:
%%time
# NSMC 데이터 로드
import pandas as pd
f_train = pd.read_csv('data/nsmc.txt', sep='\t')
train_pair = [(row[1], row[2]) for _, row in f_train.iterrows() if type(row[1]) == str]  # nan 제거

#  문장 및 라벨 데이터 추출
train_data  = [pair[0] for pair in train_pair]
train_label = [pair[1] for pair in train_pair]
print('data loading done!')
print('문장: %s' %(train_data[:3]))
print('라벨: %s' %(train_label[:3]))

# subword 학습을 위해 문장만 따로 저장
with open('data/train_tokenizer.txt', 'w', encoding='utf-8') as f:
    for line in train_data:
        f.write(line+'\n')

# subword 학습을 위해 문장만 따로 저장
with open('data/train_tokenizer.txt', 'r', encoding='utf-8') as f:
    test_tokenizer = f.read().split('\n')
print(test_tokenizer[:3])

num_word_list = [len(sentence.split()) for sentence in test_tokenizer]
print('\n코퍼스 문장수/평균/총 단어 갯수 : %d, %.1f / %d' % (len(num_word_list), sum(num_word_list)/len(num_word_list), sum(num_word_list)))

data loading done!
문장: ['아 더빙.. 진짜 짜증나네요 목소리', '흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나', '너무재밓었다그래서보는것을추천한다']
라벨: [0, 1, 0]
['아 더빙.. 진짜 짜증나네요 목소리', '흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나', '너무재밓었다그래서보는것을추천한다']

코퍼스 문장수/평균/총 단어 갯수 : 149996, 7.6 / 1137736
Wall time: 19.3 s


# SentencePiece 학습

In [2]:
%%time
import sentencepiece as spm

input_file = 'data/train_tokenizer.txt'
vocab_size = 32000
model_name = 'model_sentencepiece/sentencepiece_tokenizer_kor_%d' % (vocab_size)
model_type = 'bpe'
character_coverage  = 1.0  # 0.9995
user_defined_symbols = '[PAD],[UNK],[CLS],[SEP],[MASK],[BOS],[EOS],[UNK0],[UNK1],[UNK2],[UNK3],[UNK4],[UNK5],[UNK6],[UNK7],[UNK8],[UNK9],[unused0],[unused1],[unused2],[unused3],[unused4],[unused5],[unused6],[unused7],[unused8],[unused9],[unused10],[unused11],[unused12],[unused13],[unused14],[unused15],[unused16],[unused17],[unused18],[unused19],[unused20],[unused21],[unused22],[unused23],[unused24],[unused25],[unused26],[unused27],[unused28],[unused29],[unused30],[unused31],[unused32],[unused33],[unused34],[unused35],[unused36],[unused37],[unused38],[unused39],[unused40],[unused41],[unused42],[unused43],[unused44],[unused45],[unused46],[unused47],[unused48],[unused49],[unused50],[unused51],[unused52],[unused53],[unused54],[unused55],[unused56],[unused57],[unused58],[unused59],[unused60],[unused61],[unused62],[unused63],[unused64],[unused65],[unused66],[unused67],[unused68],[unused69],[unused70],[unused71],[unused72],[unused73],[unused74],[unused75],[unused76],[unused77],[unused78],[unused79],[unused80],[unused81],[unused82],[unused83],[unused84],[unused85],[unused86],[unused87],[unused88],[unused89],[unused90],[unused91],[unused92],[unused93],[unused94],[unused95],[unused96],[unused97],[unused98],[unused99]'

input_argument = '--input=%s --model_prefix=%s --vocab_size=%s --user_defined_symbols=%s --model_type=%s --character_coverage=%s'
cmd = input_argument%(input_file, model_name, vocab_size,user_defined_symbols, model_type, character_coverage)

spm.SentencePieceTrainer.Train(cmd)
print('train done')

train done
Wall time: 47.1 s


In [3]:
## check
import sentencepiece as spm
sp = spm.SentencePieceProcessor()
sp.Load('{}.model'.format(model_name))
sentencepiece_tokenizer = sp.encode
token = sentencepiece_tokenizer('나는 오늘 아침밥을 먹었다.', out_type=str)

print(token)

['▁나는', '▁오늘', '▁아침', '밥', '을', '▁먹', '었다', '.']


# Huggingface tokenizer

## 1. Huggingface setup

## 2. Huggingface train

In [4]:
%%time
from tokenizers import BertWordPieceTokenizer

corpus_file   = 'data/train_tokenizer.txt'
vocab_size    = 32000
limit_alphabet= 6000
output_path   = 'model_hugging_face/hugging_%d'%(vocab_size)
min_frequency = 5

## WordPiece 모델 선언
tokenizer = BertWordPieceTokenizer(
    vocab_file=None,
    clean_text=True,
    handle_chinese_chars=True,
    strip_accents=False,  # Must be False if cased model
    lowercase=False,
    wordpieces_prefix="##")

## 모델 학습
tokenizer.train(
    files=[corpus_file],
    limit_alphabet=limit_alphabet,
    vocab_size=vocab_size,
    min_frequency = min_frequency,  # 단어의 최소 발생 빈도, 3
    show_progress = True,
    special_tokens=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]',     
                    '[BOS]', '[EOS]', '[UNK0]', '[UNK1]', '[UNK2]', '[UNK3]', '[UNK4]', '[UNK5]', '[UNK6]', '[UNK7]', '[UNK8]', '[UNK9]',
                    '[unused0]', '[unused1]', '[unused2]', '[unused3]', '[unused4]', '[unused5]', '[unused6]', '[unused7]', '[unused8]', '[unused9]',
                    '[unused10]', '[unused11]', '[unused12]', '[unused13]', '[unused14]', '[unused15]', '[unused16]', '[unused17]', '[unused18]', '[unused19]',
                    '[unused20]', '[unused21]', '[unused22]', '[unused23]', '[unused24]', '[unused25]', '[unused26]', '[unused27]', '[unused28]', '[unused29]',
                    '[unused30]', '[unused31]', '[unused32]', '[unused33]', '[unused34]', '[unused35]', '[unused36]', '[unused37]', '[unused38]', '[unused39]',
                    '[unused40]', '[unused41]', '[unused42]', '[unused43]', '[unused44]', '[unused45]', '[unused46]', '[unused47]', '[unused48]', '[unused49]',
                    '[unused50]', '[unused51]', '[unused52]', '[unused53]', '[unused54]', '[unused55]', '[unused56]', '[unused57]', '[unused58]', '[unused59]',
                    '[unused60]', '[unused61]', '[unused62]', '[unused63]', '[unused64]', '[unused65]', '[unused66]', '[unused67]', '[unused68]', '[unused69]',
                    '[unused70]', '[unused71]', '[unused72]', '[unused73]', '[unused74]', '[unused75]', '[unused76]', '[unused77]', '[unused78]', '[unused79]',
                    '[unused80]', '[unused81]', '[unused82]', '[unused83]', '[unused84]', '[unused85]', '[unused86]', '[unused87]', '[unused88]', '[unused89]',
                    '[unused90]', '[unused91]', '[unused92]', '[unused93]', '[unused94]', '[unused95]', '[unused96]', '[unused97]', '[unused98]', '[unused99]'
                   ],  # 스페셜 토큰
)
## tokenizer, vocab 저장
tokenizer.save_model('.', output_path)

Wall time: 7.16 s


## 3. Huggingface Tokenize test

In [5]:
from tokenizers import BertWordPieceTokenizer

tokenizer = BertWordPieceTokenizer(
    vocab_file=output_path+'-vocab.txt',
    clean_text=True,
    handle_chinese_chars=True,
    strip_accents=False, # Cased 모델 시 False
    lowercase=False,
    wordpieces_prefix="##"
)

output = tokenizer.encode("나는 오늘 아침밥을 먹었다.")
print('idx   : %s'%output.ids)
print('tokens: %s'%output.tokens)
print('offset: %s'%output.offsets)

output = tokenizer.encode("교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정")
print('idx   : %s'%output.ids)
print('tokens: %s'%output.tokens)
print('offset: %s'%output.offsets)

idx   : [2, 5951, 6105, 7623, 3482, 3216, 1472, 5595, 130, 3]
tokens: ['[CLS]', '나는', '오늘', '아침', '##밥', '##을', '먹', '##었다', '.', '[SEP]']
offset: [(0, 0), (0, 2), (3, 5), (6, 8), (8, 9), (9, 10), (11, 12), (12, 14), (14, 15), (0, 0)]
idx   : [2, 25910, 5691, 12664, 130, 130, 5821, 6481, 5659, 130, 130, 5577, 13610, 3]
tokens: ['[CLS]', '교도소', '이야기', '##구먼', '.', '.', '솔직히', '재미는', '없다', '.', '.', '평점', '조정', '[SEP]']
offset: [(0, 0), (0, 3), (4, 7), (7, 9), (10, 11), (11, 12), (12, 15), (16, 19), (20, 22), (22, 23), (23, 24), (24, 26), (27, 29), (0, 0)]


# Tokenze usage

## 1. SentencePiece Usage, load & 분절

In [7]:
%%time
import sentencepiece as spm
sp = spm.SentencePieceProcessor()
sp.Load('{}.model'.format('model_sentencepiece/sentencepiece_tokenizer_kor_32000'))
sentencepiece_tokenizer = sp.encode

result_tokenized_sentencepiece= sentencepiece_tokenizer(test_tokenizer, out_type=str)
for tmp in result_tokenized_sentencepiece[:3]:
    print(tmp)

['▁아', '▁더빙', '..', '▁진짜', '▁짜증나네요', '▁목소리']
['▁흠', '...', '포스터', '보고', '▁초딩영화', '줄', '....', '오버', '연기', '조차', '▁가볍지', '▁않', '구나']
['▁너무', '재', '밓', '었다', '그래서', '보는', '것을', '추천', '한다']
Wall time: 4.46 s


## 2. Huggingface Usage, load & 분절

In [9]:
%%time
import sentencepiece as spm
sp = spm.SentencePieceProcessor()
sp.Load('{}.model'.format('model_sentencepiece/sentencepiece_tokenizer_kor_32000'))
sentencepiece_tokenizer = sp.encode

result_tokenized_sentencepiece= sentencepiece_tokenizer(test_tokenizer, out_type=str)
for tmp in result_tokenized_sentencepiece[:3]:
    print(tmp)

['▁아', '▁더빙', '..', '▁진짜', '▁짜증나네요', '▁목소리']
['▁흠', '...', '포스터', '보고', '▁초딩영화', '줄', '....', '오버', '연기', '조차', '▁가볍지', '▁않', '구나']
['▁너무', '재', '밓', '었다', '그래서', '보는', '것을', '추천', '한다']
Wall time: 4.16 s
