In [1]:
import re, collections

In [12]:
# Toy training corpus for BPE. Each key is a word already split into
# space-separated characters and closed with the end-of-word marker '</w>';
# each value is how many times that word occurs.
dictionary = {
    'l o w </w>': 5,
    'l o w e r </w>': 2,
    'n e w e s t </w>': 6,
    'w i d e s t </w>': 3,
}

# BPE training: repeatedly merge the most frequent pair of adjacent symbols into a single symbol.
def get_stats(dictionary):
    """Count how often each adjacent symbol pair occurs across the vocabulary.

    Args:
        dictionary: mapping from a word written as space-separated symbols
            (e.g. ``'l o w </w>'``) to that word's frequency.

    Returns:
        A ``collections.defaultdict(int)`` keyed by ``(left, right)`` symbol
        tuples. Each occurrence of a pair inside a word adds that word's
        frequency to the pair's count.
    """
    pair_counts = collections.defaultdict(int)
    for entry, weight in dictionary.items():
        tokens = entry.split()
        # Pair the symbol sequence with itself shifted by one position so that
        # every neighbouring (left, right) couple is visited in order.
        for left, right in zip(tokens[:-1], tokens[1:]):
            pair_counts[left, right] += weight
    return pair_counts

def merge_dictionary(pair, v_in):
    """Apply one BPE merge: fuse every occurrence of ``pair`` in the vocabulary.

    Args:
        pair: two-element sequence ``(left, right)`` of symbols to fuse.
        v_in: mapping from a word written as space-separated symbols to its
            frequency.

    Returns:
        A new dict in which every whole-symbol occurrence of ``left right`` in
        each key is replaced by ``leftright``. ``v_in`` is not modified. If two
        input words become identical after the merge, their frequencies are
        added together.
    """
    merged_symbol = ''.join(pair)
    bigram = re.escape(' '.join(pair))
    # The lookarounds demand whitespace (or the string edge) on both sides, so
    # the pair only matches whole symbols, never the tail/head of longer ones.
    p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
    v_out = {}
    for word, freq in v_in.items():
        # A callable replacement inserts the merged symbol literally; a plain
        # replacement string would have its backslashes processed as escapes.
        w_out = p.sub(lambda _match: merged_symbol, word)
        # Accumulate rather than assign so colliding keys do not lose counts.
        v_out[w_out] = v_out.get(w_out, 0) + freq
    return v_out

bpe_codes = {}          # merge pair -> rank (order in which the merge was learned)
bpe_codes_reverse = {}  # merged symbol -> the pair it was built from

num_merges = 10  # number of BPE merge steps to perform
for i in range(num_merges):
    pairs = get_stats(dictionary)
    if not pairs:
        # Every word has already collapsed into a single symbol. max() on an
        # empty mapping would raise ValueError, so stop cleanly instead.
        print("no pairs left to merge; stopped after {} merges".format(i))
        break

    # Most frequent pair; on ties max() keeps the first one it encounters.
    best = max(pairs, key=pairs.get)
    dictionary = merge_dictionary(best, dictionary)

    bpe_codes[best] = i
    bpe_codes_reverse[best[0] + best[1]] = best

    # Report only the state after the final merge to keep the output short.
    if i == num_merges - 1:
        print("### Iteration {}".format(i + 1))
        print("new merge: {}".format(best))
        print("dictionary: {}".format(dictionary))

### Iteration 10
new merge: ('w', 'i')
dictionary: {'low</w>': 5, 'low e r </w>': 2, 'newest</w>': 6, 'wi d est</w>': 3}


In [13]:
# Show the learned merge table: each pair maps to the step at which it was merged.
print(bpe_codes)

{('e', 's'): 0, ('es', 't'): 1, ('est', '</w>'): 2, ('l', 'o'): 3, ('lo', 'w'): 4, ('n', 'e'): 5, ('ne', 'w'): 6, ('new', 'est</w>'): 7, ('low', '</w>'): 8, ('w', 'i'): 9}


In [16]:
# Handling OOV (out-of-vocabulary) words
def get_pairs(word):
    """Return the set of adjacent symbol pairs in ``word``.

    Args:
        word: sliceable sequence of symbols (e.g. a tuple of strings, or a
            plain string whose characters are the symbols).

    Returns:
        A set of ``(left, right)`` tuples, one per distinct neighbouring pair.
        A sequence with fewer than two symbols, including an empty one,
        yields an empty set.
    """
    # Pairing the sequence with itself shifted by one enumerates all
    # neighbours and, unlike reading word[0], cannot fail on empty input.
    return set(zip(word, word[1:]))

def encode(orig, merges=None, verbose=True):
    """Segment a word into subword units by replaying learned BPE merges.

    The word is split into characters plus the end-of-word marker ``'</w>'``.
    On each round the adjacent pair with the lowest merge rank is fused
    everywhere it occurs, until no remaining pair is in the merge table or the
    word has become a single symbol.

    Args:
        orig: the word to encode, as a string.
        merges: mapping from ``(left, right)`` symbol pairs to their merge
            rank (lower rank is applied first). Defaults to the module-level
            ``bpe_codes`` table.
        verbose: when true (the default), print a trace of every round.

    Returns:
        A tuple of subword strings with the end-of-word marker stripped from
        the last one. For an empty ``orig`` there is nothing to pair and
        ``orig`` itself is returned unchanged.
    """
    if merges is None:
        merges = bpe_codes

    word = tuple(orig) + ('</w>', )
    if verbose:
        print(f'word split into characters:{word}')
    pairs = get_pairs(word)
    if not pairs:
        return orig

    iteration = 0
    while True:
        iteration += 1
        if verbose:
            print(f'Iteration {iteration}: ')
            print(f'bigrams in the word: {pairs}')
        # Pairs missing from the table rank as +inf, so a known merge always wins.
        bigram = min(pairs, key=lambda pair: merges.get(pair, float('inf')))
        if verbose:
            print(f'candidate for merging: {bigram}')
        if bigram not in merges:
            if verbose:
                print('Candidate not in BPE merges, algorithm stops.')
            break

        first, second = bigram
        new_word = []
        i = 0
        while i < len(word):
            try:
                j = word.index(first, i)
            except ValueError:
                # tuple.index found no further `first`: copy the tail and finish.
                new_word.extend(word[i:])
                break
            # Copy everything before the next `first` untouched.
            new_word.extend(word[i:j])
            i = j

            if i < len(word) - 1 and word[i + 1] == second:
                new_word.append(first + second)
                i += 2
            else:
                new_word.append(word[i])
                i += 1

        word = tuple(new_word)
        if verbose:
            print(f'word after merging : {word}')
        if len(word) == 1:
            break
        pairs = get_pairs(word)

    # Remove the end-of-word marker: either it is still a standalone symbol,
    # or it has been fused onto the end of the last subword.
    if word[-1] == '</w>':
        word = word[:-1]
    elif word[-1].endswith('</w>'):
        # Slice off only the trailing marker instead of replacing every occurrence.
        word = word[:-1] + (word[-1][:-len('</w>')], )

    return word

In [21]:
# 'lowing' is not one of the training words; encode() segments it with the learned merges.
encode('lowing')

word split into characters:('l', 'o', 'w', 'i', 'n', 'g', '</w>')
Iteration 1: 
bigrams in the word: {('l', 'o'), ('g', '</w>'), ('o', 'w'), ('w', 'i'), ('i', 'n'), ('n', 'g')}
candidate for merging: ('l', 'o')
word after merging : ('lo', 'w', 'i', 'n', 'g', '</w>')
Iteration 2: 
bigrams in the word: {('g', '</w>'), ('w', 'i'), ('lo', 'w'), ('i', 'n'), ('n', 'g')}
candidate for merging: ('lo', 'w')
word after merging : ('low', 'i', 'n', 'g', '</w>')
Iteration 3: 
bigrams in the word: {('g', '</w>'), ('n', 'g'), ('low', 'i'), ('i', 'n')}
candidate for merging: ('g', '</w>')
Candidate not in BPE merges, algorithm stops.


('low', 'i', 'n', 'g')

In [22]:
import sentencepiece as spm
import pandas as pd
import urllib.request
import csv

# Download the IMDb review CSV into the working directory (requires network access).
urllib.request.urlretrieve("https://raw.githubusercontent.com/LawrenceDuan/IMDb-Review-Analysis/master/IMDb_Reviews.csv", filename="IMDb_Reviews.csv")

# Load the CSV and display its 'review' text column.
train_df = pd.read_csv('IMDb_Reviews.csv')
train_df['review']

0        My family and I normally do not watch local mo...
1        Believe it or not, this was at one time the wo...
2        After some internet surfing, I found the "Home...
3        One of the most unheralded great works of anim...
4        It was the Sixties, and anyone with long hair ...
                               ...                        
49995    the people who came up with this are SICK AND ...
49996    The script is so so laughable... this in turn,...
49997    "So there's this bride, you see, and she gets ...
49998    Your mind will not be satisfied by this nobud...
49999    The chaser's war on everything is a weekly sho...
Name: review, Length: 50000, dtype: object

In [23]:
# Write all reviews, joined by newlines, to the UTF-8 text file used as trainer input.
with open('imdb_review.txt', 'w', encoding='utf8') as f:
    f.write('\n'.join(train_df['review']))

In [24]:
# input: file to train on
# model_prefix: name (prefix) of the model to be created
# vocab_size: size of the vocabulary
# model_type: model to use (unigram (default), bpe, char, word)
# max_sentence_length: maximum sentence length
# pad_id, pad_piece: pad token id and value
# unk_id, unk_piece: unknown token id and value
# bos_id, bos_piece: beginning-of-sentence token id and value
# eos_id, eos_piece: end-of-sequence token id and value
# user_defined_symbols: user-defined tokens

spm.SentencePieceTrainer.Train('--input=imdb_review.txt --model_prefix=imdb --vocab_size=5000 --model_type=bpe --max_sentence_length=9999')

sentencepiece_trainer.cc(177) LOG(INFO) Running command: --input=imdb_review.txt --model_prefix=imdb --vocab_size=5000 --model_type=bpe --max_sentence_length=9999
sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: imdb_review.txt
  input_format: 
  model_prefix: imdb
  model_type: BPE
  vocab_size: 5000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 9999
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  u

ll=68586 active=3709 piece=▁sat
bpe_model_trainer.cc(280) LOG(INFO) Added: freq=2324 size=1640 all=69187 active=4310 piece=ern
bpe_model_trainer.cc(280) LOG(INFO) Added: freq=2291 size=1660 all=69602 active=4725 piece=▁cult
bpe_model_trainer.cc(280) LOG(INFO) Added: freq=2256 size=1680 all=69899 active=5022 piece=▁fig
bpe_model_trainer.cc(280) LOG(INFO) Added: freq=2224 size=1700 all=70385 active=5508 piece=▁cost
bpe_model_trainer.cc(171) LOG(INFO) Updating active symbols. max_freq=2224 min_freq=543
bpe_model_trainer.cc(280) LOG(INFO) Added: freq=2192 size=1720 all=70846 active=3967 piece=▁happens
bpe_model_trainer.cc(280) LOG(INFO) Added: freq=2151 size=1740 all=71385 active=4506 piece=▁fore
bpe_model_trainer.cc(280) LOG(INFO) Added: freq=2108 size=1760 all=71863 active=4984 piece=gs
bpe_model_trainer.cc(280) LOG(INFO) Added: freq=2077 size=1780 all=72390 active=5511 piece=ering
bpe_model_trainer.cc(280) LOG(INFO) Added: freq=2047 size=1800 all=72864 active=5985 piece=▁cheap
bpe_model

In [39]:
# Read the tab-separated, header-less 'imdb.vocab' file and show three random rows.
# QUOTE_NONE turns off CSV quote handling — presumably so that quote characters
# which are themselves pieces are read literally (TODO confirm).
vocab_list = pd.read_csv('imdb.vocab', sep='\t', header=None, quoting=csv.QUOTE_NONE)
vocab_list.sample(3)

Unnamed: 0,0,1
1901,▁stay,-1898
78,se,-75
3,▁t,0


In [40]:
# Create a SentencePiece processor and load the trained IMDb model into it.
# Note: despite its name, `vocab_file` holds the path of the .model file.
sp = spm.SentencePieceProcessor()
vocab_file = 'imdb.model'
sp.load(vocab_file)

True

In [41]:
# For each sample sentence print the raw text, its subword pieces and their ids.
lines = ["I didn't at all think of it this way.", 
         "I have waited a long time for someone to film"]
for line in lines:
    print(line)
    print(sp.encode_as_pieces(line))
    print(sp.encode_as_ids(line))
    print()

I didn't at all think of it this way.
['▁I', '▁didn', "'", 't', '▁at', '▁all', '▁think', '▁of', '▁it', '▁this', '▁way', '.']
[41, 623, 4950, 4926, 138, 169, 378, 30, 58, 73, 413, 4945]

I have waited a long time for someone to film
['▁I', '▁have', '▁wa', 'ited', '▁a', '▁long', '▁time', '▁for', '▁someone', '▁to', '▁film']
[41, 141, 1364, 1120, 4, 666, 285, 92, 1078, 33, 91]



In [48]:
# Tour of the processor API: vocabulary size, id <-> piece lookups,
# decoding from ids and from pieces, and encode() with both output types.
print(sp.GetPieceSize())
print(sp.IdToPiece(430))
print(sp.PieceToId('▁character'))
print(sp.DecodeIds([41, 141, 1364, 1120, 4, 666, 285, 92, 1078, 33, 91]))
print(sp.DecodePieces(
    ['▁I', '▁have', '▁wa', 'ited', '▁a', '▁long', '▁time', '▁for', '▁someone', '▁to', '▁film']))
# out_type selects whether encode() returns piece strings or integer ids.
print(sp.encode('I have waited a long time for someone to film', out_type=str))
print(sp.encode('I have waited a long time for someone to film', out_type=int))

5000
▁character
430
I have waited a long time for someone to film
I have waited a long time for someone to film
['▁I', '▁have', '▁wa', 'ited', '▁a', '▁long', '▁time', '▁for', '▁someone', '▁to', '▁film']
[41, 141, 1364, 1120, 4, 666, 285, 92, 1078, 33, 91]


In [2]:
import pandas as pd
import sentencepiece as spm
import urllib.request
import csv

# Download the NSMC ratings file (requires network access), load it as a
# table and preview the first five rows.
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings.txt", filename="ratings.txt")
naver_df = pd.read_table('ratings.txt')
naver_df[:5]

Unnamed: 0,id,document,label
0,8112052,어릴때보고 지금다시봐도 재밌어요ㅋㅋ,1
1,8132799,"디자인을 배우는 학생으로, 외국디자이너와 그들이 일군 전통을 통해 발전해가는 문화산...",1
2,4655635,폴리스스토리 시리즈는 1부터 뉴까지 버릴께 하나도 없음.. 최고.,1
3,9251303,와.. 연기가 진짜 개쩔구나.. 지루할거라고 생각했는데 몰입해서 봤다.. 그래 이런...,1
4,10067386,안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화.,1


In [3]:
# True if any cell in the frame is missing.
print(naver_df.isnull().values.any())

True


In [4]:
# Drop every row containing a missing value, then re-check that none remain.
naver_df = naver_df.dropna(how='any')
print(naver_df.isnull().values.any())

False


In [5]:
# Write all review documents, joined by newlines, to the UTF-8 text file used as trainer input.
with open('naver_review.txt', 'w', encoding='utf8') as f:
    f.write('\n'.join(naver_df['document']))

In [6]:
# Train a BPE SentencePiece model on the Naver reviews, using 'naver' as the
# model prefix and a vocabulary size of 5000.
spm.SentencePieceTrainer.Train('--input=naver_review.txt --model_prefix=naver --vocab_size=5000 --model_type=bpe --max_sentence_length=9999')

sentencepiece_trainer.cc(177) LOG(INFO) Running command: --input=naver_review.txt --model_prefix=naver --vocab_size=5000 --model_type=bpe --max_sentence_length=9999
sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: naver_review.txt
  input_format: 
  model_prefix: naver
  model_type: BPE
  vocab_size: 5000
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 9999
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1

In [9]:
# Read the tab-separated, header-less 'naver.vocab' file with CSV quote
# handling disabled, then print the first five rows and three random rows.
vocab_list = pd.read_csv('naver.vocab', sep='\t', header=None, quoting=csv.QUOTE_NONE)
print(vocab_list[:5])
print(vocab_list.sample(3))

       0  1
0  <unk>  0
1    <s>  0
2   </s>  0
3     ..  0
4     영화 -1
        0     1
3098  감독이 -3095
3822    틀 -3819
3637    왔 -3634


In [10]:
# Rebind `sp` to a fresh processor and load the trained Naver model into it.
# Note: despite its name, `vocab_file` holds the path of the .model file.
sp = spm.SentencePieceProcessor()
vocab_file = "naver.model"
sp.load(vocab_file)

True

In [11]:
# For each sample sentence print the raw text, its subword pieces and their ids.
lines = ["뭐 이딴 것도 영화냐.", "진짜 최고의 영화입니다 ㅋㅋ",]
for line in lines:
  print(line)
  print(sp.encode_as_pieces(line))
  print(sp.encode_as_ids(line))
  print()

뭐 이딴 것도 영화냐.
['▁뭐', '▁이딴', '▁것도', '▁영화냐', '.']
[132, 966, 1296, 2590, 3276]

진짜 최고의 영화입니다 ㅋㅋ
['▁진짜', '▁최고의', '▁영화입니다', '▁ᄏᄏ']
[54, 200, 821, 85]



In [12]:
import pandas as pd
import urllib.request
from tokenizers import BertWordPieceTokenizer

# Download the NSMC ratings file into the working directory (requires network access).
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings.txt", filename="ratings.txt")

('ratings.txt', <http.client.HTTPMessage at 0x7f9a9856d4c0>)

In [13]:
# Load the ratings table, drop rows with any missing value, and write the
# review documents, joined by newlines, to a UTF-8 text file.
naver_df = pd.read_table('ratings.txt')
naver_df = naver_df.dropna(how='any')
with open('naver_review.txt', 'w', encoding='utf8') as f:
    f.write('\n'.join(naver_df['document']))

In [15]:
# Create a BERT WordPiece tokenizer with lowercasing and accent stripping both disabled.
tokenizer = BertWordPieceTokenizer(lowercase=False, strip_accents=False)

In [16]:
# Training settings for the WordPiece tokenizer.
data_file = 'naver_review.txt'
vocab_size = 30000
limit_alphabet = 6000
min_frequency = 5

# Train the tokenizer on the review corpus with the settings above.
tokenizer.train(
    files=data_file, vocab_size=vocab_size, 
    limit_alphabet=limit_alphabet, # allowed number of initial tokens before merging
    min_frequency=min_frequency)






In [17]:
# Save the trained tokenizer's model files into the current directory.
tokenizer.save_model('./')

['./vocab.txt']

In [18]:
# vocab.txt is read here as one token per line. Read it line by line instead
# of with pd.read_fwf: fixed-width parsing infers column boundaries from a
# sample of leading rows and applies pandas' default NA/number conversion, so
# tokens may be truncated, retyped or turned into NaN.
with open('vocab.txt', encoding='utf8') as f:
    df = pd.DataFrame([line.rstrip('\n') for line in f])
df

Unnamed: 0,0
0,[PAD]
1,[UNK]
2,[CLS]
3,[SEP]
4,[MASK]
...,...
29995,말라는
29996,말밖에는
29997,맘을
29998,맛도


In [20]:
# Encode each sample sentence and show its tokens, its ids, and the text
# obtained by decoding those ids again. One loop replaces the previously
# duplicated stanza; `encoded` still ends up bound to the last sentence's result.
for sentence in ('일정한 간격 고정된 구조의 외부데이터 불러오기',
                 '커피 한잔의 여유를 즐기다'):
    encoded = tokenizer.encode(sentence)
    print(f'tokenizer result:{encoded.tokens}')
    print(f'encode : {encoded.ids}')
    print(f'decoding : {tokenizer.decode(encoded.ids)}')

tokenizer result:['일', '##정한', '간', '##격', '고정', '##된', '구조', '##의', '외', '##부', '##데이', '##터', '불러', '##오', '##기']
encode : [2344, 11323, 568, 3618, 15453, 3409, 12266, 3251, 2252, 3249, 9920, 3549, 8625, 3427, 3447]
decoding : 일정한 간격 고정된 구조의 외부데이터 불러오기
tokenizer result:['커피', '한잔', '##의', '여유', '##를', '즐기', '##다']
encode : [12825, 25644, 3251, 12696, 3239, 10784, 3290]
decoding : 커피 한잔의 여유를 즐기다


In [21]:
from tokenizers import ByteLevelBPETokenizer, CharBPETokenizer, SentencePieceBPETokenizer

# Rebind `tokenizer` to a SentencePiece-style BPE tokenizer and train it on the
# same review corpus with a vocabulary size of 10000.
tokenizer = SentencePieceBPETokenizer()
tokenizer.train('naver_review.txt', vocab_size=10000, min_frequency=5)

# Encode one sample sentence and show the resulting tokens.
encoded = tokenizer.encode('이 영화는 정말 재미있습니다.')
print(encoded.tokens)




['▁이', '▁영화는', '▁정말', '▁재미있', '습니다.']
