# Hugging Face BPE Tokenizer
https://github.com/huggingface/tokenizers/tree/master/bindings/python

In [1]:
files = ["./data/정치.csv"]
modelname = "bpe_20200319"

## 학습용 파일 읽기

In [15]:
import pandas as pd
contents = []

for file in files :
    df = pd.read_csv(file)
    contents = contents + df["content"].tolist()

len(contents)

356435

---

## 1. Word Piece Model (WPM)

### 1) WPM 학습

In [10]:
from tokenizers import BertWordPieceTokenizer
tokenizer = BertWordPieceTokenizer()
tokenizer.train(files)
tokenizer.save("./model/wpm/", modelname)

['./model/wpm/bpe_20200319-vocab.txt']

In [35]:
from tokenizers import BertWordPieceTokenizer

# Load a BPE Model
vocab = "./model/wpm/{}-vocab.txt".format(modelname)
tokenizer = BertWordPieceTokenizer(vocab)

### 2) WPM 활용 토큰화

In [None]:
# Multiprocessing 으로 변경
import re
import numpy as np

def tokenizeBySplit(contents) :
    hangul = re.compile('[^ \u3131-\u3163\uac00-\ud7a3]+')

    tokenized_text = []
    cnt = 1

    for content in contents :
        try :
            if content == np.nan :
                continue 
                
            text = hangul.sub('', content)
            tokenized_text = tokenized_text + [w for w in tokenizer.encode(text).tokens if len(w) > 1]

            cnt += 1
            if cnt % 1000 == 0 : 
                print("{} / {}".format(cnt, len(contents)))
        except Exception as e:
            continue
            #print(str(e))
            #print(content)
            
    return tokenized_text

import multiprocessing

def tokenize(contents) :
    chunk_size = int(len(contents)/multiprocessing.cpu_count())
    li_split = [contents[i:i + chunk_size] for i in range(0, len(contents), chunk_size)]
    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    ret = pool.map(tokenizeBySplit, li_split)
    
    tokenized_text = []
    for text in ret :
        print(text)
        tokenized_text = tokenized_text + [token.replace("##","") for token in text if (token not in ('[CLS]','[SEP]','[UNK]')) & (len(token.replace("##",""))>1)]
    
    return tokenized_text

tokenized_text = tokenize(contents)

1000 / 44554
1000 / 44554
1000 / 44554
1000 / 44554
1000 / 44554
1000 / 44554
1000 / 44554
1000 / 44554
2000 / 44554
2000 / 44554
2000 / 44554
2000 / 44554
2000 / 44554
2000 / 44554
2000 / 44554
2000 / 44554
3000 / 44554
3000 / 44554
3000 / 44554
3000 / 44554
3000 / 44554
3000 / 44554
3000 / 44554
3000 / 44554
4000 / 44554
4000 / 44554
4000 / 44554
4000 / 44554
4000 / 44554
4000 / 44554
4000 / 44554
4000 / 44554
5000 / 44554
5000 / 44554
5000 / 44554
5000 / 44554
5000 / 44554
5000 / 44554
5000 / 44554
5000 / 44554
6000 / 44554
6000 / 44554
6000 / 44554
6000 / 44554
6000 / 44554
6000 / 44554
7000 / 44554
6000 / 44554
6000 / 44554
7000 / 44554
7000 / 44554
7000 / 44554
8000 / 44554
7000 / 44554
7000 / 44554
7000 / 44554
7000 / 44554
8000 / 44554
8000 / 44554
9000 / 44554
8000 / 44554
8000 / 44554
8000 / 44554
9000 / 44554
8000 / 44554
8000 / 44554
9000 / 44554
10000 / 44554
9000 / 44554
10000 / 44554
9000 / 44554
10000 / 44554
9000 / 44554
9000 / 44554
9000 / 44554
11000 / 44554
10000 / 

### 3) 토큰 분포 확인

In [None]:
import nltk
from nltk import FreqDist
import matplotlib.pyplot as plt
from matplotlib import rc
import seaborn as sns
%matplotlib inline

def calFreqDist(tokenized_text) :
    fdist=FreqDist(tokenized_text)
    rc('font', family='AppleGothic')
    fdist.plot(50)
    return fdist

fdist = calFreqDist(tokenized_text)

In [None]:
len(tokenized_text)

In [None]:
fdist.most_common(100)

In [None]:
from konlpy.tag import Mecab
import pandas as pd

def genVoab(fdist, name) :
    mecab = Mecab()
    vocab = []

    for w in fdist.most_common(10000) :
        pos = mecab.pos(w[0])
        if (pos[0][1][0] == "N") & (len(pos[0][0]) > 1) & (len(pos)>1) & (w[1]>100):
            m = str(pos)
            b = "".join([p[0] for p in pos])
            p = ",".join([p[1] for p in pos])
            f = w[1]
            #print(str(pos) + " " + str(w[1]) + " " + str(b))
            vocab.append((m,p,b,f))

    return pd.DataFrame(vocab, columns=['Mecab', 'PoS', name, 'Freq'])

df = genVoab(fdist, "WPM")
df.to_csv("./dict_wpm.csv")
df.head(50)        

---

## Sentence Piece Model (SPM)

In [9]:
from tokenizers import SentencePieceBPETokenizer
tokenizer = SentencePieceBPETokenizer()
tokenizer.train(files)
tokenizer.save("./model/spm/", modelname)

['./model/spm/bpe_20200319-vocab.json', './model/spm/bpe_20200319-merges.txt']

In [None]:
from tokenizers import SentencePieceBPETokenizer

# Load a BPE Model
vocab = "./model/spm/{}-vocab.json".format(modelname)
merges = "./model/spm/{}-merges.txt".format(modelname)
tokenizer = SentencePieceBPETokenizer(vocab, merges)

In [None]:
tokenized_text = tokenize(contents)

In [None]:
fdist = calFreqDist(tokenized_text)

In [None]:
df = genVoab(fdist, "SPM")
df.to_csv("./dict_spm.csv")
df.head(50)        

---

In [1]:
from konlpy.tag import Mecab
mecab = Mecab()
mecab.pos('자유한국당한국당예비후보때문에가능성국민들미래통합당필리버스터험지출마') # 사전등록후 Mecab

[('자유한국당', 'NNP'),
 ('한국당', 'NNP'),
 ('예비', 'NNG'),
 ('후보', 'NNG'),
 ('때문', 'NNB'),
 ('에', 'JKB'),
 ('가능', 'NNG'),
 ('성', 'XSN'),
 ('국민', 'NNG'),
 ('들', 'XSN'),
 ('미래통합당', 'NNP'),
 ('필리버스터', 'NNP'),
 ('험지출마', 'NNP')]

In [5]:
tokenizer.encode('자유한국당한국당예비후보때문에가능성국민들미래통합당필리버스터험지출마').tokens

['자유한국당',
 '한국당',
 '예비후보',
 '때문',
 '에',
 '가능성',
 '국민들',
 '미래통합당',
 '필리버스터',
 '험지',
 '출마</w>']