# Word Piece Tokenizer

<br> https://lovit.github.io/nlp/2018/04/02/wpm/
<br> https://wikidocs.net/22592

In [37]:
import pandas as pd # for dataframe
import numpy as np # for numpy data structure

import os

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression # baseline model for classification
from sklearn.model_selection import train_test_split

In [38]:
# check directory
os.listdir('./data/')

['.DS_Store',
 '3_class_naver_news_200819_test.csv',
 '3_class_naver_news_200819_train.csv']

In [39]:
# header=None tells pandas not to treat the first row as column names, so the
# frames get integer column labels until names are assigned explicitly.
train = pd.read_csv('./data/3_class_naver_news_200819_train.csv', header=None)
test = pd.read_csv('./data/3_class_naver_news_200819_test.csv', header=None)
train.head(3)

Unnamed: 0,0,1,2,3,4,5,6
0,0,20170108,IT_science,데일리안,‘효자폰’ 갤럭시S7 ‘불효폰’ G5…분위기 극과극,삼성전자 ‘갤럭시S7’ 왼쪽 LG전자 ‘G5’ 오른쪽 모델 이미지 ⓒ각 사 삼성전자...,https://news.naver.com/main/read.nhn?mode=LSD&...
1,1,20170111,IT_science,디지털타임스,카카오 O2O 플랫폼 사업 윤곽도 못잡아,11월초 공개 O2O 위드 카카오 두달 넘게 모습 드러내지 않아 후발주자로 출시까지...,https://news.naver.com/main/read.nhn?mode=LSD&...
2,2,20170103,economy,조세일보,동원 고위도高緯度 북대서양 참다랑어 어획 성공,동원산업 대표이사 이명우 은 고위도 高緯度 북대서양 참다랑어를 어획하는데 성공했다고...,https://news.naver.com/main/read.nhn?mode=LSD&...


In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
print(train.shape)
print()
train.info()

In [40]:
# Replace the integer column labels with descriptive names; the same seven names
# are applied to both the train and the test frame.
colnames = ['index', 'date', 'category', 'media', 'title', 'contents', 'link']
train.columns = colnames
test.columns = colnames

## 1-1. Wordpiece Tokenization example
- Word Piece를 활용한 토큰화

In [7]:
import re, collections
from IPython.display import display, Markdown, Latex

def get_stats(vocab):
    """Count how often each adjacent symbol pair occurs across the vocabulary.

    ARGS:
        vocab -> dict mapping a space-separated symbol string (e.g. 'l o w </w>')
                 to its frequency

    RETURNS:
        defaultdict(int) keyed by (left_symbol, right_symbol) tuples; each value is
        the frequency-weighted number of times that pair appears side by side
    """
    pair_counts = collections.defaultdict(int)
    for entry, weight in vocab.items():
        pieces = entry.split()
        # zipping the list with its one-shifted copy visits every adjacent pair once
        for left, right in zip(pieces, pieces[1:]):
            pair_counts[left, right] += weight
    return pair_counts

def merge_vocab(pair, v_in):
    '''
    Merge every standalone occurrence of a symbol pair into a single symbol.

    ARGS:
        pair -> subword pairs which were exposed most frequently
        v_in -> vocab (dict: space-separated symbol string -> frequency)

    RETURNS:
        a new dict carrying the same frequencies, keyed by the rewritten symbol strings
    '''
    merged = ''.join(pair)
    bigram = re.escape(' '.join(pair))
    # the lookarounds restrict matches to whole symbols (delimited by whitespace or string ends)
    p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')

    v_out = {}
    for word, freq in v_in.items():
        # A callable replacement is inserted literally. A plain replacement string is
        # scanned for backslash escapes, which corrupts (or rejects) symbols containing '\'.
        w_out = p.sub(lambda _match: merged, word) # e.g. 'n e w e s t' -> 'n e w es t'
        v_out[w_out] = freq
    return v_out

# Example vocab dictionary: every word is pre-split into space-separated symbols and
# ends with the '</w>' marker; the value is the word's frequency.
vocab = {
    'l o w </w>': 5,
    'l o w e r </w>': 2,
    'n e w e s t </w>': 6,
    'w i d e s t </w>': 3,
}

bpe_codes = {}          # merged pair -> iteration at which it was merged
bpe_codes_reverse = {}  # merged symbol -> the pair it was built from

num_merges = 10

for i in range(num_merges):
    # display(Markdown("### Iteration {}".format(i + 1)))
    pairs = get_stats(vocab)
    if not pairs:
        # Every word is already a single symbol, so nothing is left to merge;
        # max() on an empty mapping would raise ValueError.
        break
    best = max(pairs, key = pairs.get)
    vocab = merge_vocab(best, vocab)

    bpe_codes[best] = i
    bpe_codes_reverse[best[0] + best[1]] = best

    print("new merge: {}".format(best))
    print("dictionary: {}".format(vocab))

### Iteration 1

new merge: ('e', 's')
dictionary: {'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w es t </w>': 6, 'w i d es t </w>': 3}


### Iteration 2

new merge: ('es', 't')
dictionary: {'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w est </w>': 6, 'w i d est </w>': 3}


### Iteration 3

new merge: ('est', '</w>')
dictionary: {'l o w </w>': 5, 'l o w e r </w>': 2, 'n e w est</w>': 6, 'w i d est</w>': 3}


### Iteration 4

new merge: ('l', 'o')
dictionary: {'lo w </w>': 5, 'lo w e r </w>': 2, 'n e w est</w>': 6, 'w i d est</w>': 3}


### Iteration 5

new merge: ('lo', 'w')
dictionary: {'low </w>': 5, 'low e r </w>': 2, 'n e w est</w>': 6, 'w i d est</w>': 3}


### Iteration 6

new merge: ('n', 'e')
dictionary: {'low </w>': 5, 'low e r </w>': 2, 'ne w est</w>': 6, 'w i d est</w>': 3}


### Iteration 7

new merge: ('ne', 'w')
dictionary: {'low </w>': 5, 'low e r </w>': 2, 'new est</w>': 6, 'w i d est</w>': 3}


### Iteration 8

new merge: ('new', 'est</w>')
dictionary: {'low </w>': 5, 'low e r </w>': 2, 'newest</w>': 6, 'w i d est</w>': 3}


### Iteration 9

new merge: ('low', '</w>')
dictionary: {'low</w>': 5, 'low e r </w>': 2, 'newest</w>': 6, 'w i d est</w>': 3}


### Iteration 10

new merge: ('w', 'i')
dictionary: {'low</w>': 5, 'low e r </w>': 2, 'newest</w>': 6, 'wi d est</w>': 3}


In [8]:
bpe_codes

{('e', 's'): 0,
 ('es', 't'): 1,
 ('est', '</w>'): 2,
 ('l', 'o'): 3,
 ('lo', 'w'): 4,
 ('n', 'e'): 5,
 ('ne', 'w'): 6,
 ('new', 'est</w>'): 7,
 ('low', '</w>'): 8,
 ('w', 'i'): 9}

## 1-2. Mecab-based word extraction to build pre-trained dict
<br> 10,000개의 train data를 활용해서 n-gram dict를 만듭니다

In [16]:
from konlpy.tag import Mecab
from collections import defaultdict

mecab = Mecab()

def to_ngrams(words, n):
    """Return every contiguous window of ``n`` items from ``words`` as a tuple, in order."""
    window_count = len(words) - n + 1
    return [tuple(words[start:start + n]) for start in range(window_count)]

def ngram_extractor(docs:list, min_count=5, n_range=(1,2)):

    '''
        Count the POS-tagged n-grams that occur in a collection of documents.

        input
        docs -> iterable whose elements are strings (a list or a pandas Series both work)
        min_count -> minimum threshold for dictionary, default 5.
        n_range -> range of n-gram, default is for unigram and bigram.

        output
        ngram_counter -> counting dictionary for n-gram
                         (plain dict: n-gram tuple -> count, only counts >= min_count)
    '''

    ngram_counter = defaultdict(int) # defaultdict so unseen n-grams start counting from 0
    begin, end = n_range

    # enumerate() walks the documents in order however the container is indexed.
    # docs[idx] would be a label lookup on a pandas Series and raise (or pick another
    # row) whenever the index is not exactly 0..len(docs)-1.
    for idx, doc in enumerate(docs):

        if idx % 1000 == 0:
            print(f'processing {idx}th document...')

        unigrams = mecab.pos(doc, join=True)
        for n in range(begin, end + 1):
            for ngram in to_ngrams(unigrams, n):
                ngram_counter[ngram] += 1

    # set min_count for the size of dict
    return {
        ngram: count for ngram, count in ngram_counter.items()
        if count >= min_count
    }

# Build the n-gram dictionary for the tokenizer from the tokens of the news article bodies
ngram_dic = ngram_extractor(train['contents'])

processing 0th document...
processing 1000th document...
processing 2000th document...
processing 3000th document...
processing 4000th document...
processing 5000th document...
processing 6000th document...
processing 7000th document...
processing 8000th document...
processing 9000th document...


In [17]:
# check built vocabs: the 20 n-grams with the highest counts
sorted(ngram_dic.items(), key=lambda x:-x[1])[:20]
# Mostly particles and verbs; these carry little information.

[(('./SF',), 113157),
 (('을/JKO',), 87622),
 (('다/EF',), 79442),
 (('다/EF', './SF'), 76198),
 (('에/JKB',), 60266),
 (('의/JKG',), 59262),
 (('를/JKO',), 57213),
 (('이/JKS',), 54208),
 (('는/JX',), 44613),
 (('하/XSV',), 43819),
 (('는/ETM',), 42304),
 (('고/EC',), 42134),
 (('은/JX',), 40539),
 (('가/JKS',), 34990),
 (('으로/JKB',), 32197),
 (('이/VCP',), 28398),
 (('에서/JKB',), 27844),
 (('했/XSV+EP',), 27148),
 (('로/JKB',), 22985),
 (('도/JX',), 22290)]

In [18]:
# Walk the vocab dict in descending count order and print only the n-grams
# whose last token is '기술/NNG'
for ngram, count in sorted(ngram_dic.items(), key=lambda x:-x[1]):
    if ngram[-1] == '기술/NNG':
        print(ngram, count)

('기술/NNG',) 5112
('과학/NNG', '기술/NNG') 272
('정보/NNG', '기술/NNG') 221
('통신/NNG', '기술/NNG') 219
('는/ETM', '기술/NNG') 174
('신/XPN', '기술/NNG') 156
('의/JKG', '기술/NNG') 131
('AI/SL', '기술/NNG') 107
('첨단/NNG', '기술/NNG') 90
('한/XSA+ETM', '기술/NNG') 87
('핵심/NNG', '기술/NNG') 86
('혁신/NNG', '기술/NNG') 71
('주행/NNG', '기술/NNG') 63
('인공지능/NNP', '기술/NNG') 62
('원천/NNG', '기술/NNG') 58
('인식/NNG', '기술/NNG') 54
('관련/NNG', '기술/NNG') 53
('산업/NNG', '기술/NNG') 52
('미래/NNG', '기술/NNG') 51
('닷/NR', '기술/NNG') 50
('는/JX', '기술/NNG') 49
('새로운/VA+ETM', '기술/NNG') 46
('IT/SL', '기술/NNG') 46
('과/JC', '기술/NNG') 44
('./SF', '기술/NNG') 43
('이/MM', '기술/NNG') 41
('한/XSV+ETM', '기술/NNG') 40
('인/VCP+ETM', '기술/NNG') 39
('러닝/NNG', '기술/NNG') 38
('최고/NNG', '기술/NNG') 37
('소비자/NNG', '기술/NNG') 36
('IoT/SL', '기술/NNG') 34
('고/EC', '기술/NNG') 32
('은/JX', '기술/NNG') 32
('최신/NNG', '기술/NNG') 31
('등/NNB', '기술/NNG') 31
('된/XSV+ETM', '기술/NNG') 29
('셀/VV+ETM', '기술/NNG') 29
('VR/SL', '기술/NNG') 29
('융합/NNG', '기술/NNG') 28
('ICT/SL', '기술/NNG') 28
('은/ETM', '기술/NN

## 2-1. TF-IDF Vectorizer code implementation

- 주어진 문서들의 토큰을 column으로, 문서를 row로 하는 TFIDF-based sparse matrix 직접 만들어보기

In [19]:
docs = [
  '먹고 싶은 사과',
  '먹고 싶은 바나나',
  '길고 노란 바나나 바나나',
  '저는 과일이 좋아요'
]

In [20]:
# Sorted list of the distinct morphemes that occur anywhere in docs
vocab = sorted({token for doc in docs for token in mecab.morphs(doc)})

In [21]:
N = len(docs) # N= 4  

def tf(sent:str, token:str):
    """Term frequency: how many times ``token`` appears in the morpheme list of ``sent``."""
    morphemes = mecab.morphs(sent)
    return morphemes.count(token)

# token -> one of the words in the vocab dictionary
def idf(token:str):
    """Smoothed inverse document frequency of ``token``: log(N / (df + 1)).

    df is the number of entries in the module-level ``docs`` whose morphemes
    contain ``token``; ``N`` is the module-level document count.
    """
    doc_freq = sum(1 for doc in docs if token in mecab.morphs(doc))
    return np.log(N / (doc_freq + 1))

def tfidf(sent:str, token:str):
    """TF-IDF weight of ``token`` in ``sent``: term frequency times inverse document frequency."""
    term_freq = tf(sent, token)
    inverse_doc_freq = idf(token)
    return term_freq * inverse_doc_freq

In [22]:
# One row per document and one column per vocab token, shaped like
# [[0.0, 0.0, 0.0, 0.0, 0.287, 0.0, ... 0.0], [0.0, 0.0, 0.0], [], []]

result = [
    [tfidf(docs[row], token) for token in vocab]
    for row in range(N)
]

tfidf_ = pd.DataFrame(result, columns = vocab)
tfidf_

Unnamed: 0,고,과일,길,노란,는,먹,바나나,사과,싶,아요,은,이,저,좋
0,0.0,0.0,0.0,0.0,0.0,0.287682,0.0,0.693147,0.287682,0.0,0.287682,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.287682,0.287682,0.0,0.287682,0.0,0.287682,0.0,0.0,0.0
2,0.0,0.0,0.693147,0.693147,0.0,0.0,0.575364,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.693147,0.0,0.0,0.693147,0.0,0.0,0.0,0.0,0.693147,0.0,0.693147,0.693147,0.693147


## 2-2. Use PMI score for extracting/filtering N-grams

In [None]:
def get_ngram_score(ngram_counter, delta=30):
    """Score every multi-token n-gram by how strongly its two overlapping halves co-occur.

    score = (count - delta) / (count(ngram[:-1]) * count(ngram[1:]))

    ARGS:
        ngram_counter -> dict mapping n-gram tuples to their counts
        delta -> discount subtracted from the n-gram count, so an n-gram seen
                 ``delta`` times or fewer can never get a positive score

    RETURNS:
        dict mapping each n-gram with a positive score to a (count, score) tuple;
        unigrams and n-grams whose sub-n-gram counts are unavailable are left out
    """
    ngrams_ = {}
    for ngram, count in ngram_counter.items():

        if len(ngram) == 1:
            continue

        # .get() rather than []: a missing sub-n-gram must not raise KeyError, and on a
        # defaultdict it must not insert keys while the mapping is being iterated.
        first = ngram_counter.get(ngram[:-1], 0)
        second = ngram_counter.get(ngram[1:], 0)
        if not first or not second:
            continue # no denominator available, so the n-gram cannot be scored

        score = (count - delta) / (first * second) # consider the role of delta

        if score > 0: # keep the n-gram only when its score is positive
            ngrams_[ngram] = (count, score)

    return ngrams_

# Pass the ngram_dic built above as the argument and inspect the resulting scores
ngram_scores = get_ngram_score(ngram_dic)

In [None]:
ngram_scores

## 3. Build N-gram tokenizer
<br> Tokenize documents using the pre-trained n-gram tokenizer

In [29]:
from collections import defaultdict

class Ngram_tokenizer_MeCab():
    '''
        Tokenizer that keeps only the n-grams present in a pre-trained vocabulary.

        input
        vocab_dic -> pretrained n-gram vocab dictionary (keys are n-gram tuples)
        n_range -> (smallest n, largest n) for the n-grams to emit
    '''
    def __init__(self, vocab_dic, n_range=(1,2)):
        self.vocab_dic = vocab_dic
        self.begin, self.end = n_range

    def __call__(self, sent):
        # calling the instance is shorthand for tokenize(), so it can be passed as a plain callable
        return self.tokenize(sent)

    def tokenize(self, sent):
        '''
            input
            sent -> a string or document, which is to be tokenized

            output
            list of strings, one per in-vocabulary n-gram, with the n-gram's
            tokens joined by '-'; all n-grams of the smallest n come first
        '''
        if not sent:
            return []

        unigrams = mecab.pos(sent, join=True)

        return [
            '-'.join(ngram) # the separator keeps multi-token n-grams readable
            for n in range(self.begin, self.end + 1)
            for ngram in self._to_ngrams(unigrams, n)
        ]

    def _to_ngrams(self, tokens, n):
        '''Return the size-n windows over tokens (as tuples, in order) that exist in vocab_dic.'''
        windows = (tuple(tokens[start:start + n]) for start in range(len(tokens) - n + 1))
        return [window for window in windows if window in self.vocab_dic]

In [24]:
tokenizer = Ngram_tokenizer_MeCab(vocab_dic = ngram_dic)

In [25]:
# test case for unigram and bigram
tokenizer.tokenize('학교에서 수업을 받는다.')

['학교/NNG',
 '에서/JKB',
 '수업/NNG',
 '을/JKO',
 '받/VV',
 '는다/EF',
 './SF',
 '학교/NNG-에서/JKB',
 '수업/NNG-을/JKO',
 '을/JKO-받/VV',
 '받/VV-는다/EF',
 '는다/EF-./SF']

### add new column for title tokenizing

In [26]:
train.head(3)

Unnamed: 0,index,date,category,media,title,contents,link
0,0,20170108,IT_science,데일리안,‘효자폰’ 갤럭시S7 ‘불효폰’ G5…분위기 극과극,삼성전자 ‘갤럭시S7’ 왼쪽 LG전자 ‘G5’ 오른쪽 모델 이미지 ⓒ각 사 삼성전자...,https://news.naver.com/main/read.nhn?mode=LSD&...
1,1,20170111,IT_science,디지털타임스,카카오 O2O 플랫폼 사업 윤곽도 못잡아,11월초 공개 O2O 위드 카카오 두달 넘게 모습 드러내지 않아 후발주자로 출시까지...,https://news.naver.com/main/read.nhn?mode=LSD&...
2,2,20170103,economy,조세일보,동원 고위도高緯度 북대서양 참다랑어 어획 성공,동원산업 대표이사 이명우 은 고위도 高緯度 북대서양 참다랑어를 어획하는데 성공했다고...,https://news.naver.com/main/read.nhn?mode=LSD&...


In [27]:
# Tokenize every title with the n-gram tokenizer and keep the token lists in a new column
train['title_token'] = train['title'].apply(tokenizer.tokenize)
train.head(3)

Unnamed: 0,index,date,category,media,title,contents,link,title_token
0,0,20170108,IT_science,데일리안,‘효자폰’ 갤럭시S7 ‘불효폰’ G5…분위기 극과극,삼성전자 ‘갤럭시S7’ 왼쪽 LG전자 ‘G5’ 오른쪽 모델 이미지 ⓒ각 사 삼성전자...,https://news.naver.com/main/read.nhn?mode=LSD&...,"[‘/SY, 효자/NNG, 폰/NNG, ’/SY, 갤럭시/NNP, S/SL, 7/S..."
1,1,20170111,IT_science,디지털타임스,카카오 O2O 플랫폼 사업 윤곽도 못잡아,11월초 공개 O2O 위드 카카오 두달 넘게 모습 드러내지 않아 후발주자로 출시까지...,https://news.naver.com/main/read.nhn?mode=LSD&...,"[카카오/NNP, O/SL, 2/SN, O/SL, 플랫/NNG, 폼/NNG, 사업/..."
2,2,20170103,economy,조세일보,동원 고위도高緯度 북대서양 참다랑어 어획 성공,동원산업 대표이사 이명우 은 고위도 高緯度 북대서양 참다랑어를 어획하는데 성공했다고...,https://news.naver.com/main/read.nhn?mode=LSD&...,"[동원/NNG, 고위/NNG, 도/JX, 高/XPN, 북대서양/NNG, 참다랑어/N..."


In [28]:
print(train['title_token'][0])

['‘/SY', '효자/NNG', '폰/NNG', '’/SY', '갤럭시/NNP', 'S/SL', '7/SN', '‘/SY', '폰/NNG', '’/SY', 'G/SL', '5/SN', '…/SE', '분위기/NNG', '극/NNG', '과/JC', '극/NNG', '폰/NNG-’/SY', '갤럭시/NNP-S/SL', 'S/SL-7/SN', '폰/NNG-’/SY', 'G/SL-5/SN']


### 1. Form a document-term matrix (DTM) using scikit-learn

In [None]:
# Document-term matrix of raw token counts over the titles, tokenized with the
# custom n-gram tokenizer; lowercase=False leaves the token text untouched.
vectorizer = CountVectorizer(
    tokenizer = tokenizer,
    lowercase = False,
)
X_train = vectorizer.fit_transform(train['title'])
X_train.shape # shape noted by the author: (10000, 28962)

In [None]:
# Same setup as the count-based cell, but with TF-IDF weighting; this rebinds
# both `vectorizer` and `X_train`.
vectorizer = TfidfVectorizer(
    tokenizer = tokenizer,
    lowercase = False,
)
X_train = vectorizer.fit_transform(train['title'])
X_train.shape # shape noted by the author: (10000, 28962)

### 2. Word Piece tokenization

In [None]:
# # 설치 필요
# !pip install sentencepiece

In [30]:
import sentencepiece as spm

In [31]:
input_file = 'spm_input.txt'

# Dump the article bodies to a UTF-8 text file, each document followed by a newline
corpus = list(train['contents'])
with open(input_file, 'w', encoding='utf-8') as f:
    f.writelines('{}\n'.format(sent) for sent in corpus)

In [32]:
# Command-line style argument string handed to the SentencePiece trainer
templates = '--input={} --model_prefix={} --vocab_size={}'

vocab_size = 10000  # requested vocabulary size
prefix = '3_class_naver_news'  # later cells read '<prefix>.model' and '<prefix>.vocab'
cmd = templates.format(input_file, prefix, vocab_size)

spm.SentencePieceTrainer.Train(cmd)

In [33]:
# Create a processor and load the '<prefix>.model' file into it
sp = spm.SentencePieceProcessor()
sp.Load('{}.model'.format(prefix))

True

In [53]:
sp.EncodeAsPieces('넑')

['▁', '넑']

In [54]:
sp.EncodeAsIds('넑')

[4, 0]

In [36]:
# Read the '<prefix>.vocab' file: one whitespace-stripped entry per line
with open('{}.vocab'.format(prefix), encoding='utf-8') as f:
    vocabs = list(map(str.strip, f))

print('num of vocabs = {}'.format(len(vocabs))) # 10000

num of vocabs = 10000


In [None]:
# The cell previously held only the incomplete expression `sp.`, which is a
# SyntaxError and stops Restart & Run All. Show the loaded model's vocabulary size instead.
sp.GetPieceSize()

In [None]:
# NOTE(review): this cell repeats the TF-IDF cell from the DTM section and still passes
# the MeCab n-gram `tokenizer`; the SentencePiece model `sp` trained in this section is
# not used here. If word-piece features were intended, the tokenizer argument would
# presumably be sp.EncodeAsPieces — confirm.
vectorizer = TfidfVectorizer(
    tokenizer = tokenizer,
    lowercase = False,
)
X_train = vectorizer.fit_transform(train['title'])
X_train.shape # shape comment carried over from the earlier cell: (10000, 28962)

### Logistic Regression and its coefficient

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings, fitted on the vectorized titles
classifier = LogisticRegression()

y_train = train['category']  # target labels: the news category column
classifier.fit(X_train, y_train)

In [None]:
# (feature index, coefficient) pairs for each of the first three rows of coef_.
# NOTE(review): rows 0/1/2 are assumed to be the IT, economy and politics classes —
# confirm against classifier.classes_.
IT_idxs_coef, eco_idxs_coef, poli_idxs_coef = (
    list(enumerate(classifier.coef_[row])) for row in range(3)
)

# For every class keep the 50 features with the largest coefficients, largest first
IT_idxs, eco_idxs, poli_idxs = (
    sorted(pairs, key=lambda x:-x[1])[:50]
    for pairs in (IT_idxs_coef, eco_idxs_coef, poli_idxs_coef)
)

# Invert the vectorizer's token -> column mapping into a list indexed by column
vocab2idx = vectorizer.vocabulary_
idx2vocab = sorted(vocab2idx, key=vocab2idx.get)

In [None]:
# Swap each feature index for its token text.
# NOTE: this rebinds IT_idxs, so the cell cannot be run twice in a row — the second
# run would index idx2vocab with tokens instead of integer positions.
IT_idxs = [(idx2vocab[idx], coef) for (idx, coef) in IT_idxs]
IT_idxs

In [None]:
# Swap each feature index for its token text.
# NOTE: this rebinds eco_idxs, so the cell cannot be run twice in a row — the second
# run would index idx2vocab with tokens instead of integer positions.
eco_idxs = [(idx2vocab[idx], coef) for (idx, coef) in eco_idxs]
eco_idxs

In [None]:
# Swap each feature index for its token text.
# NOTE: this rebinds poli_idxs, so the cell cannot be run twice in a row — the second
# run would index idx2vocab with tokens instead of integer positions.
poli_idxs = [(idx2vocab[idx], coef) for (idx, coef) in poli_idxs]
poli_idxs

### test

In [None]:
# Vectorize the test titles with the already-fitted vectorizer (transform only,
# no refit) and predict their categories
X_test = vectorizer.transform(test['title'])
y_pred = classifier.predict(X_test)

In [None]:
# Compare the predictions with the true test categories using the weighted-average F1 score
y_true = test['category']
from sklearn.metrics import f1_score
f1_score(y_true, y_pred, average = 'weighted')

In [None]:
# TODO: plot a confusion matrix and see which categories are classified well
# What more do we need to do to improve the classifier's performance?