# N-gram and Vectorization
* part 1. Build N-gram based Tokenizer
<br> - unigram and bigram
<br> - N-gram and Tokenization


* part 2. DTM and Vectorization
<br> - term frequency matrix
<br> - text classification with Logistic Regression

<br> <참고> 
<br> https://lovit.github.io/nlp/2018/10/23/ngram/
<br> https://soohee410.github.io/compare_tagger
<br> https://lovit.github.io/nlp/2018/03/26/from_text_to_matrix/

In [2]:
import pandas as pd
import numpy as np

import os

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [9]:
# check directory
os.listdir('./data/')

['3_class_naver_news_200819_test.csv',
 '3_class_naver_news_200819_train.csv',
 '2016_filtered_review.txt']

In [96]:
train = pd.read_csv('./data/3_class_naver_news_200819_train.csv', header=None)
test = pd.read_csv('./data/3_class_naver_news_200819_test.csv', header=None)
train.head(3)

Unnamed: 0,0,1,2,3,4,5,6
0,0,20170108,IT_science,데일리안,‘효자폰’ 갤럭시S7 ‘불효폰’ G5…분위기 극과극,삼성전자 ‘갤럭시S7’ 왼쪽 LG전자 ‘G5’ 오른쪽 모델 이미지 ⓒ각 사 삼성전자...,https://news.naver.com/main/read.nhn?mode=LSD&...
1,1,20170111,IT_science,디지털타임스,카카오 O2O 플랫폼 사업 윤곽도 못잡아,11월초 공개 O2O 위드 카카오 두달 넘게 모습 드러내지 않아 후발주자로 출시까지...,https://news.naver.com/main/read.nhn?mode=LSD&...
2,2,20170103,economy,조세일보,동원 고위도高緯度 북대서양 참다랑어 어획 성공,동원산업 대표이사 이명우 은 고위도 高緯度 북대서양 참다랑어를 어획하는데 성공했다고...,https://news.naver.com/main/read.nhn?mode=LSD&...


In [84]:
print(train.shape)
print()
print(train.info())

(10000, 7)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       10000 non-null  int64 
 1   1       10000 non-null  int64 
 2   2       10000 non-null  object
 3   3       10000 non-null  object
 4   4       10000 non-null  object
 5   5       10000 non-null  object
 6   6       10000 non-null  object
dtypes: int64(2), object(5)
memory usage: 547.0+ KB
None


In [97]:
colnames = ['index', 'date', 'category', 'media', 'title', 'contents', 'link']
train.columns = colnames
test.columns = colnames

## MeCab-based tokenization example

In [150]:
# tokenization with Mecab
from konlpy.tag import Mecab
mecab = Mecab()
print(mecab.pos(train['title'][0]))

[('‘', 'SY'), ('효자', 'NNG'), ('폰', 'NNG'), ('’', 'SY'), ('갤럭시', 'NNP'), ('S', 'SL'), ('7', 'SN'), ('‘', 'SY'), ('불효', 'NNG'), ('폰', 'NNG'), ('’', 'SY'), ('G', 'SL'), ('5', 'SN'), ('…', 'SE'), ('분위기', 'NNG'), ('극', 'NNG'), ('과', 'JC'), ('극', 'NNG')]


## 1. N-gram extraction to build pre-trained dict
<br> 10,000개의 train data를 활용해서 n-gram dict를 만듭니다

In [106]:
from collections import defaultdict

def to_ngrams(words, n):
    ngrams = []
    for idx in range(len(words) - n + 1):
        ngrams.append(tuple(words[idx:idx+n]))
    return ngrams

def ngram_extractor(docs:list, min_count=5, n_range=(1,2)):
    
    '''
        input
        docs -> list whose elements are strings
        min_count -> minimum threshold for dictionary, default 5.
        n_range -> range of n-gram, default is for unigram and bigram.
        
        output
        ngram_counter -> counting dictionary for n-gram
    '''

    ngram_counter = defaultdict(int) # count를 위한 dictionary를 만들기 위해 defaultdict 사용
    begin, end = n_range
    
    for idx in range(len(docs)):
        
        if idx % 1000 == 0:
            print(f'processing {idx}th document...')
        
        unigrams = mecab.pos(docs[idx], join=True)
        for n in range(begin, end + 1):
            for ngram in to_ngrams(unigrams, n):
                ngram_counter[ngram] += 1

    # set min_count for the size of dict
    ngram_counter = {
        ngram:count for ngram, count in ngram_counter.items()
        if count >= min_count
    }
    
    return ngram_counter

ngram_dic = ngram_extractor(train['contents'])

processing 0th document...
processing 1000th document...
processing 2000th document...
processing 3000th document...
processing 4000th document...
processing 5000th document...
processing 6000th document...
processing 7000th document...
processing 8000th document...
processing 9000th document...


In [None]:
# check built vocabs
ngram_dic

## 2. Use N-gram tokenizer
<br> tokenize documents using pre-trained Ngram tokenizer

In [107]:
from collections import defaultdict

class Ngram_tokenizer_MeCab():
    '''
        input
        vocab_dic -> pretrained n-gram vocab dictionary
        n_range -> range for n in n-gram
    '''
    def __init__(self, vocab_dic, n_range=(1,2)):
        self.vocab_dic = vocab_dic
        self.begin, self.end = n_range
        
    def __call__(self, sent):
        return self.tokenize(sent)

    def tokenize(self, sent):
        '''
            input
            sent -> a string or document, which is to be tokenized
            
            output
            ngrams -> tokenized result for given string or document
        '''
        if not sent:
            return []
        
        unigrams = mecab.pos(sent, join=True)
        
        ngrams = []
        for n in range(self.begin, self.end + 1):
            for ngram in self._to_ngrams(unigrams, n):
                ngrams.append('-'.join(ngram)) # to make it clear
        return ngrams

    def _to_ngrams(self, tokens, n): # returns n-gram for given window size n
        ngrams = []
        for idx in range(len(tokens) - n + 1):
            ngram = tuple(tokens[idx:idx+n])
            if ngram in self.vocab_dic:
                ngrams.append(ngram)
        return ngrams

In [108]:
tokenizer = Ngram_tokenizer_MeCab(vocab_dic = ngram_dic)

In [151]:
# test case for unigram and bigram
tokenizer.tokenize('학교에서 수업을 받는다.')

['학교/NNG',
 '에서/JKB',
 '수업/NNG',
 '을/JKO',
 '받/VV',
 '는다/EF',
 './SF',
 '학교/NNG-에서/JKB',
 '수업/NNG-을/JKO',
 '을/JKO-받/VV',
 '받/VV-는다/EF',
 '는다/EF-./SF']

### add new column for title tokenizing

In [110]:
train.head(3)

Unnamed: 0,index,date,category,media,title,contents,link,title_token
0,0,20170108,IT_science,데일리안,‘효자폰’ 갤럭시S7 ‘불효폰’ G5…분위기 극과극,삼성전자 ‘갤럭시S7’ 왼쪽 LG전자 ‘G5’ 오른쪽 모델 이미지 ⓒ각 사 삼성전자...,https://news.naver.com/main/read.nhn?mode=LSD&...,"[‘, 효자, 폰, ’, 갤럭시, S, 7, ‘, 폰, ’, G, 5, …, 분위기..."
1,1,20170111,IT_science,디지털타임스,카카오 O2O 플랫폼 사업 윤곽도 못잡아,11월초 공개 O2O 위드 카카오 두달 넘게 모습 드러내지 않아 후발주자로 출시까지...,https://news.naver.com/main/read.nhn?mode=LSD&...,"[카카오, O, 2, O, 플랫, 폼, 사업, 윤곽, 도, 못, 잡, 아, O-2,..."
2,2,20170103,economy,조세일보,동원 고위도高緯度 북대서양 참다랑어 어획 성공,동원산업 대표이사 이명우 은 고위도 高緯度 북대서양 참다랑어를 어획하는데 성공했다고...,https://news.naver.com/main/read.nhn?mode=LSD&...,"[동원, 고위, 도, 高, 북대서양, 참다랑어, 어획, 성공, 고위-도, 북대서양-..."


In [111]:
train['title_token'] = train['title'].apply(lambda sent: tokenizer.tokenize(sent))
train.head(3)

Unnamed: 0,index,date,category,media,title,contents,link,title_token
0,0,20170108,IT_science,데일리안,‘효자폰’ 갤럭시S7 ‘불효폰’ G5…분위기 극과극,삼성전자 ‘갤럭시S7’ 왼쪽 LG전자 ‘G5’ 오른쪽 모델 이미지 ⓒ각 사 삼성전자...,https://news.naver.com/main/read.nhn?mode=LSD&...,"[‘/SY, 효자/NNG, 폰/NNG, ’/SY, 갤럭시/NNP, S/SL, 7/S..."
1,1,20170111,IT_science,디지털타임스,카카오 O2O 플랫폼 사업 윤곽도 못잡아,11월초 공개 O2O 위드 카카오 두달 넘게 모습 드러내지 않아 후발주자로 출시까지...,https://news.naver.com/main/read.nhn?mode=LSD&...,"[카카오/NNP, O/SL, 2/SN, O/SL, 플랫/NNG, 폼/NNG, 사업/..."
2,2,20170103,economy,조세일보,동원 고위도高緯度 북대서양 참다랑어 어획 성공,동원산업 대표이사 이명우 은 고위도 高緯度 북대서양 참다랑어를 어획하는데 성공했다고...,https://news.naver.com/main/read.nhn?mode=LSD&...,"[동원/NNG, 고위/NNG, 도/JX, 高/XPN, 북대서양/NNG, 참다랑어/N..."


In [112]:
print(train['title_token'][0])

['‘/SY', '효자/NNG', '폰/NNG', '’/SY', '갤럭시/NNP', 'S/SL', '7/SN', '‘/SY', '폰/NNG', '’/SY', 'G/SL', '5/SN', '…/SE', '분위기/NNG', '극/NNG', '과/JC', '극/NNG', '폰/NNG-’/SY', '갤럭시/NNP-S/SL', 'S/SL-7/SN', '폰/NNG-’/SY', 'G/SL-5/SN']


### form DTM using scikit-learn

In [122]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(
    tokenizer = tokenizer,
    lowercase = False,
)
X_train = vectorizer.fit_transform(train['title'])
X_train.shape # (10000, 28962)

(10000, 28962)

### Logistic Regression and its coefficient

In [128]:
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression()
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [142]:
IT_idxs_coef = list(enumerate(classifier.coef_[0]))
IT_idxs = sorted(IT_idxs_coef, key=lambda x:-x[1])[:50]

eco_idxs_coef = list(enumerate(classifier.coef_[1]))
eco_idxs = sorted(eco_idxs_coef, key=lambda x:-x[1])[:50]

poli_idxs_coef = list(enumerate(classifier.coef_[2]))
poli_idxs = sorted(poli_idxs_coef, key=lambda x:-x[1])[:50]

vocab2idx = vectorizer.vocabulary_
idx2vocab = list(sorted(vocab2idx, key=lambda x:vocab2idx[x]))

In [143]:
IT_idxs = [(idx2vocab[idx], coef) for (idx, coef) in IT_idxs]
IT_idxs

[('KT/SL', 1.5979724197041187),
 ('SW/SL', 1.450867036606296),
 ('데이터/NNG', 1.4330650600668249),
 ('네이버/NNP', 1.4241360798791471),
 ('애플/NNP', 1.380759796232405),
 ('통신/NNG', 1.2919508474257113),
 ('게임/NNG', 1.2888698870621846),
 ('IT/SL', 1.2224886996069422),
 ('안랩/NNP', 1.2092125903753634),
 ('앱/NNG', 1.2032265095844688),
 ('드론/NNP', 1.1874971262869634),
 ('구글/NNG', 1.1597853252911219),
 ('우주/NNG', 1.1101076949468858),
 ('페이스북/NNP', 1.0750114858705384),
 ('포켓몬/NNP', 1.0738674773986174),
 ('ICT/SL', 1.0567906154586622),
 ('CES/SL', 1.0281080310898072),
 ('인공지능/NNP', 1.018734340161225),
 ('갤럭시/NNP', 1.0183396858541898),
 ('스마트폰/NNP', 1.0155395712234492),
 ('정보/NNG', 1.0135350921868391),
 ('클라우드/NNP', 1.0134222728522422),
 ('VR/SL', 1.0045475364324832),
 ('방통/NNG', 0.9978756975443422),
 ('미래/NNG', 0.9961895779130883),
 ('미래/NNG-부/NNG', 0.9898880607168079),
 ('LGU/SL', 0.9726892864960273),
 ('삼성전자/NNP', 0.9693233786820062),
 ('아이폰/NNP', 0.9690464837257594),
 ('시스템/NNG', 0.968572117566186

In [141]:
eco_idxs = [(idx2vocab[idx], coef) for (idx, coef) in eco_idxs]
eco_idxs

[('사진/NNG', 1.6198393499161001),
 ('회장/NNG', 1.6178383648536612),
 ('아파트/NNG', 1.2376654858177827),
 ('보험/NNG', 1.2148156307025737),
 ('공시/NNG', 1.2125207482560671),
 ('fnRASSI/SL', 1.2040752748953176),
 ('특징/NNG', 1.2026348511675784),
 ('계란/NNG', 1.2025839755846757),
 ('임종룡/NNP', 1.1956974560825069),
 ('달걀/NNG', 1.161189849554328),
 ('코스피/NNP', 1.1539288259077243),
 ('인천/NNP', 1.1319864149934808),
 ('세트/NNG', 1.1233155323037682),
 ('분양/NNG', 1.0715134218130788),
 ('총재/NNG', 1.0335128579058572),
 ('금융/NNG', 0.9829621649844269),
 ('증시/NNG', 0.9762260238314097),
 ('부동산/NNG', 0.9600361984769104),
 ('사진/NNG-특검/NNG', 0.9514288734321491),
 ('닭/NNG', 0.9423427449140709),
 ('현대/NNP', 0.9341769679655005),
 ('제주항공/NNP', 0.8941722262070202),
 ('전용/NNP', 0.88441112537203),
 ('증권/NNG', 0.8754701358560648),
 ('롯데/NNP', 0.8415558076738103),
 ('코엑스/NNP', 0.8391749621410178),
 ('선물/NNG', 0.8314057760670115),
 ('출시/NNG', 0.8109240249934849),
 ('세일/NNG', 0.8012469885609272),
 ('수확/NNG', 0.795156929311489

In [144]:
poli_idxs = [(idx2vocab[idx], coef) for (idx, coef) in poli_idxs]
poli_idxs

[('문재인/NNP', 2.0983167282649786),
 ('北/NNG', 1.7552245666788167),
 ('정유라/NNP', 1.7028640109550852),
 ('김정은/NNP', 1.6008885508377082),
 ('추미애/NNP', 1.5832161406170369),
 ('북한/NNP', 1.5378822567953647),
 ('이재명/NNP', 1.4494558931175145),
 ('국회/NNG', 1.4352196290866066),
 ('대통령/NNG', 1.4329484455841466),
 ('촛불/NNG', 1.416429495007603),
 ('의원/NNG', 1.4124680927001934),
 ('반기문/NNP', 1.3621073057334703),
 ('청와대/NNP', 1.359797019944226),
 ('민주당/NNP', 1.3050405459881287),
 ('박지원/NNP', 1.2808542501697382),
 ('특검/NNG', 1.2664327402389928),
 ('대선/NNG', 1.2404242488997166),
 ('박원순/NNP', 1.1196879908971318),
 ('文/NNG', 1.1129003180295758),
 ('단독/NNG', 1.1070811082874243),
 ('개헌/NNG', 1.1052421237438566),
 ('김무성/NNP', 1.0586769556208582),
 ('원내대표/NNP', 1.0578238685674874),
 ('인명진/NNP', 1.0156979299591524),
 ('인사/NNG-하/XSV', 1.0059147680516107),
 ('서청원/NNP', 0.9810066662091056),
 ('유승민/NNP', 0.9803239588631933),
 ('보수/NNG', 0.9724985244033625),
 ('軍/NNG', 0.9674655454801091),
 ('탈당/NNG', 0.96149982962

### test

In [145]:
X_test = vectorizer.transform(test['title'])
y_pred = classifier.predict(X_test)

In [149]:
y_true = test['category']
from sklearn.metrics import f1_score
f1_score(y_true, y_pred, average = 'weighted')

0.8578050369874635

In [None]:
# plot confusion matrix and see which category is well-classified
# what do we need to do more to enhance its performance?