<a href="https://colab.research.google.com/github/echung2/echung2/blob/master/%ED%95%9C%EA%B5%AD%ED%98%84%EB%8C%80%EB%AC%B8%ED%95%99%EB%8D%B0%EC%9D%B4%ED%84%B0%EB%B6%84%EC%84%9D%EC%97%B0%EA%B5%AC_9%EC%A3%BC%EC%B0%A8_Ngram_%ED%86%A0%ED%94%BD_%EB%AA%A8%EB%8D%B8%EB%A7%81.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Ngram / Topic Modeling

In [None]:
# Install the Nanum fonts, refresh the system font cache and remove
# matplotlib's cache directory
!apt-get update -qq
!apt-get install fonts-nanum* -qq
!sudo fc-cache -fv
!rm ~/.cache/matplotlib -rf

# Install the Python packages used in this notebook
!pip install -U gensim kiwipiepy tomotopy nltk flashtext

In [None]:
from collections import defaultdict
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from kiwipiepy import Kiwi, Option
kiwi = Kiwi()
kiwi.prepare()

import tomotopy as tp
import sys

from tqdm.notebook import tqdm
tqdm.pandas()

import os
import matplotlib.pyplot as plt
from matplotlib import font_manager

# Register the freshly installed NanumBarunGothic file with matplotlib's font
# manager. Without this step `font_path` is unused and an already-running
# matplotlib does not know the new font until the runtime is restarted,
# so Korean labels are drawn as empty boxes.
font_path = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
if os.path.exists(font_path):  # stay best-effort when the font is not installed
    font_manager.fontManager.addfont(font_path)
plt.rc('font', family='NanumBarunGothic')

import itertools
from collections import Counter

import regex #확장된 정규표현식. 일반 정규표현식은 import re
import nltk
from nltk import collocations

from flashtext import KeywordProcessor
kp = KeywordProcessor()

from gensim.models.phrases import Phrases, Phraser

In [None]:
# Download the Lee In-jik novel data set and save it as lee.xlsx
!wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=1AY763FcPXN_iBo_sVMXHugQ8UHFvb_nN' -O lee.xlsx

In [None]:
# Load the downloaded workbook into a DataFrame and display it
df = pd.read_excel('lee.xlsx')
df

##### 형태소 분석

In [None]:
# Content-morpheme tokenizer: keeps everything except a few tag groups
def tokenize_tag(sent):
    """Split `sent` into 'form/TAG' tokens using the first Kiwi analysis.

    Morphemes whose tag starts with 'E', 'J' or 'S' are left out.
    Morphemes whose tag starts with 'V' get '다' appended to the form
    before the '/TAG' suffix is attached.
    """
    morphemes, _score = kiwi.analyze(sent)[0]  # only the first candidate is used
    tagged = []
    for form, tag, _, _ in morphemes:
        if tag.startswith(('E', 'J', 'S')):
            continue
        ending = '다' if tag.startswith('V') else ''
        tagged.append(form + ending + '/' + tag)
    return tagged

In [None]:
# Tokenize every paragraph with tokenize_tag and store the result in 'tokens'
df['tokens'] = df['paragraph'].progress_map(tokenize_tag)

In [None]:
 # 불용어 리스트
# Stop words in 'form/TAG' notation (same format tokenize_tag produces)
stopwords = {
    '이다/VCP', '하다/VV', '하다/VX', '위하다/VV', '되다/VV',
    '있다/VV', '있다/VX', '없다/VA', '않다/VX', '아니하다/VX',
}

In [None]:
# Remove stop words from every token list.
# `stopwords` is already a set, so it is used directly instead of building a
# fresh set for every single token.
df['tokens'] = df['tokens'].map(lambda toks: [w for w in toks if w not in stopwords])

In [None]:
# Show the token list stored under index label 233
df['tokens'][233]

In [None]:
# Drop the rows whose token list is empty.
# The mask is computed once; the former stand-alone filter expression had no
# effect because only the last expression of a cell is displayed.
has_tokens = df['tokens'].map(len) > 0
df = df[has_tokens]

In [None]:
# Renumber the rows from 0 after filtering; drop=True discards the old index
df = df.reset_index(drop=True)

In [None]:
# Tokenizer that keeps every morpheme, each with its tag
def tokenize_all_tag(sent):
    """Split `sent` into 'form/TAG' tokens without filtering any tag.

    Uses the first Kiwi analysis. Morphemes whose tag starts with 'V' get
    '다' appended to the form before the '/TAG' suffix is attached.
    """
    morphemes, _score = kiwi.analyze(sent)[0]  # only the first candidate is used
    tagged = []
    for form, tag, _, _ in morphemes:
        ending = '다' if tag.startswith('V') else ''
        tagged.append(form + ending + '/' + tag)
    return tagged

In [None]:
# Tokenize every paragraph keeping all morphemes and store it in 'tokens_all'
df['tokens_all'] = df['paragraph'].progress_map(tokenize_all_tag)

In [None]:
# Flatten the per-paragraph token lists into one long list of tokens
token_list = list(itertools.chain.from_iterable(df['tokens']))

### 연어(Collocation) - nltk 활용

##### Bigram

In [None]:
# Association measures (PMI etc.) used below to rank bigrams
bi_measures = nltk.collocations.BigramAssocMeasures()

In [None]:
# Build the bigram finder paragraph by paragraph. Using the flattened
# `token_list` would also count "bigrams" formed by the last token of one
# paragraph and the first token of the next one.
finder = collocations.BigramCollocationFinder.from_documents(df['tokens'])

In [None]:
finder.apply_freq_filter(10) # keep only bigrams that occur at least 10 times
finder.nbest(bi_measures.pmi, 20) # the 20 best bigrams ranked by PMI

### 학습 기반 Ngram 찾기
https://lovit.github.io/nlp/2018/10/23/ngram/

##### get_ngram_counter (학습데이터 만들기)

In [None]:
def get_ngram_counter(docs, min_count, n_range=(1,3), tokenizer=None):
    """Count token n-grams over `docs` and keep the frequent ones.

    Args:
        docs: iterable of documents; each one is passed to `tokenizer`.
        min_count: minimum total frequency an n-gram needs to be kept.
        n_range: inclusive (smallest n, largest n) pair of n-gram sizes.
        tokenizer: callable turning one document into a list of tokens.
            When omitted, the notebook's `tokenize_tag` is used.

    Returns:
        dict mapping each kept n-gram (a tuple of tokens) to its frequency.
    """
    if tokenizer is None:
        # Resolved at call time so the default keeps the original behavior.
        tokenizer = tokenize_tag

    n_begin, n_end = n_range
    ngram_counter = Counter()
    for doc in docs:
        words = tokenizer(doc)
        for n in range(n_begin, n_end + 1):
            # Every window of n consecutive tokens is one n-gram occurrence.
            ngram_counter.update(
                tuple(words[b:b + n]) for b in range(len(words) - n + 1)
            )

    return {
        ngram: count for ngram, count in ngram_counter.items()
        if count >= min_count
    }

In [None]:
# Small demonstration of how the counting works
get_ngram_counter(['조니 뎁','조니 뎁','4차 산업 혁명','산업혁명'],2) # keep n-grams seen at least 2 times

In [None]:
# Learn the n-grams that occur at least 10 times from the paragraphs.
# get_ngram_counter tokenizes each document itself, so it has to receive the
# paragraph texts: the flat `token_list` would make every single token its own
# document, and no n-gram spanning several words could ever be counted.
ngram_counter = get_ngram_counter(df['paragraph'], 10)

In [None]:
# Check the result: learned n-grams ordered from most to least frequent
sorted(ngram_counter, key=lambda x:-ngram_counter[x])

##### get_ngram_counter 기반으로 Ngram 형태소 분석기 만들기

In [None]:
class NgramTokenizer:
    """Tokenizer that emits only n-grams found in a known n-gram collection.

    `ngrams` is any container supporting `in` for tuples of tokens,
    `base_tokenizer` turns a sentence into a list of tokens, and `n_range`
    is the inclusive (smallest n, largest n) pair of n-gram sizes to emit.
    """

    def __init__(self, ngrams, base_tokenizer, n_range=(1, 3)):
        self.ngrams = ngrams
        self.base_tokenizer = base_tokenizer
        self.n_range = n_range

    def __call__(self, sent):
        """Allow the instance to be used like a plain tokenizer function."""
        return self.tokenize(sent)

    def tokenize(self, sent):
        """Return the known n-grams of `sent`, tokens joined with '-'.

        Output is ordered by n-gram size first, then by position.
        A falsy `sent` gives an empty list.
        """
        if not sent:
            return []

        words = self.base_tokenizer(sent)
        smallest, largest = self.n_range
        return [
            '-'.join(gram)
            for size in range(smallest, largest + 1)
            for gram in self._to_ngrams(words, size)
        ]

    def _to_ngrams(self, words, n):
        """Return the length-`n` windows of `words` that are known n-grams."""
        windows = (
            tuple(words[start:start + n])
            for start in range(len(words) - n + 1)
        )
        return [gram for gram in windows if gram in self.ngrams]


In [None]:
# N-gram tokenizer built from the learned n-grams, with tokenize_tag as base tokenizer
ngram_tokenizer = NgramTokenizer(ngram_counter, tokenize_tag)

In [None]:
# Output of the plain morphological tokenizer for a sample sentence
tokenize_tag('옥련은 겁이 나다')

In [None]:
# Compare with the plain tokenizer: the same sample sentence is used so the
# two outputs can be compared directly (the name was misspelled before)
ngram_tokenizer('옥련은 겁이 나다')

In [None]:
# Apply the n-gram tokenizer to every paragraph and store it in 'ngram_tokens'
df['ngram_tokens'] = df['paragraph'].progress_map(ngram_tokenizer)

In [None]:
# N-gram tokens of the first paragraph
df['ngram_tokens'][0]

In [None]:
# Number of n-gram tokens per paragraph
df['ngram_tokens'].map(lambda x:len(x))

##### ngram 학습 결과 확인

In [None]:
# Document-term matrix over the learned n-grams.
# The n-gram tokenizer expects raw text, so the vectorizer is fitted on the
# paragraphs; the flat `token_list` would turn every single token into a
# separate "document".
vectorizer = CountVectorizer(
    tokenizer = ngram_tokenizer,
    lowercase = False,
)
x = vectorizer.fit_transform(df['paragraph'])

In [None]:
# Print every learned n-gram that contains the token '겁/NNG', most frequent first
for ngram, count in sorted(ngram_counter.items(), key=lambda x:-x[1]):
    if '겁/NNG' in ngram: # n-grams that include the given token
        print(ngram, count)

##### ngram_score로 고도화

In [None]:
def get_ngram_score(ngram_counter, delta=20):
    """Score every n-gram of length >= 2 by how strongly its parts co-occur.

    For an n-gram with frequency `count` the score is
    (count - delta) / (freq(all but last token) * freq(all but first token)).

    Args:
        ngram_counter: dict mapping n-gram tuples to frequencies.
        delta: discount subtracted from the frequency; n-grams whose
            frequency does not exceed it get no positive score and are dropped.

    Returns:
        dict mapping each kept n-gram to a (count, score) tuple.
    """
    ngrams_ = {}
    for ngram, count in ngram_counter.items():
        if len(ngram) == 1:
            continue
        first = ngram_counter.get(ngram[:-1], 0)
        second = ngram_counter.get(ngram[1:], 0)
        # The score is undefined when a sub-n-gram is absent from the counter
        # (or counted as zero); skip it instead of raising KeyError or
        # ZeroDivisionError.
        if not first or not second:
            continue
        score = (count - delta) / (first * second)
        if score > 0:
            ngrams_[ngram] = (count, score)
    return ngrams_

# Score the learned n-grams with the default delta
ngram_scores = get_ngram_score(ngram_counter)

In [None]:
# Scored n-grams ordered from highest to lowest score
sorted(ngram_scores.items(), key=lambda x:-x[1][1])

In [None]:
# Look at trigrams only: keep the scored n-grams made of exactly three tokens
trigram_scores = {
    ngram:score for ngram, score in ngram_scores.items()
    if len(ngram) == 3
}

# Each value is a (count, score) tuple; order by score, highest first
sorted(trigram_scores.items(), key=lambda x:-x[1][1])

### Topic Modeling

##### 1. 신문기사 텍스트 분석 예시 (LDA)

In [None]:
# Download the BigKinds news data
# https://drive.google.com/file/d/15_N55LQM1HsjfwmlM6VWPo2stN9DmWoX/view?usp=sharing
!wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=15_N55LQM1HsjfwmlM6VWPo2stN9DmWoX' -O young_2017_2019.xlsx

In [None]:
# Articles about 'youth' published 2017-2019 (search keyword: 청년)
young = pd.read_excel("young_2017_2019.xlsx")
young.head()

In [None]:
young = young[young['분석제외 여부'].isna()] # keep only rows where this column is missing (the opposite of dropna)

In [None]:
# Drop articles that have no body text
young = young.dropna(subset=['본문'])

In [None]:
# Drop duplicated articles (same title and body)
young = young.drop_duplicates(subset=['제목','본문'])

In [None]:
# Renumber the rows; drop=True discards the old index
young = young.reset_index(drop=True)

In [None]:
# Split the comma-separated keyword string of each article into a list.
# NOTE(review): the column is overwritten in place, so this cell is presumably
# not safe to run twice on the same frame - confirm before re-running.
young['키워드'] = young['키워드'].str.split(',')
young['키워드']

In [None]:
# Initialize the LDA model
LDA = tp.LDAModel(k=10,min_df=100,tw=tp.TermWeight.IDF,rm_top=3, seed=2021) # make sure you understand these hyperparameters

In [None]:
# Add the keyword list of every article to the model as one document
for token in tqdm(young['키워드'].tolist()):
    LDA.add_doc(token)

In [None]:
# Train 20 iterations at a time, 500 iterations in total
print('Num docs:', len(LDA.docs), ', Vocab size:', LDA.num_vocabs, ', Num words:', LDA.num_words)
print('Removed top words:', LDA.removed_top_words)
print('Training...', file=sys.stderr, flush=True)
for i in range(0, 500, 20):
    LDA.train(20)
    # `i` is the iteration count before this round of 20 was run
    print('Iteration: {}\tLog-likelihood: {}'.format(i, LDA.ll_per_word))

In [None]:
# Training result
LDA.summary()

In [None]:
# Print the top 10 words of every topic
for i in range(LDA.k):
    res = LDA.get_topic_words(i, top_n=10) # top 10
    print('Topic #{}'.format(i), end='\t')
    print(', '.join(w for w, p in res))

##### 2. 신문기사 텍스트 분석 예시 (DMR)
"언론사 성향에 따른 청년 담론의 차이"

In [None]:
# Number of articles per press outlet
young['언론사'].value_counts()

In [None]:
# Add the (initially empty) press-orientation column.
# The index and the object dtype are given explicitly: a bare pd.Series() has
# an ambiguous default dtype and only fills the column through index alignment.
young['언론사_성향'] = pd.Series(index=young.index, dtype='object')

In [None]:
# Label each article by the orientation of its press outlet.
# .loc writes into `young` itself; the chained form
# young[col][mask] = value may write to a temporary copy
# (SettingWithCopyWarning) and is a silent no-op under pandas copy-on-write.
is_conservative = young['언론사'].isin(['조선일보','중앙일보','동아일보'])
is_progressive = young['언론사'].isin(['경향신문','한겨레'])
young.loc[is_conservative, '언론사_성향'] = '보수'
young.loc[is_progressive, '언론사_성향'] = '진보'

In [None]:
# Number of articles per press orientation
young['언론사_성향'].value_counts()

In [None]:
# Initialize the DMR model (10 topics, PMI term weighting, fixed seed)
DMR = tp.DMRModel(k=10,min_df=100,tw=tp.TermWeight.PMI,rm_top=3, seed=2021)

In [None]:
# {row index: {'키워드': ..., '언론사_성향': ...}} for every article
token_politics_dict = young[['키워드','언론사_성향']].to_dict('index')

In [None]:
# Add every article to the model with its press orientation as metadata.
# NOTE(review): articles from outlets outside the two labelled groups have a
# missing orientation - confirm add_doc accepts that or filter them first.
for k in tqdm(token_politics_dict.keys()):
    DMR.add_doc(token_politics_dict[k]['키워드'],token_politics_dict[k]['언론사_성향'])

In [None]:
# Prepare for training (a train call with 0 iterations)
DMR.train(0)

In [None]:
# Train 20 iterations at a time, 500 iterations in total
print('Num docs:', len(DMR.docs), ', Vocab size:', DMR.num_vocabs, ', Num words:', DMR.num_words)
print('Removed top words:', DMR.removed_top_words)
print('Training...', file=sys.stderr, flush=True)
for i in range(0, 500, 20):
    DMR.train(20)
    # `i` is the iteration count before this round of 20 was run
    print('Iteration: {}\tLog-likelihood: {}'.format(i, DMR.ll_per_word))

In [None]:
# Training result
DMR.summary()

In [None]:
# Print the top 10 words of every topic
for i in range(DMR.k):
    res = DMR.get_topic_words(i, top_n=10)
    print('Topic #{}'.format(i), end='\t')
    print(', '.join(w for w, p in res))

In [None]:
# Metadata values known to the model
DMR.metadata_dict

In [None]:
# Calculate the topic distribution for each metadata value using softmax.
# The maximum along axis 0 is subtracted before exponentiating, then each
# column is normalized so that it sums to 1.
# presumably lambdas is shaped (topics, metadata values) - verify in the tomotopy docs
probs = np.exp(DMR.lambdas - DMR.lambdas.max(axis=0))
probs /= probs.sum(axis=0)

# Print the topic proportions of every metadata value
print('명목변수 별 토픽 비율')
for f, metadata_name in enumerate(DMR.metadata_dict):
    print(metadata_name, probs[:, f], '\n')

# Grouped bar chart: one group per topic, one bar per metadata value
x = np.arange(DMR.k)
width = 1 / (DMR.f + 2)

fig, ax = plt.subplots()
for f, metadata_name in enumerate(DMR.metadata_dict):
    # each metadata value gets its own horizontal offset inside the group
    ax.bar(x + width * (f - DMR.f / 2), probs[:, f], width, label=DMR.metadata_dict[f])

ax.set_ylabel('Probabilities')
ax.set_yscale('log')
ax.set_title('명목변수에 따른 토픽 비율')
ax.set_xticks(x)
ax.set_xticklabels(['Topic #{}'.format(k) for k in range(DMR.k)],rotation=45) # rotate the x tick labels by 45 degrees
ax.legend()

fig.tight_layout()
plt.show()

##### 이인직 작품별 토픽 차이(DMR)


In [None]:
# Number of paragraphs per work
df['title'].value_counts()

In [None]:
# Merge frequent token pairs into bigram tokens
def make_bigram(sents, min_count=50):
    """Return `sents` with frequently co-occurring token pairs merged.

    Args:
        sents: list of token lists.
        min_count: `min_count` passed to gensim `Phrases`; the default of 50
            is the value that used to be hard-coded here.

    Returns:
        list of token lists, one per input document, after applying the
        trained phrase model.
    """
    bigram = Phrases(sents, min_count=min_count)
    bigram_mod = Phraser(bigram)
    return [bigram_mod[doc] for doc in sents]

In [None]:
# Apply bigram merging to the token lists of all paragraphs
sents_bigram = make_bigram(df['tokens'].to_list())

In [None]:
# Check the merged tokens: collect every token containing "_" and show the 100 most common
bigram_list = [word for sent in sents_bigram for word in sent if "_" in word]
Counter(bigram_list).most_common(100)

In [None]:
# Store the bigram-merged tokens next to their paragraphs.
# The Series is given df's own index: without it the values are aligned on a
# fresh 0..n-1 index, which silently produces NaN rows whenever df's index is
# not exactly that range (e.g. after rows were filtered out).
df['bigram'] = pd.Series(sents_bigram, index=df.index)

In [None]:
# Remove one-syllable words
# NOTE(review): hangul_1 is not used anywhere in the visible notebook.
hangul_1 = regex.compile(r'^\p{Hangul}{1}$')
# Matches a 'form/TAG' token whose form is a single Hangul character, except
# the four nouns listed in the negative lookahead (these are kept)
hangul_1_except = regex.compile(r'^(?!겁/NNG|돈/NNG|꿈/NNG|산/NNG)\p{Hangul}{1}/\w+$')
df['bigram'] = df['bigram'].map(lambda x:[w for w in x if not hangul_1_except.match(w)])

In [None]:
# Drop the rows whose bigram token list is empty
df = df[df['bigram'].map(lambda x:len(x)!=0)]

In [None]:
# {row index: {'bigram': ..., 'title': ...}} for every paragraph
token_title_dict = df[['bigram','title']].to_dict('index')

In [None]:
# Initialize the DMR model for the Lee In-jik corpus
DMR_lee = tp.DMRModel(k=10,min_df=10,tw=tp.TermWeight.PMI, seed=2021)

In [None]:
# Add every paragraph to the model with the title of its work as metadata
for k in tqdm(token_title_dict.keys()):
    DMR_lee.add_doc(token_title_dict[k]['bigram'],token_title_dict[k]['title'])

In [None]:
# Train 20 iterations at a time, 500 iterations in total
print('Num docs:', len(DMR_lee.docs), ', Vocab size:', DMR_lee.num_vocabs, ', Num words:', DMR_lee.num_words)
print('Removed top words:', DMR_lee.removed_top_words)
print('Training...', file=sys.stderr, flush=True)
for i in range(0, 500, 20):
    DMR_lee.train(20)
    # `i` is the iteration count before this round of 20 was run
    print('Iteration: {}\tLog-likelihood: {}'.format(i, DMR_lee.ll_per_word))

In [None]:
# Training result
DMR_lee.summary()

In [None]:
# Print the top 10 words of every topic
for i in range(DMR_lee.k):
    res = DMR_lee.get_topic_words(i, top_n=10)
    print('Topic #{}'.format(i), end='\t')
    print(', '.join(w for w, p in res))

In [None]:
# Metadata values (work titles) known to the model
DMR_lee.metadata_dict

In [None]:
# Calculate the topic distribution for each metadata value using softmax.
# The maximum along axis 0 is subtracted before exponentiating, then each
# column is normalized so that it sums to 1.
# presumably lambdas is shaped (topics, metadata values) - verify in the tomotopy docs
probs = np.exp(DMR_lee.lambdas - DMR_lee.lambdas.max(axis=0))
probs /= probs.sum(axis=0)

# Print the topic proportions of every metadata value
print('명목변수 별 토픽 비율')
for f, metadata_name in enumerate(DMR_lee.metadata_dict):
    print(metadata_name, probs[:, f], '\n')

# Grouped bar chart: one group per topic, one bar per metadata value
x = np.arange(DMR_lee.k)
width = 1 / (DMR_lee.f + 2)

fig, ax = plt.subplots()
for f, metadata_name in enumerate(DMR_lee.metadata_dict):
    # each metadata value gets its own horizontal offset inside the group
    ax.bar(x + width * (f - DMR_lee.f / 2), probs[:, f], width, label=DMR_lee.metadata_dict[f])

ax.set_ylabel('Probabilities')
ax.set_yscale('log')
ax.set_title('명목변수에 따른 토픽 비율')
ax.set_xticks(x)
ax.set_xticklabels(['Topic #{}'.format(k) for k in range(DMR_lee.k)],rotation=45) # rotate the x tick labels by 45 degrees
ax.legend()

fig.tight_layout()
plt.show()