> # 우크라사태 이후 '러시아 한국 경제'로 검색된 신문기사 1456건 토픽 모델링

> ### [토픽 모델링 첨부 파일 다운로드](https://drive.google.com/file/d/1Wv6y7nsrwMO6pm86dK5nS6NfZIaEJtv1/view?usp=sharing)

> ### 필요 패키지 및 폰트 설치
>> #### 최초 실행 후 주석처리한 뒤 런타임 재시작

In [1]:
# !pip install konlpy
# !pip install wordcloud

# !pip install pyLDAvis

# !sudo apt-get install -y fonts-nanum
# !sudo fc-cache -fv
# !rm ~/.cache/matplotlib -rf

> ### 패키지 불러오기

In [2]:
import re
from datetime import datetime
from time import time
import collections

import requests, json
from bs4 import BeautifulSoup
import urllib.parse as parse

from tqdm import tqdm
import pickle
from pprint import pprint as print

import numpy as np
import pandas as pd
import sklearn as sk

from konlpy.tag import Okt, Komoran # komoran, hannanum, kkma, mecab
import gensim
from gensim.summarization import summarize

# visualization
import matplotlib as mpl
import matplotlib.pylab as plb
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
from matplotlib import rcParams
import seaborn as sns
from wordcloud import WordCloud
import pyLDAvis.gensim_models

# 브라우저에서 바로 그려지도록 
# %matplotlib inline

# 그래프에 retina display 적용
%config InlineBackend.figure_format = 'retina'

# # Colab 의 한글 폰트 설정
# plt.rc('font', family='NanumSquareRound') 

# 유니코드에서  음수 부호설정
mpl.rc('axes', unicode_minus=False)

# Seaborn 테마 설정
sns.set_theme(style="ticks", color_codes=True, font='NanumSquareRound', font_scale=2)

# 워닝 메세지 삭제
import warnings
warnings.filterwarnings(action='ignore')

# font_list = [font.name for font in fm.fontManager.ttflist]
# font_list

  from collections import Iterable


> ### 폰트 선택

In [3]:
font_list = [font.name for font in fm.fontManager.ttflist]
font_list
font_path = '/usr/share/fonts/truetype/nanum/NanumMyeongjo.ttf'

In [4]:
#@title
# 피클 사용법
# list = ['a', 'b', 'c']
# with open('list.txt', 'wb') as f:
#   pickle.dump(list, f)

# with open('list.txt', 'rb') as f:
#   data = pickle.load(f)

> ### 전처리

In [5]:
df = pd.read_csv('러시아 한국 경제_v2_2022041520220224.csv')

In [6]:
def clean_content(content):
  stripped = content[69:-2]
  cleaned = re.sub('[^가-힣ㄱ-ㅣa-zA-Z|0-9.%]', ' ', string=stripped)
  return cleaned

def clean_title(title):
  cleaned = re.sub('[^가-힣ㄱ-ㅣa-zA-Z|0-9.%]', ' ', string=title)
  return cleaned

In [7]:
df['내용'] = df['내용'].apply(clean_content)
df['제목'] = df['제목'].apply(clean_title) + '.'
df['제목내용'] = df['제목'] + ' ' + df['내용']

> ### 사용자 사전 추가 & 토큰화

In [8]:
kmr = Komoran(userdic='user_dictionary.txt')
# kmr = Komoran()

In [9]:
preprocessed = df['제목내용'].apply(kmr.nouns)

> ### 불용어 삭제

In [10]:
with open('stopwords.pickle', 'rb') as f:
  stopwords = pickle.load(f)
with open('news_lst.pickle', 'rb') as f:
  news_lst = pickle.load(f)

more_sw = [
           '정은',
           '정부',
           '일보',
           '유사점',
           '소우',
           '권위',
           '인텔리전스',
           '피켓',
           '회장',
           '결',
           '김화영',
           '벡스',
           '유승준',
           '최경영',
           '현수',
           '소프',
           '경식',
           '입찰'
           '다운로드',
           '스토어',
           '유닛',
           '극동방송',
           '석관',
           '스트리트',
           'WP',
           '평론가',
           '320',
           '승관',
]
stopwords.extend(news_lst)
stopwords.extend(more_sw)

In [11]:
# 불용어 삭제 함수
# 한글자 단어도 삭제

def sw_rmv(row):
  lst = []
  for el in row:
    if el not in stopwords and len(el) > 1:
      lst.append(el)
  return lst

In [12]:
preprocessed = preprocessed.apply(sw_rmv)

> ### 트라이그램화

In [13]:
def to_trigrams(row):
  bigram = gensim.models.Phrases(row, delimiter=b' ')
  trigram = gensim.models.Phrases(bigram[row], delimiter=b' ')  
  bigram_mod = gensim.models.phrases.Phraser(bigram)
  trigram_mod = gensim.models.phrases.Phraser(trigram)
  bigrammed = bigram_mod[row]
  trigrammed = trigram_mod[bigrammed]
  
  return trigrammed

In [14]:
preprocessed = preprocessed.apply(to_trigrams)

> ### TF*IDF 구축

In [15]:
dictionary = gensim.corpora.Dictionary(preprocessed) # 토큰별 아이디 부여

In [16]:
corpus = [dictionary.doc2bow(doc, allow_update=True) for doc in preprocessed] # BoW Corpus 생성

In [17]:
tfidf = gensim.models.TfidfModel(corpus, smartirs='ntc') # TF*IDF 생성

> ### LDA 토픽 모델링 최적 파라미터 찾기

> #### coherence(높을 수록 굿)와 perplexity(낮은 게 굿)를 보면서 fine tuning 가능

In [18]:
# import logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [19]:
font_path = '/usr/share/fonts/truetype/nanum/NanumMyeongjo.ttf'

In [21]:
# # coherence & perplexity 출력

# from gensim.models.coherencemodel import CoherenceModel

# cm = CoherenceModel(model=lda_model, corpus=tfidf[corpus], coherence='u_mass')
# coherence = cm.get_coherence()
# perplexity = lda_model.log_perplexity(tfidf[corpus])
# print(f'coherence: {coherence}')
# print(f'perplexity: {perplexity}')

> #### 최적 에폭(passes)와 토픽 갯수 찾기

In [22]:
# to find optinal passes(epochs)

from gensim.models.coherencemodel import CoherenceModel

num_topics = 4
chunksize = 2000
# passes = 20
iterations = 400
eval_every = None
random_state = 100

coherences=[]
perplexities=[]
passes_lst=[]

for i in range(10):
    if i==0:
        passes=1
    else:
        passes=i*5
    tic = time()
    lda_model = gensim.models.ldamodel.LdaModel(corpus=tfidf[corpus],
                                                id2word=dictionary,
                                                num_topics=num_topics,
                                                chunksize=chunksize,
                                                passes=passes,
                                                iterations=iterations,
                                                random_state=random_state,
                                                eval_every=eval_every,)
    print(f'epoch: {passes} {time()}')

    cm = CoherenceModel(model=lda_model, corpus=corpus, coherence='u_mass')
    coherence = cm.get_coherence()
    perplexity = lda_model.log_perplexity(corpus)
    coherences.append(coherence)
    perplexities.append(perplexity)
    print(f'Coherence: {coherence}')
    print(f'Perplexity: {perplexity}')

'epoch: 1 1650429362.2934203'
'Coherence: -1.4575189308775511'
'Perplexity: -8.706332512004902'
'epoch: 5 1650429375.6810906'
'Coherence: -3.3835107115322485'
'Perplexity: -8.361286114777043'
'epoch: 10 1650429395.3591647'
'Coherence: -8.121169538569006'
'Perplexity: -8.256776689360263'
'epoch: 15 1650429425.03309'
'Coherence: -12.230674799200683'
'Perplexity: -8.23686797113037'
'epoch: 20 1650429459.8439863'
'Coherence: -13.951188698961245'
'Perplexity: -8.232829132389897'
'epoch: 25 1650429501.9692543'
'Coherence: -12.651691071659002'
'Perplexity: -8.231720148004092'
'epoch: 30 1650429551.3682778'
'Coherence: -11.91704261708377'
'Perplexity: -8.231489511691198'
'epoch: 35 1650429608.9881346'
'Coherence: -11.912197492263626'
'Perplexity: -8.231411211199093'
'epoch: 40 1650429673.701455'
'Coherence: -11.267283818462438'
'Perplexity: -8.231398374106963'
'epoch: 45 1650429745.9438832'
'Coherence: -11.687252636162393'
'Perplexity: -8.231335019416889'


In [None]:
# 트라이그램 사용자 사전 안 쓰고 불용어는 신문사명 107개랑 324개 기초만
# 'epoch: 40 1650354702.8733625'
# 'Coherence: -7.95634932266013'
# 'Perplexity: -8.229182282166434'

In [None]:
# to find optimal num_topics

coherencesT = []
perplexitiesT = []
passes = 15

for i in range(1, 11):
    num_topics = i * 2
    tic = time()
    lda_models = gensim.models.ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=num_topics, iterations=400, passes=passes)
    print(f'num_topics: {num_topics} {time()}')

    cm = CoherenceModel(model=lda_models, corpus=corpus, coherence='u_mass')
    coherence = cm.get_coherence()
    perplexity = lda_models.log_perplexity(corpus)
    coherencesT.append(coherence)
    perplexitiesT.append(perplexity)
    print(f'coherence: {coherence}')
    print(f'perplexity: {perplexity}')

'num_topics: 2 1650429815.93547'
'coherence: -0.7978428081144611'
'perplexity: -7.504194580297352'
'num_topics: 4 1650429886.623112'
'coherence: -0.7362180822368033'
'perplexity: -7.46008241680772'
'num_topics: 6 1650429955.6614177'
'coherence: -1.010888063426319'
'perplexity: -7.409224314879416'
'num_topics: 8 1650430027.7603538'
'coherence: -1.1391722677065936'
'perplexity: -7.384213019108516'
'num_topics: 10 1650430104.7280202'
'coherence: -1.7653995923940555'
'perplexity: -7.359822972514359'
'num_topics: 12 1650430188.5990138'
'coherence: -1.679657187407787'
'perplexity: -7.374392646793748'
'num_topics: 14 1650430275.1479144'
'coherence: -2.097280650409019'
'perplexity: -7.331694255936343'
'num_topics: 16 1650430366.979203'
'coherence: -1.314148392407183'
'perplexity: -7.322826440717238'
'num_topics: 18 1650430462.1830494'
'coherence: -1.8250069126144435'
'perplexity: -7.342585371239232'
'num_topics: 20 1650430561.5900762'
'coherence: -1.828219270912293'
'perplexity: -7.32678877758

> ### LDA 토픽 모델링

> #### random_state 고정 후 사용자 사전 및 불용어 사전 업데이트 필수

In [None]:
num_topics = 4 # 토픽 수 설정
chunksize = 2000 # 한번에 처리할 row 수 설정
passes = 40 # 에폭 Number of passes through the corpus during training.
iterations = 400 # Maximum number of iterations through the corpus when inferring the topic distribution of a corpus.
eval_every = None # log 복잡도 계산 수행 유무 설정 1로 설정하면 학습이 2배로 느려짐
random_state = 100 # 재실행 시 같은 결과를 같기 위해 난수 설정

lda_model = gensim.models.ldamodel.LdaModel(corpus=tfidf[corpus],
                                            id2word=dictionary,
                                            num_topics=num_topics,
                                            chunksize=chunksize,
                                            passes=passes,
                                            iterations=iterations,
                                            random_state=random_state,
                                            eval_every=eval_every,)

In [None]:
print(lda_model.print_topics())

In [None]:
from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors

cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS'

cloud = WordCloud(stopwords=stopwords,
                  background_color='white',
                  width=2500,
                  height=1800,
                  max_words=10,
                  colormap='tab10',
                  color_func=lambda *args, **kwargs: cols[i],
                  prefer_horizontal=1.0,
                  font_path=font_path)

topics = lda_model.show_topics(formatted=False)

fig, axes = plt.subplots(2, 2, figsize=(15,15), sharex=True, sharey=True)

for i, ax in enumerate(axes.flatten()):
    fig.add_subplot(ax)
    topic_words = dict(topics[i][1])
    cloud.generate_from_frequencies(topic_words, max_font_size=300)
    plt.gca().imshow(cloud)
    plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))
    plt.gca().axis('off')


plt.subplots_adjust(wspace=0, hspace=0)
plt.axis('off')
plt.margins(x=0, y=0)
plt.tight_layout()
plt.show()

In [None]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, tfidf[corpus], dictionary=lda_model.id2word)
vis

> ### 토큰화 체크

In [None]:
kmr.nouns('김재연')

> ### user_dicttionary.txt

In [None]:
# 우크라사태
# 우크라

# 젤렌스키
# 볼로디미르 젤렌스키
# 데니소바
# 류드밀라 데니소바
# 커비
# 존 커비
# 로버츠
# 케빈 로버츠
# 펜스
# 마이클 펜스
# 아브라모비치
# 로만 아브라모비치
# 캔디
# 닉 캔디
# 크룩스
# 콜린 크룩스
# 아데예모
# 윌리 아데예모

# 루비우
# 돈바스
# 키이우
# 키예프
# 체르니히우
# 루츠크
# 이바노프란키우시크
# 하르키우
# 드니프로
# 크라마토르스크
# 자포리지아
# 미콜라이우
# 오데사
# 헤르손
# 마리우폴
# 크림반도
# 몰도바
# 벨라루스

# 정철동
# 차경식

# 방위산업
# 한화디펜스
# 한화에어로스페이스

# K-9
# K-2

# 해양수산
# 해상운임

# 대통령후보
# 대통령 후보

# 윤당선인
# 윤 당선인

# 크리스피
# 크리스피 크림

# LG컨소시엄

# 라씨로
# 라씨