### 0. 비교분석을 위해 LDA model 만들기.

In [1]:
import pandas as pd
import numpy as np
import pickle
from pprint import pprint
import re

In [2]:
# Load the pickled data set from ./cleaned_data.pk.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open("./cleaned_data.pk", "rb") as pickle_file:
    data = pickle.load(pickle_file)

# Replace the stored index with a fresh 0..n-1 index (the old one is dropped).
data = data.reset_index(drop=True)

# Preview the first rows, then the column summary.
# info() prints its own report and returns None, so the second print also emits "None".
print(data.head())
print(data.info())

         Date   Speaker timetype   time contents
0  2021-06-14  김태환 형 17       오전   6:01      고민중
1  2021-06-14    이현직 16       오전   8:00      셤잘쳐
2  2021-06-14  김태환 형 17       오전   8:00      귀여워
3  2021-06-14  김태환 형 17       오전  10:07       시발
4  2021-06-14  김태환 형 17       오전  10:08   담배가 쓰다
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 980 entries, 0 to 979
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Date      980 non-null    object
 1   Speaker   980 non-null    object
 2   timetype  980 non-null    object
 3   time      980 non-null    object
 4   contents  980 non-null    object
dtypes: object(5)
memory usage: 38.4+ KB
None


### 2. 날짜 인덱스 설정 및 분석시기 설정하기

In [3]:
# Parse the 'Date' column into datetime values and make it the row index,
# so that rows can be selected by date range afterwards.
data = data.assign(Date=pd.to_datetime(data['Date'])).set_index('Date')
data.head()

Unnamed: 0_level_0,Speaker,timetype,time,contents
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-06-14,김태환 형 17,오전,6:01,고민중
2021-06-14,이현직 16,오전,8:00,셤잘쳐
2021-06-14,김태환 형 17,오전,8:00,귀여워
2021-06-14,김태환 형 17,오전,10:07,시발
2021-06-14,김태환 형 17,오전,10:08,담배가 쓰다


In [4]:
# For LDA, keep only the messages dated 2021-06-20 through 2021-06-29.
day20 = data.loc["2021-06-20":"2021-06-29"]
slice20 = day20["contents"].tolist()

# Split every message on whitespace to obtain one token list per message.
tokenized_data = [message.split() for message in slice20]
print(tokenized_data[:10])
print(len(tokenized_data))

[['위상', '진짜', 'takehome', '시험인데', '30점만점에', '평균이', '어떻게', '17점이나오지'], ['구글링도', '허용인데', '이걸', '던지는', '놈들이', '있네'], ['17점이면'], ['잘나온거아니냐'], ['구글이랑', '강의노트', '보면', '반', '이상은', '나오는데'], ['아', '모기한테', '엉덩이', '물림'], ['프레드릭소렌드', '썅년', '꺼지라'], ['왜', '취소함'], ['걍', '들으셈'], ['개같은련이']]
206


### 3. LDA & Dynamic Topic Model 돌리기

In [5]:
from gensim.models import ldamodel
from gensim.models import ldaseqmodel
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary, bleicorpus
from gensim.matutils import hellinger
from gensim import corpora
from tqdm import tqdm_notebook
from time import time

import os



##### dictionary와 doc2bow 만들기 (LDA에는 20일대 데이터만 사용)

In [6]:
# Token dictionary: reuse the copy saved on disk when present, otherwise build
# it from the tokenized messages, save it, and print its summary.
if os.path.exists('kakao(LDA)_dict'):
    dictionary = Dictionary.load('kakao(LDA)_dict')
else:
    dictionary = Dictionary(tokenized_data)
    # Enable the next line to drop tokens that occur too rarely or too often.
    #dictionary.filter_extremes(no_below=5, no_above=500)
    dictionary.save('kakao(LDA)_dict')
    print(dictionary)

# Bag-of-words corpus (document-term representation of the tokenized messages):
# reuse the serialized copy when present, otherwise build and serialize it.
if os.path.exists('kakao(LDA)_corpus'):
    corpus = bleicorpus.BleiCorpus('kakao(LDA)_corpus')
else:
    corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_data]
    bleicorpus.BleiCorpus.serialize('kakao(LDA)_corpus', corpus)

Dictionary(460 unique tokens: ['17점이나오지', '30점만점에', 'takehome', '시험인데', '어떻게']...)


##### Run LDA model 

In [7]:
# Use the same topic count that the DTM analysis selected as best, so that the
# two models can be compared.
NUM_TOPICS = 8

# Reuse a previously saved LDA model when one exists; otherwise train and save it.
if os.path.exists('kakao(LDA)_model'):
    lda_model = ldamodel.LdaModel.load('kakao(LDA)_model')
else:
    lda_model = ldamodel.LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=NUM_TOPICS,
        passes=10,
    )
    lda_model.save('kakao(LDA)_model')

##### Run DTM Model

In [8]:
# Use the same topic count that the DTM analysis selected as best, so that the
# two models can be compared.
NUM_TOPICS = 8

# Load a previously saved LdaSeqModel (dynamic topic model) from disk.
# NOTE(review): the "_8" suffix presumably mirrors NUM_TOPICS; confirm before changing either.
dtm_model = ldaseqmodel.LdaSeqModel.load('kakao_dtm_model_8')

##### LDA와 DTM 결과 비교해보기

In [9]:
# Display the 30 highest-weighted (term, weight) pairs of LDA topic 0.
lda_model.show_topic(topicid=0, topn=30)

[('다', 0.014916272),
 ('락원아', 0.014915749),
 ('진짜', 0.014914357),
 ('집에서', 0.014852213),
 ('이걸', 0.007897338),
 ('토플', 0.007897036),
 ('아니', 0.007896608),
 ('야', 0.007896526),
 ('이상은', 0.007896374),
 ('강의노트', 0.007896374),
 ('놈들이', 0.007896374),
 ('그럴거야', 0.007896374),
 ('시험', 0.007896373),
 ('보면', 0.00789637),
 ('던지는', 0.007896369),
 ('반', 0.007896367),
 ('끝나자마자', 0.0078963665),
 ('나오는데', 0.007896366),
 ('구글이랑', 0.007896366),
 ('허용인데', 0.007896365),
 ('구글링도', 0.007896364),
 ('좆지랄해야하는데', 0.007896363),
 ('코드', 0.007896363),
 ('있네', 0.007896362),
 ('입력해서', 0.007896359),
 ('있는', 0.007896359),
 ('책에', 0.007896358),
 ('새끼들', 0.007896349),
 ('이제', 0.007896348),
 ('그건', 0.007896346)]

In [10]:
# Display the top 30 (term, weight) pairs of DTM topic 0 at time index 0.
dtm_model.print_topic(topic=0, time=0, top_terms=30)

[('락원이', 0.043773045261198656),
 ('근데', 0.015395867894693218),
 ('난', 0.015376082669562426),
 ('그냥', 0.014980014261704736),
 ('니', 0.012296753140714817),
 ('내일', 0.012269519698708608),
 ('아', 0.012267615419440969),
 ('나', 0.012265783201230607),
 ('오', 0.012214896412748154),
 ('오늘', 0.012074870422137661),
 ('진짜', 0.009236384252761767),
 ('정훈아', 0.009229398985270667),
 ('with', 0.0092284050728445),
 ('아까', 0.009228189342547622),
 ('옷이랑', 0.009228189342547622),
 ('입는', 0.009228189342547622),
 ('좀', 0.009228189342547622),
 ('이렇게', 0.009228189342547615),
 ('홀리', 0.009228189342547615),
 ('도서관', 0.009224228878758987),
 ('추운데', 0.009224228878758987),
 ('요새', 0.009210241417696486),
 ('낫네', 0.009210241417696462),
 ('내가', 0.0038721881637046734),
 ('고민중', 0.0033887326791533635),
 ('함', 0.002692003527814055),
 ('저', 0.002692003527814055),
 ('락', 0.002692003527814055),
 ('풀약', 0.0026920035278140526),
 ('목금', 0.0026669014143137393)]

##### coherence score 계산 비교

In [11]:
# Load the corpus and dictionary saved for the DTM model so that its coherence
# score can be computed.
# The DTM corpus is bound to its own name only. Also assigning it to `corpus`
# would replace the LDA corpus created earlier, and the LDA coherence call
# would then pair the DTM corpus with the LDA dictionary.
dtm_corpus = bleicorpus.BleiCorpus('kakao(DTM)_corpus')
dtm_dictionary = Dictionary.load('kakao(DTM)_dict')

# Whitespace-tokenize every message in the full data set (not only the LDA date slice).
processing_data = [msg.split() for msg in data['contents']]

In [12]:
# Compute a 'c_v' coherence score for each model.

# LDA: scored directly from the trained model against its own tokenized texts.
lda_coherence = CoherenceModel(
    model=lda_model,
    texts=tokenized_data,
    corpus=corpus,
    dictionary=dictionary,
    coherence='c_v',
)
lda_cs = lda_coherence.get_coherence()

# DTM: scored from the topics the model reports for time index 2.
topics_dtm = dtm_model.dtm_coherence(time=2)
dtm_coherence = CoherenceModel(
    topics=topics_dtm,
    texts=processing_data,
    corpus=dtm_corpus,
    dictionary=dtm_dictionary,
    coherence='c_v',
)
dtm_cs = dtm_coherence.get_coherence()

In [13]:
# Report both coherence scores, rounded to three decimal places.
print(f"Coherence Score for LDA : {lda_cs:.3f}")
print(f"Coherence Score for DTM : {dtm_cs:.3f}")

Coherence Score for LDA : 0.535
Coherence Score for DTM : 0.757
