# LDA Topic Modeling

In [1]:
# Tokenized Data Load
import ast
import pandas as pd

# NOTE: despite its name, `save_dir` holds the path of the CSV file that is read
# here (the name is kept because other cells may refer to it).
save_dir = '../data/joongang_accident_token_df.csv'
df = pd.read_csv(save_dir)

# The 'tokenized' column looks like a list but is read back from the CSV as a
# plain string, so parse every value into a real Python list.
# ast.literal_eval only evaluates Python literals, not arbitrary code.
df['tokenized'] = df['tokenized'].map(ast.literal_eval)

# One token list per document; this is what the dictionary/corpus cells consume.
result = df['tokenized'].tolist()

# Sanity check: show the first three tokenized documents.
for line in result[:3]:
  print(line)

['1일', '오후', '10', '시', '31', '분께', '대구', '달서구의', '한', '4층짜리', '모텔', '건물', '3층', '객실에', '서', '불이', '났다', '화재', '가', '발생', '하자', '건물', '안에', '있던', '손님', '30', '여명이', '급하게', '대피했', '으나', '불이', '시작된', '객실', '안에', '있던', '40대', '남성', 'A씨', '가', '연기를', '흡입', '해', '병원', '으로', '옮겨', '졌다', '또', '객실', '안', '침대', '등이', '불에', '탔다', '소방당국은', '소방', '차', '24대와', '소방', '관', '50', '명을', '투입해', '화재', '발생', '10', '여분', '뒤인', '오후', '10', '시', '45분께', '진화', '를', '완료했다', '대구소방본부', '관계자는', '화재', '원인', '등을', '조사', '하고', '있다', '고', '밝혔다', '연합뉴스']
['80대', '노모와', '지체', '장애', '를', '가진', '50대', '아들이', '집에서', '숨진', '채', '발견', '돼', '경찰', '이', '수사', '에', '착수했다', '1일', '서울', '강서경찰서에', '따르면', '이날', '오전', '4시께', '서울', '강서구', '의', '한', '아파트에서', '이', '집에', '사는', '80대', '여성', 'A씨', '와', '아들', '인', '50대', '남성', 'B씨', '가', '숨진', '채', '발견됐다', 'B씨', '는', 'A씨', '의', '큰아들로', '지체', '장애가', '있어', '평소', '거동이', '불편했던', '것으로', '알려졌다', '경찰', '관계자는', '모자의', '시신', '에서', '둔기', '에', '의한', '외상', '흔적', '이', '발견됐', '고', '타살', '혐의', '점이', '있다'

## Encoding & Vocab Set

In [2]:
from gensim import corpora
# Integer-encode the vocabulary: every distinct token gets an integer id.
dictionary = corpora.Dictionary(documents=result)

# Convert each document into its bag-of-words form, i.e. (token_id, count) pairs.
corpus = [dictionary.doc2bow(tokens) for tokens in result]

# Inspect the encoding of the first document.
print(corpus[0])

[(0, 3), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 2), (12, 2), (13, 1), (14, 2), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 2), (33, 1), (34, 1), (35, 1), (36, 1), (37, 2), (38, 1), (39, 2), (40, 1), (41, 1), (42, 2), (43, 1), (44, 1), (45, 2), (46, 1), (47, 1), (48, 1), (49, 1), (50, 2), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 2), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 1), (69, 3), (70, 1)]


In [3]:
# Total vocabulary size: number of entries registered in `dictionary`
len(dictionary)

291535

## Model

In [4]:
import gensim
NUM_TOPICS = 10    # number of topics to fit
RANDOM_STATE = 42  # fixed seed: LDA training is stochastic, so without it the
                   # topics change on every Restart & Run All

# Fit the LDA model on the bag-of-words corpus; `dictionary` maps the token ids
# back to words, and `passes` is the number of passes over the whole corpus.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=20,
    random_state=RANDOM_STATE,
)

In [5]:
# Collect the (topic_id, "weight*word + ...") summaries of the fitted model
# and print them one per line.
topics = ldamodel.print_topics()
print(*topics, sep='\n')

(0, '0.020*"사건" + 0.015*"DNA" + 0.009*"재심" + 0.009*"화성" + 0.007*"살인" + 0.007*"8차" + 0.007*"공소시효" + 0.006*"가석방" + 0.006*"윤씨" + 0.005*"당시"')
(1, '0.012*"를" + 0.011*"피해자" + 0.010*"을" + 0.010*"는" + 0.009*"에" + 0.009*"범죄" + 0.008*"등" + 0.007*"있다" + 0.007*"고" + 0.007*"피해"')
(2, '0.012*"등" + 0.012*"는" + 0.009*"고" + 0.009*"가" + 0.008*"해" + 0.007*"를" + 0.007*"을" + 0.007*"안전" + 0.007*"지난" + 0.006*"발생"')
(3, '0.019*"이" + 0.017*"수사" + 0.016*"검찰" + 0.014*"에" + 0.013*"는" + 0.013*"의" + 0.012*"을" + 0.012*"를" + 0.012*"전" + 0.011*"은"')
(4, '0.019*"재판" + 0.019*"이" + 0.016*"혐의" + 0.015*"전" + 0.014*"에" + 0.013*"을" + 0.012*"의" + 0.010*"고" + 0.010*"는" + 0.010*"은"')
(5, '0.011*"이" + 0.010*"연합뉴스" + 0.010*"가" + 0.010*"있다" + 0.009*"화재" + 0.009*"사고" + 0.009*"것으로" + 0.009*"한" + 0.009*"는" + 0.008*"오후"')
(6, '0.024*"고" + 0.016*"이" + 0.015*"는" + 0.011*"의" + 0.010*"가" + 0.010*"을" + 0.010*"며" + 0.009*"에" + 0.009*"한" + 0.008*"은"')
(7, '0.034*"코로나" + 0.022*"19" + 0.012*"집회" + 0.011*"서울" + 0.010*"바이러스" + 0.010*"신종" + 0.00

In [6]:
# Dictionary.filter_extremes() prunes IN PLACE and then re-numbers the remaining
# token ids. `corpus` and `ldamodel` were built from the unpruned `dictionary`
# (and the model holds it as id2word), so pruning that object here would leave
# it out of sync with both and break any later step that combines them.
# Prune a copy instead, so `dictionary` stays aligned with `corpus`/`ldamodel`.
# To actually train on the reduced vocabulary, the pruning has to be done
# before `corpus` is built and the model is fitted.
import copy

dictionary_pruned = copy.deepcopy(dictionary)
# Drop tokens that appear in fewer than 2 documents; no_above=1.0 disables the
# upper document-frequency cut-off.
# NOTE(review): filter_extremes also applies its default `keep_n` cap on the
# vocabulary size — confirm that is intended.
dictionary_pruned.filter_extremes(no_below=2, no_above=1.0)

# Size of the vocabulary that would remain after pruning.
len(dictionary_pruned)

In [7]:
# NOTE(review): newer pyLDAvis releases appear to ship this module as
# `pyLDAvis.gensim_models` — confirm against the installed version.
import pyLDAvis.gensim
# Configure pyLDAvis for rendering inside the notebook.
pyLDAvis.enable_notebook()
# Build the visualisation data from the trained model, the bag-of-words corpus
# and the dictionary.
# NOTE(review): presumably `dictionary` must still carry the token ids that
# `corpus` and `ldamodel` were built with — verify nothing re-numbers it before
# this cell runs.
vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
# Render the prepared visualisation as this cell's output.
pyLDAvis.display(vis, template_type='notebook')