# 토픽 모델링 - LDA

## 20 NewsGroup 데이터 사례

In [40]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [5]:
from sklearn.datasets import fetch_20newsgroups
news = fetch_20newsgroups(
    subset='all',
    random_state=2021,
    remove=('headers', 'footers', 'quotes')
)

In [9]:
df = pd.DataFrame({'article': news.data})
df.shape

(18846, 1)

In [10]:
df.article[0][:1000]

"\nJust in case the original poster was looking for a serious answer,\nI'll supply one.\n\nYes, even when steering no hands you do something quite similar\nto countersteering.  Basically to turn left, you to a quick wiggle\nof the bike to the right first, causing a counteracting lean to\noccur to the left.  It is a lot more difficult to do on a motorcycle\nthan a bicycle though, because of the extra weight.  (Ok, so my\nmotorcycle is heavy.  Maybe yous isn't.)"

In [11]:
# 특수 문자 제거
df['article'] = df.article.str.replace('[^A-Za-z]', ' ')

In [15]:
# 소문자로 변환하고 길이가 3 이하인 단어 제거
df['article'] = df.article.apply(lambda x: ' '.join(w.lower() for w in x.split() if len(w) > 3))
df.article[0][:1000]

'just case original poster looking serious answer supply even when steering hands something quite similar countersteering basically turn left quick wiggle bike right first causing counteracting lean occur left more difficult motorcycle than bicycle though because extra weight motorcycle heavy maybe yous'

- NLTK를 통한 불용어 처리 및 단어 토큰화 

In [17]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [19]:
# 불용어 처리 및 토큰화
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
tokenized_doc = df.article.apply(lambda x: [w for w in x.split() if w not in stop_words])

In [20]:
tokenized_doc[:5]

0    [case, original, poster, looking, serious, ans...
1    [thinking, sending, magazine, idea, parody, bo...
2    [dreamed, great, judgment, morning, dawned, tr...
3    [file, bignums, ripem, last, updated, april, r...
4    [peanut, butter, definitely, favorite, think, ...
Name: article, dtype: object

## 정수 인코딩과 단어 집합 만들기 - gensim

In [21]:
from gensim import corpora
dictionary = corpora.Dictionary(tokenized_doc)

In [22]:
len(dictionary)

83145

In [25]:
corpus = [dictionary.doc2bow(text) for text in tokenized_doc]
print(corpus[0])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 2), (16, 1), (17, 1), (18, 2), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1)]


In [27]:
dictionary[0] # 딕셔너리 그룹의 인덱스 순서로 나온 횟수 표시 (0, 1) 0번인덱스, 1번 나옴

'answer'

## LDA 모델 훈련시키기

In [30]:
from gensim.models.ldamodel import LdaModel
NUM_TOPICS = 20

In [41]:
ldamodel = LdaModel(
    corpus, num_topics=NUM_TOPICS, random_state=2021,
    id2word=dictionary, passes=20
)# id2word 대상선택 passes 반복횟수 random_state 속성이 있어 고정값을 줄수 있음
topics = ldamodel.print_topics(num_words=4)   
for topic in topics:
  print(topic)

(0, '0.014*"hockey" + 0.013*"team" + 0.007*"city" + 0.007*"april"')
(1, '0.011*"bike" + 0.010*"water" + 0.009*"engine" + 0.008*"cars"')
(2, '0.005*"bosnian" + 0.005*"serbs" + 0.004*"world" + 0.004*"deleted"')
(3, '0.011*"appears" + 0.009*"candida" + 0.009*"wolverine" + 0.008*"cover"')
(4, '0.013*"israel" + 0.011*"turkish" + 0.010*"armenian" + 0.010*"jews"')
(5, '0.012*"government" + 0.008*"president" + 0.007*"public" + 0.006*"would"')
(6, '0.020*"jesus" + 0.015*"church" + 0.013*"bible" + 0.012*"christ"')
(7, '0.012*"drive" + 0.010*"would" + 0.009*"windows" + 0.009*"system"')
(8, '0.020*"space" + 0.007*"nasa" + 0.007*"research" + 0.005*"earth"')
(9, '0.020*"file" + 0.013*"window" + 0.010*"server" + 0.010*"windows"')
(10, '0.023*"mail" + 0.020*"please" + 0.014*"send" + 0.013*"list"')
(11, '0.013*"health" + 0.012*"medical" + 0.008*"disease" + 0.008*"cancer"')
(12, '0.012*"people" + 0.011*"would" + 0.007*"think" + 0.006*"believe"')
(13, '0.005*"linux" + 0.004*"yalcin" + 0.004*"onur" + 0.00

## 훈련결과 시각화

In [37]:
# 최신 버전의 pyLDAvis는 현시점('21.9)의 Colab 버전과 맞지 않음
!pip install pyLDAvis==2.1.2 > /dev/null

In [53]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
pyLDAvis.display(vis)

In [55]:
pyLDAvis.save_html(vis, 'news_group_20.html')

PC1, PC2 -> Principal Component : 주성분

## 문서 별 토픽 분포도 보기

In [43]:
for i, topic_list in enumerate(ldamodel[corpus]):
    if i==5:
        break
    print(i,'번째 문서의 topic 비율은',topic_list)

0 번째 문서의 topic 비율은 [(1, 0.21060997), (14, 0.76570576)]
1 번째 문서의 topic 비율은 [(1, 0.0484265), (3, 0.17474385), (5, 0.17966925), (7, 0.031786557), (11, 0.060401138), (12, 0.091925785), (14, 0.34559038), (15, 0.024157342), (19, 0.037320927)]
2 번째 문서의 topic 비율은 [(6, 0.36186185), (12, 0.12357302), (14, 0.45935187), (15, 0.027110968)]
3 번째 문서의 topic 비율은 [(5, 0.02760349), (7, 0.07531616), (8, 0.025203133), (9, 0.085529715), (10, 0.16337147), (11, 0.015387975), (12, 0.024777709), (14, 0.028632857), (18, 0.11858687), (19, 0.4116757)]
4 번째 문서의 topic 비율은 [(8, 0.31456077), (10, 0.06009591), (12, 0.047664586), (14, 0.5072697), (17, 0.052967172)]


In [56]:
def make_topictable_per_doc(ldamodel, corpus):
    topic_table = []

    # 몇 번째 문서인지를 의미하는 문서 번호와 해당 문서의 토픽 비중을 한 줄씩 꺼내온다.
    for i, topic_list in enumerate(ldamodel[corpus]):
        doc = topic_list[0] if ldamodel.per_word_topics else topic_list            
        doc = sorted(doc, key=lambda x: (x[1]), reverse=True)
        # 각 문서에 대해서 비중이 높은 토픽순으로 토픽을 정렬한다.
        # EX) 정렬 전 0번 문서 : (2번 토픽, 48.5%), (8번 토픽, 25%), (10번 토픽, 5%), (12번 토픽, 21.5%), 
        # Ex) 정렬 후 0번 문서 : (2번 토픽, 48.5%), (8번 토픽, 25%), (12번 토픽, 21.5%), (10번 토픽, 5%)
        # 48 > 25 > 21 > 5 순으로 정렬이 된 것.

        for j, (topic_num, prop_topic) in enumerate(doc): #  몇 번 토픽인지와 비중을 나눠서 저장한다.
            if j == 0:  # 정렬을 한 상태이므로 가장 앞에 있는 것이 가장 비중이 높은 토픽
                topic_table.append([int(topic_num), round(prop_topic,4), topic_list])
                # 가장 비중이 높은 토픽과, 가장 비중이 높은 토픽의 비중과, 전체 토픽의 비중을 저장한다.
            else:
                break

    df = pd.DataFrame(topic_table)
    return(df)

In [57]:
topictable = make_topictable_per_doc(ldamodel, corpus)
topictable = topictable.reset_index() # 문서 번호을 의미하는 열(column)로 사용하기 위해서 인덱스 열을 하나 더 만든다.
topictable.columns = ['문서 번호', '가장 비중이 높은 토픽', '가장 높은 토픽의 비중', '각 토픽의 비중']
topictable[:10]

Unnamed: 0,문서 번호,가장 비중이 높은 토픽,가장 높은 토픽의 비중,각 토픽의 비중
0,0,14,0.7657,"[(1, 0.21064316), (14, 0.7656727)]"
1,1,14,0.3456,"[(1, 0.04842794), (3, 0.17474417), (5, 0.17967..."
2,2,14,0.4594,"[(6, 0.36186194), (12, 0.12357302), (14, 0.459..."
3,3,19,0.4117,"[(5, 0.02760364), (7, 0.075316004), (8, 0.0252..."
4,4,14,0.5073,"[(8, 0.31456175), (10, 0.06006793), (12, 0.047..."
5,5,12,0.3767,"[(0, 0.014732278), (2, 0.011818055), (5, 0.064..."
6,6,7,0.428,"[(1, 0.2183159), (7, 0.4280002), (14, 0.201234..."
7,7,12,0.4393,"[(6, 0.39674354), (8, 0.023280067), (12, 0.439..."
8,8,5,0.4024,"[(0, 0.030065289), (3, 0.049999997), (5, 0.402..."
9,9,14,0.2928,"[(5, 0.07132842), (7, 0.22601058), (11, 0.1598..."


## NUM_TOPICS = 24

In [49]:
ldamodel2 = LdaModel(
    corpus, num_topics=24, random_state=2021,
    id2word=dictionary, passes=20
)# id2word 대상선택 passes 반복횟수 random_state 속성이 있어 고정값을 줄수 있음

In [50]:
topics = ldamodel2.print_topics(num_words=4)   
for topic in topics:
  print(topic)

(4, '0.024*"book" + 0.018*"books" + 0.006*"pages" + 0.006*"edition"')
(2, '0.014*"people" + 0.009*"israel" + 0.007*"would" + 0.007*"government"')
(17, '0.013*"year" + 0.011*"runs" + 0.009*"clutch" + 0.008*"average"')
(14, '0.018*"would" + 0.013*"like" + 0.009*"time" + 0.009*"think"')
(0, '0.012*"appears" + 0.009*"cover" + 0.009*"wolverine" + 0.008*"espn"')
(1, '0.015*"cancer" + 0.010*"aids" + 0.009*"vitamin" + 0.009*"doctor"')
(23, '0.017*"health" + 0.017*"medical" + 0.010*"drug" + 0.009*"research"')
(10, '0.019*"mail" + 0.015*"information" + 0.015*"please" + 0.012*"send"')
(16, '0.039*"file" + 0.024*"jpeg" + 0.018*"files" + 0.017*"format"')
(9, '0.021*"windows" + 0.013*"window" + 0.011*"file" + 0.010*"server"')
(5, '0.021*"president" + 0.013*"think" + 0.012*"going" + 0.011*"stephanopoulos"')
(20, '0.007*"state" + 0.006*"states" + 0.005*"national" + 0.005*"american"')
(13, '0.021*"entry" + 0.021*"output" + 0.014*"file" + 0.011*"jumper"')
(6, '0.029*"jesus" + 0.019*"church" + 0.017*"chr

In [52]:
vis2 = pyLDAvis.gensim.prepare(ldamodel2, corpus, dictionary)
pyLDAvis.display(vis2)

In [54]:
pyLDAvis.save_html(vis2, 'news_group_24.html')