# 토픽 모델링 (Topic Modeling)

토픽 모델링은 문서 집합에 숨어 있는 주제를 찾아내는 기법이다.

In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [4]:
# The eight newsgroup categories this notebook restricts the dataset to.
cats = [
    'rec.motorcycles',
    'rec.sport.baseball',
    'comp.graphics',
    'comp.windows.x',
    'talk.politics.mideast',
    'soc.religion.christian',
    'sci.electronics',
    'sci.med',
]

In [5]:
# Load only the categories listed in `cats`, asking the loader to remove the
# 'headers', 'footers' and 'quotes' parts of every post.
news_df = fetch_20newsgroups(
    subset='all',
    categories=cats,
    remove=('headers', 'footers', 'quotes'),
    random_state=0,
)

In [6]:
# LDA is applied to count-based vectors only, hence CountVectorizer.
count_vect = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=1000,
    max_df=0.95,
    min_df=2,
)

# Learn the vocabulary and build the document-term count matrix in one step.
feat_vect = count_vect.fit_transform(news_df.data)

print('CountVectorizer Shape :', feat_vect.shape)

CountVectorizer Shape : (7862, 1000)


7862개의 문서가 1000개의 피처(단어 또는 2-gram)로 구성되었음을 볼 수 있다.

In [7]:
# Build an LDA model with 8 components and a fixed seed, then fit it on the
# count matrix. fit() is left as the final expression so its return value is
# shown as the cell output.
lda = LatentDirichletAllocation(
    n_components=8,
    random_state=0,
)
lda.fit(feat_vect)

LatentDirichletAllocation(n_components=8, random_state=0)

LDA 객체는 components_ 라는 속성값을 가지게 되는데,

components_ 는 개별 토픽별로 각 word 피처가 얼마나 많이 그 토픽에 할당됐는지에 대한 수치를 가지고 있다.

높은 값일수록 해당 word 피처는 그 토픽의 중심 word가 된다.

In [8]:
# Print the shape of the fitted components_ array, then show the array itself
# as the cell output.
print(lda.components_.shape)
lda.components_

(8, 1000)


array([[3.60992018e+01, 1.35626798e+02, 2.15751867e+01, ...,
        3.02911688e+01, 8.66830093e+01, 6.79285199e+01],
       [1.25199920e-01, 1.44401815e+01, 1.25045596e-01, ...,
        1.81506995e+02, 1.25097844e-01, 9.39593286e+01],
       [3.34762663e+02, 1.25176265e-01, 1.46743299e+02, ...,
        1.25105772e-01, 3.63689741e+01, 1.25025218e-01],
       ...,
       [3.60204965e+01, 2.08640688e+01, 4.29606813e+00, ...,
        1.45056650e+01, 8.33854413e+00, 1.55690009e+01],
       [1.25128711e-01, 1.25247756e-01, 1.25005143e-01, ...,
        9.17278769e+01, 1.25177668e-01, 3.74575887e+01],
       [5.49258690e+01, 4.47009532e+00, 9.88524814e+00, ...,
        4.87048440e+01, 1.25034678e-01, 1.25074632e-01]])

In [19]:
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted feature names for every topic of a model.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight arrays (each must support ``argsort()``).
        feature_names: Sequence mapping a feature index to its name.
        no_top_words: Number of top-weighted features to print per topic.

    Returns:
        None. For each topic a ``Topic # <index>`` line is printed, followed
        by one line of space-separated feature names.
    """
    for topic_no, weights in enumerate(model.components_):
        print('Topic #', topic_no)

        # argsort() sorts ascending, so reverse it to rank feature indices by
        # descending weight, then keep only the leading entries.
        ranked = weights.argsort()[::-1][:no_top_words]

        # Map the selected indices to their names and print them on one line.
        print(' '.join(feature_names[idx] for idx in ranked))

In [20]:
# Extract the names of all features in the fitted CountVectorizer's vocabulary.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# get_feature_names_out() is preferred; the legacy method is used only when the
# newer one is unavailable (older scikit-learn releases).
if hasattr(count_vect, 'get_feature_names_out'):
    # get_feature_names_out() returns an ndarray; convert it to a list so the
    # variable has the same type the legacy method produced.
    feature_names = count_vect.get_feature_names_out().tolist()
else:
    feature_names = count_vect.get_feature_names()

In [21]:
# Number of feature names extracted from the vectorizer.
len(feature_names)

1000

In [22]:
# Peek at the first ten feature names.
feature_names[:10]

['00', '000', '01', '02', '03', '04', '05', '10', '100', '11']

In [23]:
# Show the 10 most strongly associated words for each topic.

display_topics(lda, feature_names, 10)

Topic # 0
year 10 game medical health team 12 20 disease cancer
Topic # 1
don just like know people said think time ve didn
Topic # 2
image file jpeg program gif images output format files color
Topic # 3
like know don think use does just good time book
Topic # 4
armenian israel armenians jews turkish people israeli jewish government war
Topic # 5
edu com available graphics ftp data pub motif mail widget
Topic # 6
god people jesus church believe christ does christian say think
Topic # 7
use dos thanks windows using window does display help like
