LDA 이용한 TOPIC MODELING

In [25]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_20newsgroups
news = fetch_20newsgroups(subset='all', random_state = 156)
print(len(news.target_names),news.target_names)

20 ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


* 8개 분야에서만 주제 추출

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# 뉴스분야 8개 추출
cats = ['rec.motorcycles', 'rec.sport.baseball', 'comp.graphics', 'comp.windows.x',
        'talk.politics.mideast', 'soc.religion.christian', 'sci.electronics', 'sci.med' ]

news= fetch_20newsgroups(subset='all',remove=('headers', 'footers', 'quotes'), 
                        categories=cats, random_state=0)

# counbt 기반 피처 벡터화
cnt_vect = CountVectorizer(max_df=0.95, max_features=1000, min_df=2, 
                           stop_words='english', ngram_range=(1,2))
features = cnt_vect.fit_transform(news.data)

# LDA 토픽 모델링
lda = LatentDirichletAllocation(n_components=8, random_state=0) #n_components 토픽 개수 조정 
lda.fit(features)

print(news.target_names)
print('\n')
print(lda.components_)

['comp.graphics', 'comp.windows.x', 'rec.motorcycles', 'rec.sport.baseball', 'sci.electronics', 'sci.med', 'soc.religion.christian', 'talk.politics.mideast']


[[3.60992018e+01 1.35626798e+02 2.15751867e+01 ... 3.02911688e+01
  8.66830093e+01 6.79285199e+01]
 [1.25199920e-01 1.44401815e+01 1.25045596e-01 ... 1.81506995e+02
  1.25097844e-01 9.39593286e+01]
 [3.34762663e+02 1.25176265e-01 1.46743299e+02 ... 1.25105772e-01
  3.63689741e+01 1.25025218e-01]
 ...
 [3.60204965e+01 2.08640688e+01 4.29606813e+00 ... 1.45056650e+01
  8.33854413e+00 1.55690009e+01]
 [1.25128711e-01 1.25247756e-01 1.25005143e-01 ... 9.17278769e+01
  1.25177668e-01 3.74575887e+01]
 [5.49258690e+01 4.47009532e+00 9.88524814e+00 ... 4.87048440e+01
  1.25034678e-01 1.25074632e-01]]


* 각 뉴스별 중요 단어 15개 가져오는 함수 생성

In [38]:
def display_topics(model, feature_names, no_top_words):
    topic_news20 = ["모토사이클", "야구", "그래픽스", "윈도우즈", "중동", "기독교", "전자공학", "의학"]
    
    for topic_index, topic in enumerate(model.components_):
        print('Topic #', topic_index + 1, topic_news20[topic_index])

        # 각 토픽별 word 피처 연관도 내림차순 정렬시 값들의 index 반환 .. (1)
        topic_word_indexes = topic.argsort()[::-1] # [::-1] 역순으로 정렬
        top_indexes = topic_word_indexes[:no_top_words]
        
        # (1)의 index로 피처 이름명 추출
        feature_concat = '/'.join([feature_names[i] for i in top_indexes])                
        
        print(feature_concat)
        print(" ")
        
# CountVectorizer 객체의 전체 word 명칭
feature_names = cnt_vect.get_feature_names()

# Topic별 가장 연관도가 높은 word 15개
display_topics(lda, feature_names, 15)

#sklearn 버전이 달라서 get_feature_names 지원안함 원래 결과값 아래 

AttributeError: 'CountVectorizer' object has no attribute 'get_feature_names'

Topic # 1 모토사이클
year/10/game/medical/health/team/12/20/disease/cancer/1993/games/years/patients/good
 
Topic # 2 야구
don/just/like/know/people/said/think/time/ve/didn/right/going/say/ll/way
 
Topic # 3 그래픽스
image/file/jpeg/program/gif/images/output/format/files/color/entry/00/use/bit/03
 
Topic # 4 윈도우즈
like/know/don/think/use/does/just/good/time/book/read/information/people/used/post
 
Topic # 5 중동
armenian/israel/armenians/jews/turkish/people/israeli/jewish/government/war/dos dos/turkey/arab/armenia/000
 
Topic # 6 기독교
edu/com/available/graphics/ftp/data/pub/motif/mail/widget/software/mit/information/version/sun
 
Topic # 7 전자공학
god/people/jesus/church/believe/christ/does/christian/say/think/christians/bible/faith/sin/life
 
Topic # 8 의학
use/dos/thanks/windows/using/window/does/display/help/like/problem/server/need/know/run