In [2]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Restrict the corpus to eight newsgroups: motorcycles, baseball, graphics,
# X Window System, Middle East politics, Christianity, electronics and medicine.
cats = [
    'rec.motorcycles',
    'rec.sport.baseball',
    'comp.graphics',
    'comp.windows.x',
    'talk.politics.mideast',
    'soc.religion.christian',
    'sci.electronics',
    'sci.med',
]

# Load only the categories listed in cats by passing them to the `categories`
# argument of fetch_20newsgroups(); headers, footers and quoted replies are removed.
news_df = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
    categories=cats,
    random_state=0,
)

# LDA is applied to count-based features, so a CountVectorizer is used here.
# The vocabulary is capped at 1000 unigram/bigram features.
count_vect = CountVectorizer(
    max_df=0.95,
    max_features=1000,
    min_df=2,
    stop_words='english',
    ngram_range=(1, 2),
)
feat_vect = count_vect.fit_transform(news_df.data)
print('CountVectorizer Shape:', feat_vect.shape)

CountVectorizer Shape: (7862, 1000)


In [3]:
# Build an LDA model with 8 topics (the same count as the entries in cats);
# random_state is fixed so repeated runs give the same fit.
lda = LatentDirichletAllocation(
    n_components=8,
    random_state=0,
)
# Fit on the document-term count matrix. As the last expression of the cell,
# this also displays the fitted estimator's repr.
lda.fit(feat_vect)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=8, n_jobs=1,
             n_topics=None, perp_tol=0.1, random_state=0,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)

In [4]:
# Inspect the fitted topic-word matrix: one row per topic and one column per
# vocabulary feature, i.e. (8, 1000) for this fit.
print(lda.components_.shape)
# Show the raw values; a later cell ranks each row to find the top words per topic.
lda.components_

(8, 1000)


array([[4.76415412e+00, 7.32035553e+01, 1.25197989e-01, ...,
        1.22458816e+02, 1.25230793e-01, 1.09228801e+02],
       [1.25128874e-01, 2.72275563e+00, 1.25032067e-01, ...,
        4.05483014e+00, 1.25201248e-01, 1.25936924e-01],
       [1.31387985e-01, 1.25043289e-01, 1.25490482e-01, ...,
        5.34009231e+01, 1.41931826e-01, 4.48156799e+01],
       ...,
       [1.25238487e-01, 1.25171127e-01, 1.25045244e-01, ...,
        2.98496263e+02, 1.25117657e-01, 1.25271339e-01],
       [2.76601555e+00, 3.84121246e+02, 1.25026823e-01, ...,
        1.25961528e+01, 2.20914848e+02, 4.50744569e+01],
       [4.89866042e+02, 8.38133615e+01, 1.85898507e+02, ...,
        4.91485772e+00, 7.17143691e+01, 1.48487732e+01]])

In [5]:
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    For each row of ``model.components_`` this prints a ``Topic # <index>``
    header followed by one line holding the top words separated by spaces.

    Args:
        model: Object exposing ``components_``, an iterable of 1-D arrays
            (one per topic) that support ``argsort()``.
        feature_names: Indexable collection mapping a feature column index
            to its word.
        no_top_words: Number of leading words to print for each topic.

    Returns:
        None. The result is written to standard output.
    """
    for topic_number, weights in enumerate(model.components_):
        print('Topic #', topic_number)

        # argsort() orders indices by ascending weight; reversing it puts the
        # largest weights first, and the slice keeps only the leading entries.
        ranked_indexes = weights.argsort()[::-1]
        top_words = (feature_names[idx] for idx in ranked_indexes[:no_top_words])

        # Emit the selected words as one space-separated line.
        print(' '.join(top_words))

# Retrieve the vocabulary learned by the CountVectorizer, indexed by feature column.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in favour
# of get_feature_names_out(); prefer the new method and fall back to the old one so
# the cell runs on both current and older scikit-learn installations.
if hasattr(count_vect, 'get_feature_names_out'):
    feature_names = count_vect.get_feature_names_out()
else:
    feature_names = count_vect.get_feature_names()

# Show only the 15 highest-weighted words for each topic.
display_topics(lda, feature_names, 15)


Topic # 0
just time year don said like know didn bike good years got game think going
Topic # 1
image graphics jpeg dos software thanks color data images gif does format computer pc information
Topic # 2
god jesus church christ christian people believe christians bible faith sin paul man life team
Topic # 3
medical research health disease 1993 information patients cancer number hiv study use 10 april treatment
Topic # 4
file program window use server version display available windows ftp output code sun set motif
Topic # 5
don like just know think does people way make good problem want use time ve
Topic # 6
armenian people israel armenians jews turkish jewish israeli government war said turkey arab 000 armenia
Topic # 7
edu com 00 10 dos dos 12 24 pub mail cs 11 16 ac 3d 04
