In [1]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups

# Fetch the 20 Newsgroups posts with headers, footers and quoted replies
# stripped; shuffling is seeded so the document order is repeatable.
dataset = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=('headers', 'footers', 'quotes'),
)
documents = dataset.data
# Number of raw documents loaded.
len(documents)

11314

In [2]:
news_df = pd.DataFrame({'document': documents})
# Replace every non-alphabetic character with a space.
# regex=True must be explicit: pandas >= 2.0 defaults to regex=False, which
# would treat the pattern as a literal string and leave the text uncleaned.
news_df['clean_doc'] = news_df['document'].str.replace("[^a-zA-Z]", " ", regex=True)
# Drop short words (length 3 or less).
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
# Lowercase every remaining word.
news_df['clean_doc'] = news_df['clean_doc'].str.lower()

  news_df['clean_doc'] = news_df['document'].str.replace("[^a-zA-Z]", " ")


In [3]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')  # English stop-word list from NLTK.
# Set copy used for membership tests so each lookup is O(1) instead of a scan
# of the whole list; `stop_words` itself stays a list for any later use.
stop_word_set = set(stop_words)
# Tokenize on whitespace.
tokenized_doc = news_df['clean_doc'].apply(lambda x: x.split())
# Remove the stop words from every token list.
tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_word_set])

In [4]:
# Preview the token lists of the first five documents.
tokenized_doc.head(5)

0    [well, sure, story, seem, biased, disagree, st...
1    [yeah, expect, people, read, actually, accept,...
2    [although, realize, principle, strongest, poin...
3    [notwithstanding, legitimate, fuss, proposal, ...
4    [well, change, scoring, playoff, pool, unfortu...
Name: clean_doc, dtype: object

In [5]:
from gensim import corpora

# Build the token dictionary from the tokenized documents.
dictionary = corpora.Dictionary(tokenized_doc)
# Convert each token list to its bag-of-words form via the dictionary.
corpus = list(map(dictionary.doc2bow, tokenized_doc))
# Show the second document (the first document has index 0).
print(corpus[1])

[(52, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 2), (67, 1), (68, 1), (69, 1), (70, 1), (71, 2), (72, 1), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 2), (79, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (85, 2), (86, 1), (87, 1), (88, 1), (89, 1)]


In [6]:
import gensim
NUM_TOPICS = 20  # Number of topics, k = 20.
# Seed for LDA training; without it every run produces different topics, so
# the notebook would not reproduce its own outputs after a kernel restart.
LDA_RANDOM_STATE = 1
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
    random_state=LDA_RANDOM_STATE,
)
# Show the four highest-weighted words of each topic, one topic per line.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.021*"azerbaijan" + 0.015*"karina" + 0.012*"karabakh" + 0.012*"picture"')
(1, '0.021*"health" + 0.015*"medical" + 0.013*"guns" + 0.011*"disease"')
(2, '0.025*"armenian" + 0.024*"jews" + 0.021*"turkish" + 0.019*"armenians"')
(3, '0.014*"nist" + 0.011*"germany" + 0.010*"ncsl" + 0.008*"decenso"')
(4, '0.013*"jesus" + 0.008*"christian" + 0.008*"bible" + 0.007*"believe"')
(5, '0.013*"thanks" + 0.011*"would" + 0.011*"know" + 0.010*"drive"')
(6, '0.021*"file" + 0.013*"program" + 0.010*"files" + 0.009*"available"')
(7, '0.016*"encryption" + 0.014*"chip" + 0.012*"keys" + 0.012*"clipper"')
(8, '0.016*"cross" + 0.013*"phillies" + 0.008*"vpic" + 0.007*"steel"')
(9, '0.016*"radar" + 0.010*"detector" + 0.010*"maine" + 0.008*"reds"')
(10, '0.025*"water" + 0.016*"cover" + 0.014*"neutral" + 0.010*"copies"')
(11, '0.015*"play" + 0.013*"period" + 0.013*"hockey" + 0.012*"game"')
(12, '0.014*"space" + 0.007*"information" + 0.006*"data" + 0.006*"university"')
(13, '0.020*"said" + 0.011*"went" + 0.008*

In [7]:
# Print the topic list using the library's default sizes, spelled out
# explicitly (up to 20 topics, 10 words per topic).
print(ldamodel.print_topics(num_topics=20, num_words=10))

[(0, '0.021*"azerbaijan" + 0.015*"karina" + 0.012*"karabakh" + 0.012*"picture" + 0.012*"openwindows" + 0.011*"azeri" + 0.010*"azerbaijanis" + 0.010*"sleeve" + 0.009*"azeris" + 0.009*"bodies"'), (1, '0.021*"health" + 0.015*"medical" + 0.013*"guns" + 0.011*"disease" + 0.010*"study" + 0.009*"among" + 0.009*"patients" + 0.009*"rate" + 0.009*"drug" + 0.009*"control"'), (2, '0.025*"armenian" + 0.024*"jews" + 0.021*"turkish" + 0.019*"armenians" + 0.013*"turkey" + 0.013*"jewish" + 0.011*"greek" + 0.010*"turks" + 0.009*"muslim" + 0.009*"genocide"'), (3, '0.014*"nist" + 0.011*"germany" + 0.010*"ncsl" + 0.008*"decenso" + 0.006*"finland" + 0.006*"sword" + 0.006*"creed" + 0.006*"april" + 0.006*"promo" + 0.005*"dean"'), (4, '0.013*"jesus" + 0.008*"christian" + 0.008*"bible" + 0.007*"believe" + 0.007*"church" + 0.006*"christians" + 0.005*"true" + 0.005*"faith" + 0.005*"christ" + 0.005*"religion"'), (5, '0.013*"thanks" + 0.011*"would" + 0.011*"know" + 0.010*"drive" + 0.010*"anyone" + 0.010*"please" + 

In [10]:
# Print the topic distribution of the first five documents only.
for i, topic_list in enumerate(ldamodel[corpus]):
    if i < 5:
        print(i,'번째 문서의 topic 비율은',topic_list)
    else:
        # Five documents shown; stop iterating over the corpus.
        break

0 번째 문서의 topic 비율은 [(2, 0.3024449), (3, 0.017191047), (10, 0.017595202), (15, 0.16696566), (16, 0.4834907)]
1 번째 문서의 topic 비율은 [(4, 0.21247756), (10, 0.027626049), (11, 0.027286127), (12, 0.068444334), (14, 0.04111916), (16, 0.6045902)]
2 번째 문서의 topic 비율은 [(2, 0.02164273), (15, 0.28209135), (16, 0.6825209)]
3 번째 문서의 topic 비율은 [(4, 0.18545993), (5, 0.07903979), (7, 0.31767324), (13, 0.03735201), (14, 0.030705806), (16, 0.33881307)]
4 번째 문서의 topic 비율은 [(6, 0.070941694), (11, 0.5246291), (16, 0.37293595)]


  and should_run_async(code)


In [11]:
def make_topictable_per_doc(ldamodel, corpus):
    """Build a table holding the dominant topic of every document.

    Parameters
    ----------
    ldamodel
        Object indexed as ``ldamodel[corpus]`` to yield one topic result per
        document, and exposing a ``per_word_topics`` attribute. When that
        attribute is truthy, the first element of each result is taken as the
        list of ``(topic_num, proportion)`` pairs; otherwise the result itself
        is that list.
    corpus
        The corpus passed to ``ldamodel[...]``.

    Returns
    -------
    pandas.DataFrame
        One row per document, in corpus order, with columns ``0, 1, 2``:
        the dominant topic number, its proportion rounded to 4 decimals, and
        the full per-document result as yielded by the model. A document with
        no topics gets NaN in the first two columns so that the row position
        always equals the document position.
    """
    rows = []
    for topic_list in ldamodel[corpus]:
        doc = topic_list[0] if ldamodel.per_word_topics else topic_list
        if not doc:
            # Keep a placeholder row so the row index stays aligned with the
            # document number that callers derive from it.
            rows.append([float("nan"), float("nan"), topic_list])
            continue
        # max() returns the first pair with the highest proportion, which is
        # the same pair a stable descending sort would put first.
        topic_num, prop_topic = max(doc, key=lambda pair: pair[1])
        rows.append([int(topic_num), round(prop_topic, 4), topic_list])
    # Build the frame once; DataFrame.append was removed in pandas 2.0 and
    # growing a frame row by row is quadratic.
    return pd.DataFrame(rows, columns=[0, 1, 2])

  and should_run_async(code)


In [12]:
# Build the per-document table and turn its index into a regular column so it
# can serve as the document number.
topictable = make_topictable_per_doc(ldamodel, corpus).reset_index()
topictable.columns = [
    '문서 번호',
    '가장 비중이 높은 토픽',
    '가장 높은 토픽의 비중',
    '각 토픽의 비중',
]
# Preview the first ten documents.
topictable.head(10)

  and should_run_async(code)


Unnamed: 0,문서 번호,가장 비중이 높은 토픽,가장 높은 토픽의 비중,각 토픽의 비중
0,0,16.0,0.4835,"[(2, 0.30246454), (3, 0.01719105), (10, 0.0175..."
1,1,16.0,0.6046,"[(4, 0.2124694), (10, 0.027626049), (11, 0.027..."
2,2,16.0,0.6825,"[(2, 0.021642968), (15, 0.28210944), (16, 0.68..."
3,3,16.0,0.3388,"[(4, 0.1854665), (5, 0.079038136), (7, 0.31767..."
4,4,11.0,0.5247,"[(6, 0.07077584), (11, 0.5246811), (16, 0.3730..."
5,5,16.0,0.7216,"[(2, 0.054139797), (4, 0.12025933), (10, 0.067..."
6,6,5.0,0.3692,"[(4, 0.055864904), (5, 0.36919996), (6, 0.0912..."
7,7,16.0,0.569,"[(15, 0.38254887), (16, 0.5690251), (18, 0.034..."
8,8,16.0,0.4403,"[(1, 0.031395994), (6, 0.1339345), (14, 0.0361..."
9,9,16.0,0.3745,"[(2, 0.019394146), (5, 0.15080054), (7, 0.0597..."
