In [1]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups

In [2]:
# Load the 20 Newsgroups data with headers, footers and quotes stripped
dataset = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=('headers', 'footers', 'quotes'),
)
documents = dataset.data

In [11]:
# Preprocessing: derive a cleaned text column from the raw documents.
news_df = pd.DataFrame({'documents': documents})
# Replace every non-letter character with a space. regex=True must be explicit:
# pandas >= 2.0 defaults to regex=False, which would search for the literal
# text '[^a-zA-Z]' and leave the documents unchanged.
news_df['clean_doc'] = news_df['documents'].str.replace('[^a-zA-Z]', ' ', regex=True)
# Drop words of three characters or fewer.
news_df['clean_doc'] = news_df['clean_doc'].apply(
    lambda text: ' '.join(word for word in text.split() if len(word) > 3)
)
# Lowercase what remains.
news_df['clean_doc'] = news_df['clean_doc'].str.lower()

  news_df['clean_doc'] = news_df['documents'].str.replace('[^a-zA-Z]', ' ') # 특수문자 제거


In [12]:
# Inspect the cleaned text of the document with index label 1
news_df.loc[1, 'clean_doc']

'yeah expect people read actually accept hard atheism need little leap faith jimmy your logic runs steam sorry pity sorry that have these feelings denial about faith need well just pretend that will happily ever after anyway maybe start newsgroup atheist hard bummin much forget your flintstone chewables bake timmons'

In [13]:
# Remove stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# Membership tests against a set are O(1) per token instead of scanning the
# whole stop-word collection; stop_words itself is left exactly as returned.
stop_word_set = frozenset(stop_words)
# Split each cleaned document on whitespace and keep only non-stop-word tokens.
tokenized_doc = news_df['clean_doc'].apply(
    lambda text: [token for token in text.split() if token not in stop_word_set]
)

In [14]:
# Print the token list of the document with index label 1
print(tokenized_doc.loc[1])

['yeah', 'expect', 'people', 'read', 'actually', 'accept', 'hard', 'atheism', 'need', 'little', 'leap', 'faith', 'jimmy', 'logic', 'runs', 'steam', 'sorry', 'pity', 'sorry', 'feelings', 'denial', 'faith', 'need', 'well', 'pretend', 'happily', 'ever', 'anyway', 'maybe', 'start', 'newsgroup', 'atheist', 'hard', 'bummin', 'much', 'forget', 'flintstone', 'chewables', 'bake', 'timmons']


In [15]:
# Display the tokenized documents (pandas abbreviates the middle rows of a long Series)
tokenized_doc

0        [well, sure, story, seem, biased, disagree, st...
1        [yeah, expect, people, read, actually, accept,...
2        [although, realize, principle, strongest, poin...
3        [notwithstanding, legitimate, fuss, proposal, ...
4        [well, change, scoring, playoff, pool, unfortu...
                               ...                        
11309    [danny, rubenstein, israeli, journalist, speak...
11310                                                   []
11311    [agree, home, runs, clemens, always, memorable...
11312    [used, deskjet, orange, micros, grappler, syst...
11313    [argument, murphy, scared, hell, came, last, y...
Name: clean_doc, Length: 11314, dtype: object

In [20]:
# Preview the first five tokenized documents
tokenized_doc.head()

0    [well, sure, story, seem, biased, disagree, st...
1    [yeah, expect, people, read, actually, accept,...
2    [although, realize, principle, strongest, poin...
3    [notwithstanding, legitimate, fuss, proposal, ...
4    [well, change, scoring, playoff, pool, unfortu...
Name: clean_doc, dtype: object

In [21]:
from gensim import corpora

# Assign an integer id to every token, then encode each document as
# (token_id, count) pairs.
dictionary = corpora.Dictionary(tokenized_doc)
corpus = list(map(dictionary.doc2bow, tokenized_doc))
print(corpus[1])

[(52, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 2), (67, 1), (68, 1), (69, 1), (70, 1), (71, 2), (72, 1), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 2), (79, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (85, 2), (86, 1), (87, 1), (88, 1), (89, 1)]


In [22]:
len(dictionary) # total number of distinct tokens registered in the dictionary

64281

## gensim을 통한 LDA

In [23]:
import gensim

NUM_TOPICS = 20
RANDOM_SEED = 1  # fixed seed so the learned topics are reproducible between runs

# passes = number of full passes over the corpus during training
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
    random_state=RANDOM_SEED,
)
# print_topics shows at most 20 topics by default; request all of them
# explicitly so the listing stays complete if NUM_TOPICS is changed.
topics = ldamodel.print_topics(num_topics=NUM_TOPICS, num_words=4)
for topic in topics:
    print(topic)


(0, '0.019*"price" + 0.017*"sale" + 0.016*"bike" + 0.014*"offer"')
(1, '0.020*"game" + 0.018*"team" + 0.015*"year" + 0.014*"games"')
(2, '0.034*"space" + 0.012*"nasa" + 0.007*"data" + 0.007*"launch"')
(3, '0.021*"file" + 0.019*"output" + 0.017*"entry" + 0.011*"program"')
(4, '0.012*"colorado" + 0.010*"scorer" + 0.010*"morris" + 0.009*"nist"')
(5, '0.008*"canada" + 0.007*"compass" + 0.007*"germany" + 0.006*"value"')
(6, '0.014*"widget" + 0.009*"book" + 0.006*"client" + 0.006*"part"')
(7, '0.016*"water" + 0.010*"picture" + 0.007*"radar" + 0.006*"sleeve"')
(8, '0.021*"period" + 0.012*"power" + 0.009*"gordon" + 0.009*"pitt"')
(9, '0.038*"keyboard" + 0.032*"printer" + 0.030*"mouse" + 0.015*"print"')
(10, '0.014*"drive" + 0.010*"card" + 0.009*"system" + 0.009*"like"')
(11, '0.009*"armenian" + 0.008*"people" + 0.008*"israel" + 0.008*"armenians"')
(12, '0.016*"would" + 0.011*"people" + 0.009*"like" + 0.009*"know"')
(13, '0.015*"navy" + 0.014*"cheers" + 0.013*"kent" + 0.008*"tyre"')
(14, '0.010