In [1]:
import pandas as pd
import re
from sklearn.datasets import fetch_20newsgroups
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [2]:
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data

print('항목 : ', dataset.keys())
print('샘플의 수 : ',len(documents))
print('카테고리 : ', dataset.target_names) # 20 categories

항목 :  dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])
샘플의 수 :  11314
카테고리 :  ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


# Data Preprocessing

In [3]:
df = pd.DataFrame({'document':documents})
print(df.head(3))

df['clean_doc'] = df['document'].apply(lambda x: re.sub('[^a-zA-Z]', ' ', x))
print(df.clean_doc[:3])

df['clean_doc'] = df['clean_doc'].apply(lambda x: ' '.join([word for word in x.split() if len(word) > 3]))
print(df.clean_doc[:3])

df['clean_doc'] = df['clean_doc'].apply(lambda x: x.lower())
print(df.clean_doc[:3])

                                            document
0  Well i'm not sure about the story nad it did s...
1  \n\n\n\n\n\n\nYeah, do you expect people to re...
2  Although I realize that principle is not one o...
0    Well i m not sure about the story nad it did s...
1           Yeah  do you expect people to read the ...
2    Although I realize that principle is not one o...
Name: clean_doc, dtype: object
0    Well sure about story seem biased What disagre...
1    Yeah expect people read actually accept hard a...
2    Although realize that principle your strongest...
Name: clean_doc, dtype: object
0    well sure about story seem biased what disagre...
1    yeah expect people read actually accept hard a...
2    although realize that principle your strongest...
Name: clean_doc, dtype: object


In [4]:
print(df['clean_doc'][1])

yeah expect people read actually accept hard atheism need little leap faith jimmy your logic runs steam sorry pity sorry that have these feelings denial about faith need well just pretend that will happily ever after anyway maybe start newsgroup atheist hard bummin much forget your flintstone chewables bake timmons


# Tokenizing 

In [13]:
import gensim
from gensim import corpora

In [6]:
stop = stopwords.words('english')
tokenized_doc = df['clean_doc'].apply(lambda x: x.split())
tokenized_doc = tokenized_doc.apply(lambda x: [word for word in x if word not in stop])

print(tokenized_doc[1])

['yeah', 'expect', 'people', 'read', 'actually', 'accept', 'hard', 'atheism', 'need', 'little', 'leap', 'faith', 'jimmy', 'logic', 'runs', 'steam', 'sorry', 'pity', 'sorry', 'feelings', 'denial', 'faith', 'need', 'well', 'pretend', 'happily', 'ever', 'anyway', 'maybe', 'start', 'newsgroup', 'atheist', 'hard', 'bummin', 'much', 'forget', 'flintstone', 'chewables', 'bake', 'timmons']


In [12]:
dictionary = corpora.Dictionary(tokenized_doc)
corpus = [dictionary.doc2bow(text) for text in tokenized_doc]

# check corpus and dictionary
print(len(dictionary), len(corpus))
print(corpus[0])
print(dictionary[22])

64281 11314
[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 2), (22, 2), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 4), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 2), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1)]
israels


# Modeling LDA

In [15]:
num_topics = 20
model = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15) # passes : the number of iterations

In [19]:
for topic in model.print_topics(num_words=4):
    print(topic)

(0, '0.013*"would" + 0.011*"like" + 0.008*"think" + 0.008*"good"')
(1, '0.012*"people" + 0.010*"would" + 0.005*"jesus" + 0.005*"many"')
(2, '0.019*"sale" + 0.017*"price" + 0.015*"shipping" + 0.014*"offer"')
(3, '0.053*"space" + 0.017*"nasa" + 0.012*"launch" + 0.011*"earth"')
(4, '0.019*"armenian" + 0.018*"turkish" + 0.014*"health" + 0.014*"armenians"')
(5, '0.033*"thanks" + 0.026*"anyone" + 0.025*"know" + 0.025*"would"')
(6, '0.014*"available" + 0.013*"information" + 0.010*"also" + 0.010*"software"')
(7, '0.010*"research" + 0.009*"center" + 0.009*"university" + 0.009*"data"')
(8, '0.011*"cubs" + 0.010*"kent" + 0.009*"compass" + 0.008*"scores"')
(9, '0.023*"ripem" + 0.018*"bits" + 0.014*"part" + 0.012*"random"')
(10, '0.022*"file" + 0.014*"program" + 0.011*"output" + 0.011*"window"')
(11, '0.015*"year" + 0.015*"chicago" + 0.014*"detroit" + 0.013*"york"')
(12, '0.019*"said" + 0.012*"people" + 0.010*"went" + 0.008*"children"')
(13, '0.024*"sound" + 0.012*"sony" + 0.012*"monitors" + 0.009*

In [22]:
for i, topic_list in enumerate(model[corpus]):
    if i==5:
        break
    print(i,'번째 문서의 topic 비율은',topic_list)

0 번째 문서의 topic 비율은 [(0, 0.116479814), (1, 0.6111767), (4, 0.12856787), (9, 0.13063347)]
1 번째 문서의 topic 비율은 [(0, 0.35182634), (1, 0.45541498), (2, 0.028739547), (7, 0.106837645), (8, 0.03742534)]
2 번째 문서의 topic 비율은 [(0, 0.1679781), (1, 0.67871153), (15, 0.13912484)]
3 번째 문서의 topic 비율은 [(0, 0.5967653), (1, 0.065252796), (15, 0.32446572)]
4 번째 문서의 topic 비율은 [(0, 0.34629565), (12, 0.30610335), (17, 0.3161044)]
