LSA : DTM을 차원 축소하여 축소 차원에서 근접 단어들을 토픽으로 묶는다.
LDA : 단어가 특정 토픽에 존재할 확률과 문서에 특정 토픽이 존재할 확률을 결합확률로 추정하여 토픽을 추출한다.

1. 사용자가 토픽의 개수 k를 정한다.
2. 모든 문서의 모든 단어를 k개의 토픽 중 하나에 임의로 할당한다.
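3. 문서의 각 단어 w는 (자신을 제외한 나머지 단어의 토픽 할당은 옳다고 가정하고) 아래 두 기준에 따라 토픽이 재할당된다. 이 과정을 모든 문서의 모든 단어에 대해 수렴할 때까지 반복한다.
    - p(토픽 t | 문서 d) : 문서 d의 단어들 중 토픽 t에 해당하는 단어들의 비율
    - p(단어 w | 토픽 t) : 전체 문서에서 토픽 t에 할당된 단어들 중 단어 w가 차지하는 비율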

In [4]:
# 20 Newsgroups corpus, requested with a fixed shuffle seed and with the
# "headers", "footers" and "quotes" parts listed in `remove`.
import pandas as pd
from sklearn.datasets import fetch_20newsgroups

dataset = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=("headers", "footers", "quotes"),
)
documents = dataset.data
# Number of documents that were loaded.
len(documents)

11314

In [17]:
# Inspect the raw text of the document at index 1.
documents[1]

"\n\n\n\n\n\n\nYeah, do you expect people to read the FAQ, etc. and actually accept hard\natheism?  No, you need a little leap of faith, Jimmy.  Your logic runs out\nof steam!\n\n\n\n\n\n\n\nJim,\n\nSorry I can't pity you, Jim.  And I'm sorry that you have these feelings of\ndenial about the faith you need to get by.  Oh well, just pretend that it will\nall end happily ever after anyway.  Maybe if you start a new newsgroup,\nalt.atheist.hard, you won't be bummin' so much?\n\n\n\n\n\n\nBye-Bye, Big Jim.  Don't forget your Flintstone's Chewables!  :) \n--\nBake Timmons, III"

In [18]:
# List the newsgroup category names, then how many categories there are.
print(dataset.target_names)
print(len(dataset.target_names))

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
20


In [23]:
news_df = pd.DataFrame({'document': documents})
# Replace every non-letter character with a space.
# regex=True is passed explicitly: the pattern is a character class, and
# pandas >= 2.0 treats the pattern as a literal string by default, which
# would leave the text unchanged.
news_df['clean_doc'] = news_df['document'].str.replace(r"[^a-zA-Z]", " ", regex=True)
# Drop short words (three characters or fewer).
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w) > 3]))
# Lower-case all remaining words.
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: x.lower())
news_df

  news_df['clean_doc'] = news_df['document'].str.replace("[^a-zA-Z]", " ")


Unnamed: 0,document,clean_doc
0,Well i'm not sure about the story nad it did s...,well sure about story seem biased what disagre...
1,"\n\n\n\n\n\n\nYeah, do you expect people to re...",yeah expect people read actually accept hard a...
2,Although I realize that principle is not one o...,although realize that principle your strongest...
3,Notwithstanding all the legitimate fuss about ...,notwithstanding legitimate fuss about this pro...
4,"Well, I will have to change the scoring on my ...",well will have change scoring playoff pool unf...
...,...,...
11309,"Danny Rubenstein, an Israeli journalist, will ...",danny rubenstein israeli journalist will speak...
11310,\n,
11311,\nI agree. Home runs off Clemens are always m...,agree home runs clemens always memorable kinda...
11312,I used HP DeskJet with Orange Micros Grappler ...,used deskjet with orange micros grappler syste...


In [24]:
# Show the current cleaned text stored for the row labelled 1.
news_df["clean_doc"][1]

'yeah expect people read actually accept hard atheism need little leap faith jimmy your logic runs steam sorry pity sorry that have these feelings denial about faith need well just pretend that will happily ever after anyway maybe start newsgroup atheist hard bummin much forget your flintstone chewables bake timmons'

In [34]:
from nltk.corpus import stopwords

stop_words = stopwords.words("english")
# The membership test below runs once per token over the whole corpus, so
# look tokens up in a set instead of scanning the stop-word list each time.
stop_word_set = set(stop_words)
# Split each cleaned document on whitespace into a list of tokens.
tokenize_doc = news_df["clean_doc"].apply(lambda x: x.split())
# Keep only the tokens that are not English stop words.
tokenize_doc = tokenize_doc.apply(lambda x: [item for item in x if item not in stop_word_set])
print(tokenize_doc)

0        [well, sure, story, seem, biased, disagree, st...
1        [yeah, expect, people, read, actually, accept,...
2        [although, realize, principle, strongest, poin...
3        [notwithstanding, legitimate, fuss, proposal, ...
4        [well, change, scoring, playoff, pool, unfortu...
                               ...                        
11309    [danny, rubenstein, israeli, journalist, speak...
11310                                                   []
11311    [agree, home, runs, clemens, always, memorable...
11312    [used, deskjet, orange, micros, grappler, syst...
11313    [argument, murphy, scared, hell, came, last, y...
Name: clean_doc, Length: 11314, dtype: object


In [40]:
# Join every token list back into one space-separated string.
# Iterating the Series directly visits rows in order and avoids the
# label-based tokenize_doc[i] lookup, which is only correct while the index
# is exactly 0..n-1.
detokenized_doc = [' '.join(tokens) for tokens in tokenize_doc]

news_df["clean_doc"] = detokenized_doc
news_df

Unnamed: 0,document,clean_doc
0,Well i'm not sure about the story nad it did s...,well sure story seem biased disagree statement...
1,"\n\n\n\n\n\n\nYeah, do you expect people to re...",yeah expect people read actually accept hard a...
2,Although I realize that principle is not one o...,although realize principle strongest points wo...
3,Notwithstanding all the legitimate fuss about ...,notwithstanding legitimate fuss proposal much ...
4,"Well, I will have to change the scoring on my ...",well change scoring playoff pool unfortunately...
...,...,...
11309,"Danny Rubenstein, an Israeli journalist, will ...",danny rubenstein israeli journalist speaking t...
11310,\n,
11311,\nI agree. Home runs off Clemens are always m...,agree home runs clemens always memorable kinda...
11312,I used HP DeskJet with Orange Micros Grappler ...,used deskjet orange micros grappler system upd...


In [41]:
# Show the current cleaned text stored for the row labelled 1.
news_df["clean_doc"][1]

'yeah expect people read actually accept hard atheism need little leap faith jimmy logic runs steam sorry pity sorry feelings denial faith need well pretend happily ever anyway maybe start newsgroup atheist hard bummin much forget flintstone chewables bake timmons'

In [44]:
from gensim import corpora

# Vocabulary built from the tokenized documents.
dictionary = corpora.Dictionary(tokenize_doc)
# Each document becomes a bag of words: a list of (token id, frequency) pairs.
corpus = list(map(dictionary.doc2bow, tokenize_doc))
print(corpus[1])

[(52, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 2), (67, 1), (68, 1), (69, 1), (70, 1), (71, 2), (72, 1), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 2), (79, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (85, 2), (86, 1), (87, 1), (88, 1), (89, 1)]


In [47]:
# Look up the token stored under id 66, then report the vocabulary size.
print(dictionary[66])
print(len(dictionary))

faith
64281


In [49]:
import gensim

# Number of topics the model is asked to find.
NUM_TOPICS = 20

# Train LDA on the bag-of-words corpus, making 15 passes over it.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
)

# One line per topic, limited to the 4 highest-weighted words of each.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.013*"government" + 0.011*"people" + 0.010*"right" + 0.008*"state"')
(1, '0.014*"said" + 0.013*"armenian" + 0.013*"people" + 0.012*"armenians"')
(2, '0.009*"system" + 0.008*"encryption" + 0.008*"chip" + 0.007*"public"')
(3, '0.009*"cubs" + 0.007*"sexual" + 0.007*"homosexuality" + 0.007*"homosexual"')
(4, '0.018*"patients" + 0.012*"istanbul" + 0.009*"patient" + 0.009*"chronic"')
(5, '0.005*"used" + 0.005*"also" + 0.005*"time" + 0.005*"current"')
(6, '0.015*"year" + 0.014*"game" + 0.012*"team" + 0.010*"games"')
(7, '0.019*"drive" + 0.015*"card" + 0.014*"windows" + 0.013*"system"')
(8, '0.026*"file" + 0.017*"program" + 0.012*"window" + 0.011*"output"')
(9, '0.016*"israel" + 0.015*"jews" + 0.010*"israeli" + 0.009*"states"')
(10, '0.012*"candida" + 0.010*"borland" + 0.009*"marriage" + 0.008*"syndrome"')
(11, '0.011*"would" + 0.011*"people" + 0.008*"jesus" + 0.007*"believe"')
(12, '0.015*"president" + 0.011*"going" + 0.009*"think" + 0.009*"people"')
(13, '0.014*"available" + 0.013*"inf

In [50]:
# Print the topics again, this time with print_topics' default arguments.
print(ldamodel.print_topics())

[(0, '0.013*"government" + 0.011*"people" + 0.010*"right" + 0.008*"state" + 0.008*"rights" + 0.007*"court" + 0.007*"laws" + 0.006*"federal" + 0.006*"amendment" + 0.005*"second"'), (1, '0.014*"said" + 0.013*"armenian" + 0.013*"people" + 0.012*"armenians" + 0.009*"turkish" + 0.007*"went" + 0.007*"killed" + 0.006*"turkey" + 0.006*"children" + 0.005*"came"'), (2, '0.009*"system" + 0.008*"encryption" + 0.008*"chip" + 0.007*"public" + 0.007*"security" + 0.006*"keys" + 0.006*"government" + 0.006*"clipper" + 0.005*"would" + 0.005*"used"'), (3, '0.009*"cubs" + 0.007*"sexual" + 0.007*"homosexuality" + 0.007*"homosexual" + 0.006*"pitcher" + 0.006*"homosexuals" + 0.006*"prophecy" + 0.006*"michael" + 0.006*"ryan" + 0.006*"verses"'), (4, '0.018*"patients" + 0.012*"istanbul" + 0.009*"patient" + 0.009*"chronic" + 0.008*"acid" + 0.008*"erzurum" + 0.007*"nords" + 0.006*"physician" + 0.006*"studies" + 0.005*"treatment"'), (5, '0.005*"used" + 0.005*"also" + 0.005*"time" + 0.005*"current" + 0.004*"image" +

In [51]:
import pyLDAvis

# pyLDAvis 3.x renamed its gensim adapter module from `pyLDAvis.gensim` to
# `pyLDAvis.gensim_models`; try the new name first and fall back to the old
# one so the cell runs on either version.
try:
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    import pyLDAvis.gensim as gensimvis

# Render the interactive topic visualisation inline in the notebook.
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(ldamodel, corpus, dictionary)
pyLDAvis.display(vis)

In [56]:
def make_topictable_per_doc(ldamodel, corpus):
    """Build a table holding the highest-weighted topic of every document.

    Args:
        ldamodel: Topic model. ``ldamodel[corpus]`` must yield one result per
            document. When ``ldamodel.per_word_topics`` is truthy, the
            document's ``(topic id, weight)`` list is taken from element 0 of
            each result; otherwise the result itself is that list.
        corpus: Corpus object passed unchanged to ``ldamodel[...]``.

    Returns:
        pandas.DataFrame with one row per document, in iteration order, and
        integer-labelled columns:
            0 - id of the topic with the largest weight,
            1 - that weight rounded to 4 decimals,
            2 - the unmodified per-document result from the model.
        A document whose topic list is empty gets missing values in columns
        0 and 1, so the row position always equals the document's position
        in the corpus. The frame is empty when the corpus yields nothing.
    """
    # Rows are collected in a plain list and converted once at the end:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    rows = []

    for topic_list in ldamodel[corpus]:
        doc = topic_list[0] if ldamodel.per_word_topics else topic_list

        if not doc:
            # Keep a placeholder so later rows are not shifted to a wrong
            # document number.
            rows.append([None, None, topic_list])
            continue

        # Only the top entry is needed, so take the maximum by weight instead
        # of sorting the whole list; on ties the first entry wins, exactly as
        # a stable descending sort would give.
        topic_num, prop_topic = max(doc, key=lambda pair: pair[1])
        rows.append([int(topic_num), round(prop_topic, 4), topic_list])

    return pd.DataFrame(rows)


  and should_run_async(code)


In [58]:
# Build the per-document table and turn its row index into a regular column,
# so that column can be used as the document number.
topictable = make_topictable_per_doc(ldamodel, corpus).reset_index()
# Column labels, in order: document number, dominant topic, weight of the
# dominant topic, weights of all topics.
topictable.columns = ['문서 번호', '가장 비중이 높은 토픽', '가장 높은 토픽의 비중', '각 토픽의 비중']
# Preview the first ten rows.
topictable.head(10)

  and should_run_async(code)


Unnamed: 0,문서 번호,가장 비중이 높은 토픽,가장 높은 토픽의 비중,각 토픽의 비중
0,0,11.0,0.2519,"[(0, 0.09026321), (1, 0.082913086), (3, 0.0560..."
1,1,11.0,0.4408,"[(6, 0.047057502), (10, 0.16038689), (11, 0.44..."
2,2,18.0,0.4342,"[(9, 0.3386442), (11, 0.15148057), (16, 0.0626..."
3,3,2.0,0.3898,"[(0, 0.022723371), (2, 0.38982135), (3, 0.0145..."
4,4,6.0,0.9648,"[(6, 0.9648049)]"
5,5,1.0,0.3237,"[(1, 0.32369223), (9, 0.06912503), (11, 0.2960..."
6,6,7.0,0.3124,"[(2, 0.09105729), (5, 0.10373048), (7, 0.31235..."
7,7,18.0,0.3863,"[(2, 0.11350693), (9, 0.24093445), (11, 0.1152..."
8,8,12.0,0.592,"[(2, 0.21989468), (12, 0.5919897), (18, 0.1630..."
9,9,18.0,0.4413,"[(2, 0.062564865), (5, 0.31170025), (6, 0.0371..."
