In [1]:
### topic modeling
from sklearn.datasets import fetch_20newsgroups
dataset=fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers','footers','quotes'))

In [2]:
documents=dataset.data

In [3]:
type(documents)
documents[0]

"Well i'm not sure about the story nad it did seem biased. What\nI disagree with is your statement that the U.S. Media is out to\nruin Israels reputation. That is rediculous. The U.S. media is\nthe most pro-israeli media in the world. Having lived in Europe\nI realize that incidences such as the one described in the\nletter have occured. The U.S. media as a whole seem to try to\nignore them. The U.S. is subsidizing Israels existance and the\nEuropeans are not (at least not to the same degree). So I think\nthat might be a reason they report more clearly on the\natrocities.\n\tWhat is a shame is that in Austria, daily reports of\nthe inhuman acts commited by Israeli soldiers and the blessing\nreceived from the Government makes some of the Holocaust guilt\ngo away. After all, look how the Jews are treating other races\nwhen they got power. It is unfortunate.\n"

In [4]:
print(dataset.target_names)

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [5]:
### 각 topic과 가장 관련성 높은 단어 10개씩 출력

In [6]:
import pandas as pd
newsdf=pd.DataFrame({'document':documents})

In [7]:
# 알파벳 제외하고 모두 제거
newsdf['clean_doc']=newsdf['document'].str.replace("[^a-zA-Z]"," ")

In [8]:
# 3글자 이하 단어 제거
newsdf['clean_doc']=newsdf['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

In [9]:
# 대문자 -> 소문자
newsdf['clean_doc']=newsdf['clean_doc'].apply(lambda x: x.lower())

In [10]:
# 단어 토큰화 후 불용어 제거
from nltk.corpus import stopwords
stopwords=stopwords.words('english')

news_tk=newsdf['clean_doc'].apply(lambda x:x.split())
news_tk=news_tk.apply(lambda x: [item for item in x if item not in stopwords])
news_tk[1]

['yeah',
 'expect',
 'people',
 'read',
 'actually',
 'accept',
 'hard',
 'atheism',
 'need',
 'little',
 'leap',
 'faith',
 'jimmy',
 'logic',
 'runs',
 'steam',
 'sorry',
 'pity',
 'sorry',
 'feelings',
 'denial',
 'faith',
 'need',
 'well',
 'pretend',
 'happily',
 'ever',
 'anyway',
 'maybe',
 'start',
 'newsgroup',
 'atheist',
 'hard',
 'bummin',
 'much',
 'forget',
 'flintstone',
 'chewables',
 'bake',
 'timmons']

In [11]:
# tf-idf matrix 생성을  위해 역토큰화
news_detk=[]
for i in range(len(newsdf)):
    temp=' '.join(news_tk[i])
    news_detk.append(temp)
newsdf['clean_doc']=news_detk
newsdf['clean_doc'][1]

'yeah expect people read actually accept hard atheism need little leap faith jimmy logic runs steam sorry pity sorry feelings denial faith need well pretend happily ever anyway maybe start newsgroup atheist hard bummin much forget flintstone chewables bake timmons'

In [12]:
#  tf_idf
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=TfidfVectorizer(stop_words='english', max_features=1000)
tfidf_matrix=tfidf.fit_transform(newsdf['clean_doc'])
tfidf_matrix

<11314x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 250501 stored elements in Compressed Sparse Row format>

In [13]:
# 특이값 분해 (SVD)
from sklearn.decomposition import TruncatedSVD
#  topic 갯수 지정
svd=TruncatedSVD(n_components=20)
svd.fit(tfidf_matrix)

TruncatedSVD(algorithm='randomized', n_components=20, n_iter=5,
             random_state=None, tol=0.0)

In [14]:
import numpy as np

# VT 구하기
np.shape(svd.components_)

(20, 1000)

In [15]:
# 1000개의 feature 추출
terms=tfidf.get_feature_names()
terms

['ability',
 'able',
 'accept',
 'access',
 'according',
 'account',
 'action',
 'actions',
 'actual',
 'actually',
 'added',
 'addition',
 'additional',
 'address',
 'administration',
 'advance',
 'advice',
 'agencies',
 'agree',
 'algorithm',
 'allow',
 'allowed',
 'allows',
 'amendment',
 'america',
 'american',
 'americans',
 'analysis',
 'angeles',
 'anonymous',
 'answer',
 'answers',
 'anti',
 'anybody',
 'apparently',
 'appear',
 'appears',
 'apple',
 'application',
 'applications',
 'apply',
 'appreciate',
 'appreciated',
 'approach',
 'appropriate',
 'april',
 'arab',
 'archive',
 'area',
 'areas',
 'argument',
 'arguments',
 'armenia',
 'armenian',
 'armenians',
 'arms',
 'army',
 'article',
 'articles',
 'asked',
 'asking',
 'assume',
 'assuming',
 'atheism',
 'atheists',
 'attack',
 'attempt',
 'author',
 'authority',
 'available',
 'average',
 'avoid',
 'away',
 'background',
 'base',
 'baseball',
 'based',
 'basic',
 'basically',
 'basis',
 'begin',
 'beginning',
 'belief

In [16]:
def get_topic(c, fname, n=10):
    for  i,t  in enumerate(c):
        print('topic %d : ' % (i+1),[(fname[i], t[i].round(5)) for i in t.argsort()[:-n-1:-1]])
 
get_topic(svd.components_, terms)

topic 1 :  [('like', 0.21386), ('know', 0.20046), ('people', 0.19293), ('think', 0.17805), ('good', 0.15128), ('time', 0.14446), ('thanks', 0.11628), ('make', 0.10882), ('right', 0.10738), ('want', 0.10442)]
topic 2 :  [('thanks', 0.32882), ('windows', 0.29118), ('card', 0.18053), ('drive', 0.17446), ('mail', 0.1509), ('file', 0.14611), ('advance', 0.12511), ('files', 0.11486), ('software', 0.11358), ('program', 0.10561)]
topic 3 :  [('game', 0.37179), ('team', 0.32419), ('year', 0.28057), ('games', 0.25463), ('season', 0.18401), ('players', 0.15949), ('good', 0.15833), ('play', 0.15085), ('hockey', 0.13771), ('league', 0.11958)]
topic 4 :  [('drive', 0.53438), ('scsi', 0.19973), ('hard', 0.15635), ('disk', 0.1562), ('card', 0.14124), ('drives', 0.13842), ('problem', 0.1126), ('controller', 0.10207), ('floppy', 0.09648), ('power', 0.07542)]
topic 5 :  [('windows', 0.40409), ('file', 0.25788), ('window', 0.18171), ('files', 0.15699), ('program', 0.13329), ('using', 0.12877), ('problem',