In [1]:
# News data covering 20 topics (the 20 Newsgroups corpus)
import pandas as pd
from sklearn.datasets import fetch_20newsgroups

# Shuffle with a fixed seed so the document order is reproducible, and strip
# headers, footers and quoted replies from every post.
dataset=fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
# Raw post texts, one string per document.
documents=dataset.data
# Number of documents loaded.
len(documents)

11314

In [2]:
# Show the first raw document.
documents[0]

"Well i'm not sure about the story nad it did seem biased. What\nI disagree with is your statement that the U.S. Media is out to\nruin Israels reputation. That is rediculous. The U.S. media is\nthe most pro-israeli media in the world. Having lived in Europe\nI realize that incidences such as the one described in the\nletter have occured. The U.S. media as a whole seem to try to\nignore them. The U.S. is subsidizing Israels existance and the\nEuropeans are not (at least not to the same degree). So I think\nthat might be a reason they report more clearly on the\natrocities.\n\tWhat is a shame is that in Austria, daily reports of\nthe inhuman acts commited by Israeli soldiers and the blessing\nreceived from the Government makes some of the Holocaust guilt\ngo away. After all, look how the Jews are treating other races\nwhen they got power. It is unfortunate.\n"

In [3]:
# News categories (newsgroup names)
print(dataset.target_names)

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [5]:
news_df=pd.DataFrame({'document': documents})
# Replace every non-alphabetic character with a space.
# regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave the text uncleaned.
news_df['clean_doc']=news_df['document'].str.replace("[^a-zA-Z]", " ", regex=True)
# Drop words of three characters or fewer.
news_df['clean_doc']=news_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
# Convert to lowercase.
news_df['clean_doc']=news_df['clean_doc'].str.lower()


  news_df['clean_doc']=news_df['document'].str.replace("[^a-zA-Z]", " ") # 알파벳 이외의 문자 제거


In [6]:
# Inspect the first document after cleaning.
news_df['clean_doc'][0]

'well sure about story seem biased what disagree with your statement that media ruin israels reputation that rediculous media most israeli media world having lived europe realize that incidences such described letter have occured media whole seem ignore them subsidizing israels existance europeans least same degree think that might reason they report more clearly atrocities what shame that austria daily reports inhuman acts commited israeli soldiers blessing received from government makes some holocaust guilt away after look jews treating other races when they power unfortunate'

In [7]:
from nltk.corpus import stopwords

# NOTE: requires the NLTK 'stopwords' corpus to be available locally.
stop_words=stopwords.words('english')
# Set copy for O(1) membership checks; the list above is kept unchanged for
# any later code that expects it.
stop_word_set=set(stop_words)
tokenized_doc=news_df['clean_doc'].apply(lambda x: x.split()) # tokenize on whitespace
tokenized_doc=tokenized_doc.apply(lambda x: [item for item in x if item not in stop_word_set]) # remove stop words

In [8]:
# Inspect the tokens of the first document after stop-word removal.
print(tokenized_doc[0])

['well', 'sure', 'story', 'seem', 'biased', 'disagree', 'statement', 'media', 'ruin', 'israels', 'reputation', 'rediculous', 'media', 'israeli', 'media', 'world', 'lived', 'europe', 'realize', 'incidences', 'described', 'letter', 'occured', 'media', 'whole', 'seem', 'ignore', 'subsidizing', 'israels', 'existance', 'europeans', 'least', 'degree', 'think', 'might', 'reason', 'report', 'clearly', 'atrocities', 'shame', 'austria', 'daily', 'reports', 'inhuman', 'acts', 'commited', 'israeli', 'soldiers', 'blessing', 'received', 'government', 'makes', 'holocaust', 'guilt', 'away', 'look', 'jews', 'treating', 'races', 'power', 'unfortunate']


In [10]:
# Join each token list back into one space-separated string, because the
# TF-IDF step below takes text rather than token lists.
detokenized_doc=[' '.join(tokens) for tokens in tokenized_doc]

news_df['clean_doc']=detokenized_doc
news_df['clean_doc'][0]

'well sure story seem biased disagree statement media ruin israels reputation rediculous media israeli media world lived europe realize incidences described letter occured media whole seem ignore subsidizing israels existance europeans least degree think might reason report clearly atrocities shame austria daily reports inhuman acts commited israeli soldiers blessing received government makes holocaust guilt away look jews treating races power unfortunate'

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Keep only the top 1000 terms (max_features=1000).
vectorizer=TfidfVectorizer(stop_words='english', max_features=1000)
X=vectorizer.fit_transform(news_df['clean_doc'])
X.shape # check the size of the TF-IDF matrix

(11314, 1000)

In [12]:
from sklearn.decomposition import TruncatedSVD
# Truncated SVD of the TF-IDF matrix (LSA); n_components is the number of topics.
# random_state is pinned because the default solver is randomized, so the
# extracted topics would otherwise vary from run to run.
svd_model=TruncatedSVD(n_components=20, random_state=1)
svd_model.fit(X)
# components_ holds one row per topic.
len(svd_model.components_)

20

In [13]:
import numpy as np
# Shape is (number of topics, number of terms).
svd_model.components_.shape

(20, 1000)

In [14]:
# Display the raw topic-by-term weight matrix.
svd_model.components_

array([[ 0.01469448,  0.05019035,  0.02132607, ...,  0.07865954,
         0.01432356,  0.01788786],
       [-0.00535905,  0.01654529, -0.01646425, ..., -0.06366315,
        -0.01063321, -0.01904601],
       [ 0.0017784 , -0.00369187, -0.01793139, ...,  0.05927518,
         0.02630429,  0.02235225],
       ...,
       [-0.00588063, -0.00051538,  0.00320784, ...,  0.05444411,
        -0.01236747, -0.00061163],
       [ 0.00478448, -0.00386492,  0.00247843, ..., -0.06105495,
        -0.01706505, -0.00657359],
       [-0.00069927,  0.03093023,  0.00785223, ..., -0.01384836,
         0.00432729, -0.00058995]])

In [15]:
# Vocabulary: the 1000 terms kept by the vectorizer.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. tolist() keeps `terms` a plain list of str as before.
terms=vectorizer.get_feature_names_out().tolist()

def get_topics(components, feature_names, n=5):
    """Print the n highest-weighted terms of every topic.

    Args:
        components: iterable of 1-D NumPy arrays, one per topic, each holding
            one weight per term.
        feature_names: sequence mapping a term's column index to its text.
        n: number of top terms to show per topic (default 5).

    Returns:
        None. One line per topic is written to stdout as
        "Topic <k> :  [(term, weight), ...]", with weights rounded to 5 decimals.
    """
    for idx, topic in enumerate(components):
        # argsort is ascending, so reverse it and keep the first n indices.
        top_indices = topic.argsort()[::-1][:n]
        # Cast to built-in str/float so the printed tuples do not depend on
        # NumPy's scalar repr (NumPy >= 2 shows e.g. "np.float64(0.21386)").
        top_terms = [(str(feature_names[i]), float(topic[i].round(5))) for i in top_indices]
        print("Topic %d : " % (idx+1), top_terms)

get_topics(svd_model.components_, terms)
# Prints the key terms of each topic.
# LSA is quick and simple to implement, but it must be recomputed from scratch whenever new data is added.

Topic 1 :  [('like', 0.21386), ('know', 0.20046), ('people', 0.19293), ('think', 0.17805), ('good', 0.15128)]
Topic 2 :  [('thanks', 0.32886), ('windows', 0.29091), ('card', 0.18063), ('drive', 0.1745), ('mail', 0.1511)]
Topic 3 :  [('game', 0.37064), ('team', 0.32451), ('year', 0.28249), ('games', 0.25267), ('season', 0.1839)]
Topic 4 :  [('drive', 0.53403), ('scsi', 0.20037), ('hard', 0.15659), ('disk', 0.15613), ('card', 0.14047)]
Topic 5 :  [('windows', 0.40477), ('file', 0.25042), ('window', 0.18187), ('files', 0.15971), ('program', 0.13984)]
Topic 6 :  [('chip', 0.1612), ('government', 0.16012), ('space', 0.15681), ('mail', 0.15669), ('information', 0.13383)]
Topic 7 :  [('like', 0.66992), ('bike', 0.13975), ('know', 0.11676), ('chip', 0.11286), ('sounds', 0.10372)]
Topic 8 :  [('card', 0.47476), ('video', 0.22503), ('sale', 0.2136), ('monitor', 0.16581), ('offer', 0.1486)]
Topic 9 :  [('know', 0.46102), ('card', 0.33279), ('chip', 0.17094), ('government', 0.15079), ('video', 0.1