In [1]:
import pandas as pd
import re
from sklearn.datasets import fetch_20newsgroups
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [15]:
# Download (or load from the local scikit-learn cache) the 20 Newsgroups
# training split with headers, footers and quoted replies stripped, so that
# only the message bodies remain.
dataset = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=('headers', 'footers', 'quotes'),
)
documents = dataset.data

# The printed labels are Korean: "fields", "number of samples", "categories".
print('항목 : ', dataset.keys())
print('샘플의 수 : ', len(documents))
print('카테고리 : ', dataset.target_names)  # the newsgroup names (20 of them)

항목 :  dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])
샘플의 수 :  11314
카테고리 :  ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


# Data Preprocessing & Tokenizing

In [3]:
# Wrap the raw posts in a DataFrame; every cleaning stage below overwrites the
# 'clean_doc' column and prints the first three rows so the effect is visible.
df = pd.DataFrame({'document': documents})
print(df.head(3))

# Stage 1: replace every character that is not an ASCII letter with a space.
df['clean_doc'] = df['document'].str.replace('[^a-zA-Z]', ' ', regex=True)
print(df['clean_doc'][:3])

# Stage 2: drop short words (3 characters or fewer) and collapse whitespace.
df['clean_doc'] = df['clean_doc'].apply(
    lambda text: ' '.join(token for token in text.split() if len(token) > 3)
)
print(df['clean_doc'][:3])

# Stage 3: lower-case everything.
df['clean_doc'] = df['clean_doc'].str.lower()
print(df['clean_doc'][:3])

                                            document
0  Well i'm not sure about the story nad it did s...
1  \n\n\n\n\n\n\nYeah, do you expect people to re...
2  Although I realize that principle is not one o...
0    Well i m not sure about the story nad it did s...
1           Yeah  do you expect people to read the ...
2    Although I realize that principle is not one o...
Name: clean_doc, dtype: object
0    Well sure about story seem biased What disagre...
1    Yeah expect people read actually accept hard a...
2    Although realize that principle your strongest...
Name: clean_doc, dtype: object
0    well sure about story seem biased what disagre...
1    yeah expect people read actually accept hard a...
2    although realize that principle your strongest...
Name: clean_doc, dtype: object


### join
 - `sep.join(iterable)` concatenates all the strings in `iterable` into one string, inserting `sep` between consecutive items.
 ```python
    '.'.join(['ab', 'pq', 'rs']) # 'ab.pq.rs'
 ```

In [4]:
# Show the fully cleaned text of the document with index label 1.
print(df.loc[1, 'clean_doc'])

yeah expect people read actually accept hard atheism need little leap faith jimmy your logic runs steam sorry pity sorry that have these feelings denial about faith need well just pretend that will happily ever after anyway maybe start newsgroup atheist hard bummin much forget your flintstone chewables bake timmons


In [7]:
# Load NLTK's English stop-word list. On a fresh environment the corpus is not
# installed and NLTK raises LookupError, so download it once and retry; this
# keeps the notebook runnable under "Restart Kernel -> Run All".
try:
    stop = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    stop = stopwords.words('english')

# `stop` is a plain list, so `word not in stop` would scan the whole list for
# every token. A set gives constant-time membership tests and removes exactly
# the same words. `stop` itself is left as a list for any later cell using it.
stop_set = set(stop)

# Split each cleaned document on whitespace and drop the stop words in one pass.
tokenized_doc = df['clean_doc'].apply(
    lambda x: [word for word in x.split() if word not in stop_set]
)

print(tokenized_doc[1])

['yeah', 'expect', 'people', 'read', 'actually', 'accept', 'hard', 'atheism', 'need', 'little', 'leap', 'faith', 'jimmy', 'logic', 'runs', 'steam', 'sorry', 'pity', 'sorry', 'feelings', 'denial', 'faith', 'need', 'well', 'pretend', 'happily', 'ever', 'anyway', 'maybe', 'start', 'newsgroup', 'atheist', 'hard', 'bummin', 'much', 'forget', 'flintstone', 'chewables', 'bake', 'timmons']


# TF-IDF Processing

In [10]:
# Re-join each token list into one space-separated string, because
# TfidfVectorizer expects raw text rather than pre-split tokens.
# Iterating over the Series yields its values in order, which avoids the
# per-row label lookup of `tokenized_doc[i]` (slow, and only correct while the
# index is the default 0..n-1 range).
detokenized_doc = [' '.join(tokens) for tokens in tokenized_doc]

# Overwrite 'clean_doc' with the stop-word-free text; a list is assigned
# positionally, one entry per row.
df['clean_doc'] = detokenized_doc

print(df['clean_doc'][1])

yeah expect people read actually accept hard atheism need little leap faith jimmy logic runs steam sorry pity sorry feelings denial faith need well pretend happily ever anyway maybe start newsgroup atheist hard bummin much forget flintstone chewables bake timmons


In [11]:
# Build the TF-IDF document-term matrix.
#   max_df=0.5        -> ignore terms whose document frequency is strictly
#                        higher than 50% of the documents.
#   max_features=1000 -> keep only the top 1000 terms, ordered by term
#                        frequency across the corpus.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=1000,
    max_df=0.5,
    smooth_idf=True,
)
X = vectorizer.fit_transform(df['clean_doc'])

# (number of documents, vocabulary size)
print(X.shape)

(11314, 1000)


# Topic Modeling

In [17]:
# Use one latent topic per newsgroup category, so n_components equals the
# number of category names in the dataset.
n_topics = len(dataset.target_names)
svd_model = TruncatedSVD(
    n_components=n_topics,
    algorithm='randomized',
    n_iter=100,
    random_state=122,
)
svd_model.fit(X)

In [23]:
# Inspect V^T: components_ has one row per topic and one column per
# vocabulary term, i.e. shape (n_components, n_words).
vt_shape = svd_model.components_.shape
print(vt_shape)

# Vocabulary terms, in the same order as the columns of components_.
terms = vectorizer.get_feature_names_out()
print(terms[:10])

(20, 1000)
['ability' 'able' 'accept' 'access' 'according' 'account' 'action'
 'actions' 'actual' 'actually']


In [44]:
def get_topics(components, feature_names, n=5):
    """Print the n highest-weighted terms of every topic.

    Args:
        components: iterable of per-topic weight vectors; each vector must
            support ``.argsort()`` and its elements ``.round()`` (e.g. the
            rows of a NumPy array).
        feature_names: sequence mapping a column index to its term.
        n: how many top terms to print per topic (default 5).

    Returns:
        None. One line per topic is written to stdout in the form
        ``Topic <k>: [(term, weight), ...]`` with topics numbered from 1 and
        weights rounded to 5 decimals, largest weight first.
    """
    for topic_no, weights in enumerate(components, start=1):
        # argsort is ascending, so reverse it and keep the first n indices.
        top_idx = weights.argsort()[::-1][:n]
        top_terms = [(feature_names[j], weights[j].round(5)) for j in top_idx]
        print("Topic %d:" % topic_no, top_terms)

### [a:b:c] : a부터 b 직전까지 c의 간격으로 (from index a up to, but not including, b, in steps of c)
```python
lst = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
n = 3
print(lst[:-n-1:-1]) # [9, 8, 7]
print(lst[0:5:2]) # [0, 2, 4]
print(lst[:-3]) # [0, 1, 2, 3, 4, 5, 6]
```

In [45]:
# Print the five strongest terms (the function's default n) for each topic.
get_topics(components=svd_model.components_, feature_names=terms)

Topic 1: [('like', 0.21386), ('know', 0.20046), ('people', 0.19293), ('think', 0.17805), ('good', 0.15128)]
Topic 2: [('thanks', 0.32888), ('windows', 0.29088), ('card', 0.18069), ('drive', 0.17455), ('mail', 0.15111)]
Topic 3: [('game', 0.37064), ('team', 0.32443), ('year', 0.28154), ('games', 0.2537), ('season', 0.18419)]
Topic 4: [('drive', 0.53324), ('scsi', 0.20165), ('hard', 0.15628), ('disk', 0.15578), ('card', 0.13994)]
Topic 5: [('windows', 0.40399), ('file', 0.25436), ('window', 0.18044), ('files', 0.16078), ('program', 0.13894)]
Topic 6: [('chip', 0.16114), ('government', 0.16009), ('mail', 0.15625), ('space', 0.1507), ('information', 0.13562)]
Topic 7: [('like', 0.67086), ('bike', 0.14236), ('chip', 0.11169), ('know', 0.11139), ('sounds', 0.10371)]
Topic 8: [('card', 0.46633), ('video', 0.22137), ('sale', 0.21266), ('monitor', 0.15463), ('offer', 0.14643)]
Topic 9: [('know', 0.46047), ('card', 0.33605), ('chip', 0.17558), ('government', 0.1522), ('video', 0.14356)]
Topic 10