In [1]:
import pandas as pd
import urllib.request

# Remote CSV of news headlines and the local file name it is saved under.
DATA_URL = "https://raw.githubusercontent.com/franciscadias/data/master/abcnews-date-text.csv"
DATA_PATH = "abcnews-date-text.csv"

# Download the dataset into the current working directory (re-fetched on every run).
urllib.request.urlretrieve(DATA_URL, filename=DATA_PATH)

# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0.
# `on_bad_lines="warn"` keeps the previous behaviour: malformed rows are
# skipped and reported instead of aborting the load.
data = pd.read_csv(DATA_PATH, on_bad_lines="warn")
print(len(data))
data

1082168


Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers
...,...,...
1082163,20170630,when is it ok to compliment a womans smile a g...
1082164,20170630,white house defends trumps tweet
1082165,20170630,winter closes in on tasmania as snow ice falls
1082166,20170630,womens world cup australia wins despite atapat...


In [2]:
# Work on an independent single-column frame so later steps never modify `data`.
text = data.loc[:, ["headline_text"]].copy()

In [3]:
import nltk

# Tokenize every headline into a list of word tokens.
# Series.apply passes each headline string straight to word_tokenize, which
# avoids the per-row Series that DataFrame.apply(axis=1) builds for every row.
# NOTE(review): word_tokenize needs NLTK tokenizer data to be installed; no
# nltk.download(...) call is visible in this notebook - confirm the environment.
text["headline_text"] = text["headline_text"].apply(nltk.word_tokenize)

In [4]:
from nltk.corpus import stopwords

# English stop words from NLTK; `stop` stays a list, exactly as before.
stop = stopwords.words("english")

# Set copy for constant-time membership tests: the filter below runs once per
# token of every headline, so scanning the list each time is wasted work.
stop_set = set(stop)

# Drop every token that is a stop word.
text["headline_text"] = text["headline_text"].apply(
    lambda tokens: [token for token in tokens if token not in stop_set]
)

In [5]:
# Plain-text preview of the first five rows (head() defaults to 5).
print(text.head())

                                       headline_text
0   [aba, decides, community, broadcasting, licence]
1    [act, fire, witnesses, must, aware, defamation]
2     [g, calls, infrastructure, protection, summit]
3          [air, nz, staff, aust, strike, pay, rise]
4  [air, nz, strike, affect, australian, travellers]


In [9]:
from nltk.stem import WordNetLemmatizer
# Build the lemmatizer once; the original constructed a new WordNetLemmatizer
# for every single word, which is loop-invariant work.
lemmatizer = WordNetLemmatizer()

# Lemmatize each token as a verb (pos="v"), e.g. "decides" -> "decide".
text["headline_text"] = text["headline_text"].apply(
    lambda tokens: [lemmatizer.lemmatize(token, pos="v") for token in tokens]
)
text.head(5)

Unnamed: 0,headline_text
0,"[aba, decide, community, broadcast, licence]"
1,"[act, fire, witness, must, aware, defamation]"
2,"[g, call, infrastructure, protection, summit]"
3,"[air, nz, staff, aust, strike, pay, rise]"
4,"[air, nz, strike, affect, australian, travellers]"


In [11]:
# Keep only tokens of at least four characters; the result is a new Series and
# the `text` frame itself is not modified in this cell.
tokenize_doc = text["headline_text"].apply(
    lambda tokens: [token for token in tokens if len(token) >= 4]
)
tokenize_doc.head()

0       [decide, community, broadcast, licence]
1      [fire, witness, must, aware, defamation]
2    [call, infrastructure, protection, summit]
3                   [staff, aust, strike, rise]
4      [strike, affect, australian, travellers]
Name: headline_text, dtype: object

In [12]:
# Join each filtered token list back into one space-separated string.
detokenized_doc = [" ".join(tokens) for tokens in tokenize_doc]

# Replace the token lists in the column with the re-joined strings.
text["headline_text"] = detokenized_doc

print(text.head())

                           headline_text
0     decide community broadcast licence
1     fire witness must aware defamation
2  call infrastructure protection summit
3                 staff aust strike rise
4    strike affect australian travellers


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF features over the cleaned headlines: English stop words are removed
# and the vocabulary is capped at 1000 terms via max_features.
vectorizer = TfidfVectorizer(stop_words="english", max_features=1000)
X = vectorizer.fit_transform(text["headline_text"])

# Display (n_documents, n_terms). The recorded cell output shows this shape,
# but the cell had no expression that produced it.
X.shape


(1082168, 1000)


In [17]:
from sklearn.decomposition import LatentDirichletAllocation

# 10 topics
lda_model = LatentDirichletAllocation(
    n_components=10,
    learning_method='online',
    max_iter=1,
    random_state=777,
)

In [18]:
# Fit the LDA model on X and keep the value returned by fit_transform
# (per scikit-learn, the document-topic distribution) in lda_top.
lda_top = lda_model.fit_transform(X)

In [25]:
# Print the first row of X; the recorded output lists (row, column) index
# pairs together with the value stored at each position.
print(X[0])

  (0, 521)	0.6080934005080666
  (0, 191)	0.503549348860692
  (0, 255)	0.6137267058883044


In [27]:
# Words closest to each topic
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out()` and fall back only on old installations.
# `.tolist()` turns the returned array into a list of built-in `str`, matching
# what the old method returned.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = vectorizer.get_feature_names_out().tolist()
else:
    terms = vectorizer.get_feature_names()

def get_topics(components, feature_names, n=5):
    """Print the ``n`` highest-weighted terms of every topic.

    Args:
        components: Iterable of 1-D NumPy arrays, one per topic, holding the
            weight of each term (called below with ``lda_model.components_``).
        feature_names: Sequence mapping a term index to the term itself.
        n: Number of top terms to print per topic.

    Returns:
        None. One line per topic is written to stdout in the form
        ``Topic <k>: [(term, weight), ...]`` with weights rounded to two
        decimals and topics numbered from 1.
    """
    for idx, topic in enumerate(components):
        # argsort is ascending, so the reversed slice walks the n largest
        # weights from highest to lowest.
        top_indices = topic.argsort()[:-n - 1:-1]
        # Convert NumPy scalars to built-in types so the printed tuples read
        # ('term', 12.34) on every NumPy version instead of
        # (np.str_('term'), np.float64(12.34)) under NumPy 2.
        top_terms = [(str(feature_names[i]), topic[i].round(2).item()) for i in top_indices]
        print("Topic %d:" % (idx+1), top_terms)
# Print the top terms (default n=5) for every fitted LDA topic.
get_topics(components=lda_model.components_, feature_names=terms)

Topic 1: [('government', 8725.19), ('sydney', 8393.29), ('queensland', 7720.12), ('change', 5874.27), ('home', 5674.38)]
Topic 2: [('australia', 13691.08), ('australian', 11088.95), ('melbourne', 7528.43), ('world', 6707.7), ('south', 6677.03)]
Topic 3: [('death', 5935.06), ('interview', 5924.98), ('kill', 5851.6), ('jail', 4632.85), ('life', 4275.27)]
Topic 4: [('house', 6113.49), ('2016', 5488.19), ('state', 4923.41), ('brisbane', 4857.21), ('tasmania', 4610.97)]
Topic 5: [('court', 7542.74), ('attack', 6959.64), ('open', 5663.0), ('face', 5193.63), ('warn', 5115.01)]
Topic 6: [('market', 5545.86), ('rural', 5502.89), ('plan', 4828.71), ('indigenous', 4223.4), ('power', 3968.26)]
Topic 7: [('charge', 8428.8), ('election', 7561.63), ('adelaide', 6758.36), ('make', 5658.99), ('test', 5062.69)]
Topic 8: [('police', 12092.44), ('crash', 5281.14), ('drug', 4290.87), ('beat', 3257.58), ('rise', 2934.92)]
Topic 9: [('fund', 4693.03), ('labor', 4047.69), ('national', 4038.68), ('council', 40