In [1]:
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [14]:
# Location of the raw data, relative to this notebook.
data_path = "../data/"
csv_name = "abcnews-date-text.csv"

# Load the ABC news headlines (columns: publish_date, headline_text).
df = pd.read_csv(data_path + csv_name)

# Quick sanity check: row count, then the first five rows.
print(df.shape[0])
print(df.head())

1082168
   publish_date                                      headline_text
0      20030219  aba decides against community broadcasting lic...
1      20030219     act fire witnesses must be aware of defamation
2      20030219     a g calls for infrastructure protection summit
3      20030219           air nz staff in aust strike for pay rise
4      20030219      air nz strike to affect australian travellers


# Data Preprocessing & Tokenizing

In [15]:
# Fetch every NLTK resource the cells below rely on, so the notebook still runs
# after "Restart Kernel -> Run All" on a machine with an empty nltk_data folder:
#   - 'punkt' / 'punkt_tab': tokenizer models used by nltk.word_tokenize
#     (which of the two is needed depends on the installed NLTK release;
#     an unknown id is reported by the downloader -- TODO confirm it does not
#     raise on the NLTK version pinned for this project)
#   - 'stopwords': corpus behind stopwords.words('english')
#   - 'wordnet': dictionary used by WordNetLemmatizer
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource, quiet=True)

# Kept as the last expression so the cell still displays the wordnet result.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/godpeny/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [16]:
# Split every raw headline string into a list of word tokens.
df['headline_text'] = df['headline_text'].map(nltk.word_tokenize)
df.head()

Unnamed: 0,publish_date,headline_text
0,20030219,"[aba, decides, against, community, broadcastin..."
1,20030219,"[act, fire, witnesses, must, be, aware, of, de..."
2,20030219,"[a, g, calls, for, infrastructure, protection,..."
3,20030219,"[air, nz, staff, in, aust, strike, for, pay, r..."
4,20030219,"[air, nz, strike, to, affect, australian, trav..."


In [17]:
# drop unnecessary columns: only the headline tokens are used from here on
df = df.drop(columns=['publish_date'])

# Stop-word list from NLTK. `stops` stays a list (as before); membership tests
# go through a set so each lookup is O(1) instead of a scan of the whole list,
# which matters when it is done for every token of every headline.
stops = stopwords.words('english')
stop_set = set(stops)

# Remove stop words and lemmatize in a single pass over each token list.
# pos='v' tells the lemmatizer to treat every word as a verb.
lemmatizer = WordNetLemmatizer()
df['headline_text'] = df['headline_text'].apply(
    lambda row: [lemmatizer.lemmatize(word, pos='v') for word in row if word not in stop_set]
)

# remove words with length <= 3
tokenized_doc = df['headline_text'].apply(lambda row: [word for word in row if len(word) > 3])
tokenized_doc.head(5)

0       [decide, community, broadcast, licence]
1      [fire, witness, must, aware, defamation]
2    [call, infrastructure, protection, summit]
3                   [staff, aust, strike, rise]
4      [strike, affect, australian, travellers]
Name: headline_text, dtype: object

# TF-IDF Processing

In [18]:
# Re-join each token list into a single space-separated string, the input
# format the text vectorizer expects. Iterating the Series values directly
# avoids one label-based `tokenized_doc[i]` lookup per row, which is slow and
# only correct when the index is exactly 0..n-1.
detokenized_doc = [' '.join(tokens) for tokens in tokenized_doc]

df['headline_text'] = detokenized_doc

df.head(5)

Unnamed: 0,headline_text
0,decide community broadcast licence
1,fire witness must aware defamation
2,call infrastructure protection summit
3,staff aust strike rise
4,strike affect australian travellers


In [20]:
n_topics = 10  # hyperparameter: number of topics to extract

# Build the TF-IDF document-term matrix, keeping the top 1000 terms and
# applying scikit-learn's built-in English stop-word list.
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X = vectorizer.fit_transform(df['headline_text'])

# LDA with the online learning method, a single iteration (max_iter=1) and a
# fixed random seed.
model = LatentDirichletAllocation(
    n_components=n_topics,
    max_iter=1,
    learning_method='online',
    random_state=777,
)
history = model.fit_transform(X)  # fit on X and keep the transformed output

### LatentDirichletAllocation.learning_method
 - 'batch': Batch variational Bayes method. Use all training data in each EM update. Old `components_` will be overwritten in each iteration.
 - 'online': Online variational Bayes method. In each EM update, use a mini-batch of training data to update the `components_` variable incrementally.
    The learning rate is controlled by the `learning_decay` and `learning_offset` parameters.
### EM (Expectation–Maximization) algorithm
 - In statistics, an expectation–maximization (EM) algorithm is an iterative method to find (local) maximum likelihood or maximum a posteriori (MAP) estimates of parameters in statistical models, where the model depends on unobserved latent variables.

In [26]:
# Inspect the fitted topic-term matrix: its shape first, then the raw values.
topic_term_matrix = model.components_
print(topic_term_matrix.shape)
print(topic_term_matrix)

(10, 1000)
[[1.00001533e-01 1.00001269e-01 1.00004179e-01 ... 1.00006124e-01
  1.00003111e-01 1.00003064e-01]
 [1.00001199e-01 1.13513398e+03 3.50170830e+03 ... 1.00009349e-01
  1.00001896e-01 1.00002937e-01]
 [1.00001811e-01 1.00001151e-01 1.00003566e-01 ... 1.00002693e-01
  1.00002061e-01 7.53381835e+02]
 ...
 [1.00001065e-01 1.00001689e-01 1.00003278e-01 ... 1.00006721e-01
  1.00004902e-01 1.00004759e-01]
 [1.00002401e-01 1.00000732e-01 1.00002989e-01 ... 1.00003517e-01
  1.00001428e-01 1.00005266e-01]
 [1.00003427e-01 1.00002313e-01 1.00007340e-01 ... 1.00003732e-01
  1.00001207e-01 1.00005153e-01]]


In [28]:
# Vocabulary learned by the TF-IDF vectorizer; used below to turn column
# indices of `model.components_` back into words.
terms = vectorizer.get_feature_names_out() # 1000 words (max_features=1000)

def get_topics(components, terms, n=5):
    """Print the n highest-weighted terms of every topic.

    Args:
        components: iterable of per-topic weight arrays (e.g. the rows of
            ``model.components_``); each row must support ``argsort()`` and
            integer indexing.
        terms: sequence mapping a column index to its term.
        n: number of top terms to show per topic (default 5).

    Returns:
        None. One line per topic is written to stdout in the form
        ``Topic <k>: [(term, weight), ...]`` with weights rounded to two
        decimals and topics numbered from 1.
    """
    for position, weights in enumerate(components, start=1):
        # argsort() orders ascending; reversing and truncating leaves the
        # indices of the n largest weights, largest first.
        top_indices = weights.argsort()[::-1][:n]
        top_terms = []
        for term_idx in top_indices:
            top_terms.append((terms[term_idx], weights[term_idx].round(2)))
        print("Topic %d:" % position, top_terms)

In [30]:
# Show the top terms (default: 5 per topic) of the fitted LDA model.
get_topics(components=model.components_, terms=terms)

Topic 1: [('government', 8725.19), ('sydney', 8393.29), ('queensland', 7720.12), ('change', 5874.27), ('home', 5674.38)]
Topic 2: [('australia', 13691.08), ('australian', 11088.95), ('melbourne', 7528.43), ('world', 6707.7), ('south', 6677.03)]
Topic 3: [('death', 5935.06), ('interview', 5924.98), ('kill', 5851.6), ('jail', 4632.85), ('life', 4275.27)]
Topic 4: [('house', 6113.49), ('2016', 5488.19), ('state', 4923.41), ('brisbane', 4857.21), ('tasmania', 4610.97)]
Topic 5: [('court', 7542.74), ('attack', 6959.64), ('open', 5663.0), ('face', 5193.63), ('warn', 5115.01)]
Topic 6: [('market', 5545.86), ('rural', 5502.89), ('plan', 4828.71), ('indigenous', 4223.4), ('power', 3968.26)]
Topic 7: [('charge', 8428.8), ('election', 7561.63), ('adelaide', 6758.36), ('make', 5658.99), ('test', 5062.69)]
Topic 8: [('police', 12092.44), ('crash', 5281.14), ('drug', 4290.87), ('beat', 3257.58), ('rise', 2934.92)]
Topic 9: [('fund', 4693.03), ('labor', 4047.69), ('national', 4038.68), ('council', 40