In [1]:
import os
import json
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.pipeline import Pipeline

In [2]:
# Read the article corpus from a JSON file in the working directory.
# The encoding is pinned to UTF-8 (the standard JSON encoding) so the result
# does not depend on the platform's default locale encoding.
# NOTE(review): `data` is presumably a list of article text strings, since it
# is later passed straight to a text vectorizer -- confirm against the file.
filename = "./article_content.json"
with open(filename, 'r', encoding='utf-8') as file:
    data = json.load(file)

In [3]:
# `data` (loaded in the previous cell) holds the news articles.
# Two-step topic-model pipeline: TF-IDF term weighting followed by LDA.
lda_pipeline = Pipeline(
    steps=[
        (
            'tfidf',
            # Drop terms found in more than 95% of documents or in fewer
            # than 2 documents, plus the built-in English stop-word list.
            TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english'),
        ),
        (
            'lda',
            # 38 topics; fixed seed for repeatable results; 2 parallel jobs.
            LatentDirichletAllocation(n_components=38, random_state=42, n_jobs=2),
        ),
    ]
)

In [4]:
# Fit the vectorizer and then the LDA model on the loaded corpus.
# NOTE(review): assumes `data` is an iterable of raw text documents -- confirm.
lda_pipeline.fit(X=data)

In [5]:
# Retrieve the two pipeline steps by the names they were registered under,
# so the vocabulary and the topic-term matrix can be inspected directly.
tfidf_vectorizer, lda_model = (
    lda_pipeline.named_steps[step_name] for step_name in ('tfidf', 'lda')
)

In [6]:
# Print the highest-weighted vocabulary terms for every LDA topic.
N_TOP_WORDS = 15  # number of terms shown per topic

# Hoisted out of the loop so the vocabulary array is built once rather than
# once per printed word.
feature_names = tfidf_vectorizer.get_feature_names_out()

for index, topic in enumerate(lda_model.components_):
    # argsort() is ascending, so the trailing slice holds the indices of the
    # largest weights, ordered from weakest to strongest.
    top_indices = topic.argsort()[-N_TOP_WORDS:]
    print(f'THE TOP {N_TOP_WORDS} WORDS FOR TOPIC #{index}')
    print([feature_names[i] for i in top_indices])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['rapid', 'flexible', 'labour', 'era', 'fema', 'disasters', 'homeowners', 'garcia', 'journalists', 'bowyer', 'automation', 'tournament', 'fractional', 'australian', 'accident']


THE TOP 15 WORDS FOR TOPIC #1
['birds', 'dental', 'fdic', 'messi', 'eastern', 'coo', 'melatonin', 'mil', 'moon', 'residents', 'russian', '737', 'russia', 'ukraine', 'overdraft']


THE TOP 15 WORDS FOR TOPIC #2
['spectator', 'hamdallah', 'ncua', 'pan', 'sauce', 'schwarzman', 'jeep', 'goddard', 'ittihad', 'tas', 'servant', 'waitlist', 'stanley', 'blackstone', 'al']


THE TOP 15 WORDS FOR TOPIC #3
['cricket', 'servers', 'seconds', 'chinese', 'nasal', 'sprays', 'ashes', 'combs', 'procrastination', 'eclipses', 'solar', 'totality', 'bytedance', 'eclipse', 'tiktok']


THE TOP 15 WORDS FOR TOPIC #4
['brex', 'braces', 'apy', 'hints', 'grid', 'penalty', 'nyt', 'theme', 'words', 'puzzle', 'hint', 'cds', 'spangram', 'strands', 'cd']


THE TOP 15 WORDS FOR TOPIC #5
['year', 'rewards', 'rate', 