In [5]:
import os
import json
import spacy
import pandas as pd
from bertopic import BERTopic
from spacy_download import load_spacy


In [6]:
# Will download the model if it isn't installed yet
spacy.require_gpu()
nlp = load_spacy("en_core_web_lg") 

In [7]:
df = pd.read_json('./article_content.json', encoding='utf-8')
df.columns = ['raw_text']
df['cleaned_text'] = None
df.head()

Unnamed: 0,raw_text,cleaned_text
0,"In most cases , the clock on the statute of li...",
1,"Topline Travis Scott , a rapper and founder of...",
2,Topline Iran blamed Israel for an airstrike th...,
3,Esprit was once seen as a trendy brand in the ...,
4,The proximity of the publication of David Nich...,


In [13]:
def pre_process_document(text):
    doc = nlp(text)
    filtered_tokens = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct]
    return ' '.join(filtered_tokens)

In [14]:
# must do some nlp processing to improve the topic modeling
# it looks like the topics generated used a lot of stop words
# aded a lemmatization step for better accuracy
df['cleaned_text'] = df['raw_text'].apply(pre_process_document)

KeyError: 'raw_text'

In [None]:
df.head()

Unnamed: 0,raw_text,cleaned_text
0,"In most cases , the clock on the statute of li...",case clock statute limitation start run day in...
1,"Topline Travis Scott , a rapper and founder of...",Topline Travis Scott rapper founder Astroworld...
2,Topline Iran blamed Israel for an airstrike th...,topline Iran blame Israel airstrike destroy co...
3,Esprit was once seen as a trendy brand in the ...,Esprit see trendy brand 1990 Jeffrey Greenberg...
4,The proximity of the publication of David Nich...,proximity publication David Nicholls sixth nov...


In [None]:
topic_model = BERTopic(embedding_model=nlp, nr_topics=38)
topics, probs = topic_model.fit_transform(df['cleaned_text'])

In [None]:
fig = topic_model.visualize_topics()
fig.show()

In [None]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,277,-1_company_work_year_business,"[company, work, year, business, say, million, ...",[getty competitive uncertain market job seeker...
1,0,19,0_word_hint_today_spangram,"[word, hint, today, spangram, answer, group, s...",[today NYT Strands hint answer credit New York...
2,1,114,1_credit_loan_card_account,"[credit, loan, card, account, rate, fee, cd, d...",[repair replace roof major expense homeowner c...
3,2,79,2_blood_dr_symptom_pressure,"[blood, dr, symptom, pressure, say, treatment,...",[untreated hypotension increase risk health co...
4,3,24,3_chart_album_billboard_song,"[chart, album, billboard, song, band, week, ta...",[NEW YORK NY JULY 29 L R keyboard player Jonat...
5,4,15,4_climate_record_temperature_researcher,"[climate, record, temperature, researcher, cha...",[Topline Heat wave propel climate change cost ...
6,5,31,5_city_eclipse_travel_trip,"[city, eclipse, travel, trip, rank, number, li...",[look good place live United States cheap plac...
7,6,84,6_game_win_player_season,"[game, win, player, season, play, ufc, team, m...",[LOS ANGELES CALIFORNIA MARCH 24 James Harden ...
8,7,42,7_trump_trial_case_president,"[trump, trial, case, president, court, crimina...",[topline President Donald Trump trial begin Ap...
9,8,33,8_government_say_state_bill,"[government, say, state, bill, minister, party...",[welcome reader Afternoon Update Australia pou...


In [10]:
df = pd.read_json('./top_15_per_category.json', encoding='utf-8')

In [11]:
# get the top 15 words per topic and see how it does with the trained data topics
topic_info = topic_model.get_topic_info()  
top_words_per_topic = [topic_model.get_topic(topic) for topic in range(topic_info.topic.nunique()) if topic != -1] 
top_15_words = {topic: words[:15] for topic, words in top_words_per_topic if words is not None}

NameError: name 'topic_model' is not defined