In [6]:
import os
import json
import spacy
import pandas as pd
from bertopic import BERTopic
from spacy_download import load_spacy


In [7]:
# Load the large English spaCy pipeline; load_spacy fetches the model first
# when it is not installed yet.
SPACY_MODEL_NAME = "en_core_web_lg"
nlp = load_spacy(SPACY_MODEL_NAME)

In [40]:
# Read the article JSON, name its single column 'raw_text', and reserve an
# empty 'cleaned_text' column that the preprocessing step fills in later.
df = (
    pd.read_json('./article_content.json', encoding='utf-8')
    .set_axis(['raw_text'], axis='columns')
    .assign(cleaned_text=None)
)
df.head()

Unnamed: 0,raw_text,cleaned_text
0,"In most cases , the clock on the statute of li...",
1,"Topline Travis Scott , a rapper and founder of...",
2,Topline Iran blamed Israel for an airstrike th...,
3,Esprit was once seen as a trendy brand in the ...,
4,The proximity of the publication of David Nich...,


In [42]:
def pre_process_document(text):
    """Lemmatize ``text`` and drop tokens that carry no topical signal.

    The text is run through the module-level spaCy pipeline ``nlp``. Stop
    words, punctuation and whitespace-only tokens are discarded; the lemmas
    of the remaining tokens are joined with single spaces.

    Args:
        text: Raw document text. Anything that is not a ``str`` (for example
            ``None`` or ``NaN`` from a missing row) is treated as empty.

    Returns:
        str: Space-separated lemmas, or ``""`` when nothing is kept.
    """
    if not isinstance(text, str):
        # The pipeline expects string input; a missing row should yield an
        # empty document instead of aborting the whole column-wise apply.
        return ""
    doc = nlp(text)
    kept_lemmas = [
        token.lemma_
        for token in doc
        # is_space: whitespace-only tokens are neither stop words nor
        # punctuation, so they need their own filter to stay out of the output.
        if not (token.is_stop or token.is_punct or token.is_space)
    ]
    return ' '.join(kept_lemmas)

In [43]:
# Topic quality suffers when the documents are fed in raw: earlier topics were
# dominated by stop words. Clean each article first (stop words and punctuation
# removed, tokens lemmatized) before handing the text to the topic model.
df['cleaned_text'] = df['raw_text'].map(pre_process_document)

In [44]:
# Preview the first rows again to check that 'cleaned_text' is now populated.
# NOTE(review): the saved output below shows inflected, lower-cased words
# ("cases", "blamed"), which does not look like lemma output — it may predate
# the lemmatization change; re-run the notebook to refresh it.
df.head(n=5)

Unnamed: 0,raw_text,cleaned_text
0,"In most cases , the clock on the statute of li...",cases clock statute limitations starts running...
1,"Topline Travis Scott , a rapper and founder of...",topline travis scott rapper founder astroworld...
2,Topline Iran blamed Israel for an airstrike th...,topline iran blamed israel airstrike destroyed...
3,Esprit was once seen as a trendy brand in the ...,esprit seen trendy brand 1990s jeffrey greenbe...
4,The proximity of the publication of David Nich...,proximity publication david nicholls sixth nov...


In [45]:
# Fit BERTopic on the preprocessed articles, reusing the spaCy pipeline `nlp`
# as the embedding model.
# NOTE(review): no random seed is set anywhere in the visible cells; if the
# topics must be reproducible across runs, BERTopic's docs suggest passing a
# seeded `umap_model` — confirm and add if required.
topic_model = BERTopic(embedding_model=nlp)
# fit_transform is documented to take a list of strings, so hand it a plain
# list rather than a pandas Series.
topics, probs = topic_model.fit_transform(df['cleaned_text'].tolist())

In [46]:
# Build the topic visualisation figure from the fitted model and display it.
# `fig` is kept as a name so the figure can be inspected or reused afterwards.
fig = topic_model.visualize_topics()
fig.show()

In [47]:
# Summary table of the fitted topics; the output below lists, per topic, its
# id, document count, generated name, top terms and representative documents.
# NOTE(review): topic -1 holds the most documents (240) — per BERTopic's docs
# this is the outlier group; confirm that share is acceptable.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,240,-1_data_company_said_new,"[data, company, said, new, work, year, health,...",[getty competitive uncertain market job seeker...
1,0,165,0_team_leaders_work_people,"[team, leaders, work, people, employees, busin...",[young professional starts entry level positio...
2,1,90,1_credit_loan_loans_account,"[credit, loan, loans, account, interest, rates...",[average size 225 square feet tiny homes offer...
3,2,89,2_game_season_ufc_players,"[game, season, ufc, players, win, wwe, league,...",[los angeles california march 24 james harden ...
4,3,82,3_blood_dr_says_symptoms,"[blood, dr, says, symptoms, pressure, weight, ...",[untreated hypotension increase risk health co...
5,4,76,4_business_customers_customer_information,"[business, customers, customer, information, m...",[making sale great better gain loyal customer ...
6,5,51,5_said_kennedy_government_state,"[said, kennedy, government, state, house, mini...",[8h ago 03.57 edt child bitten dingo k’gari ra...
7,6,49,6_hotel_new_cherry_city,"[hotel, new, cherry, city, york, resort, islan...",[record breaking year projected travel tourism...
8,7,42,7_trump_trial_court_case,"[trump, trial, court, case, criminal, presiden...",[topline president donald trump trial beginnin...
9,8,37,8_ai_generative_essay_mental,"[ai, generative, essay, mental, data, use, hea...",[arnold schwarzenegger terminator 1984 photo s...
