In [1]:
import pandas as pd
import re
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
import torch

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


In [3]:
url = "https://storage.googleapis.com/adsp-nlp-open/data/nlp_a_6_news.json"
news_df = pd.read_json(url, orient="records", lines=True).sample(n=5000, random_state=123)
news_df.head()

Unnamed: 0,url,date,language,title,text
3539,https://www.seattletimes.com/nation-world/fath...,2022-03-04,en,Father mourns son after shelling on Ukraine so...,Father mourns son after shelling on Ukraine so...
1151,http://oaklandnewsnow.com/bright-spot-free-yog...,2022-02-24,en,Bright Spot: Free yoga classes for veterans | ...,Bright Spot: Free yoga classes for veterans | ...
2789,https://finance.yahoo.com/news/liquid-media-pr...,2022-06-04,en,Liquid Media Provides Corporate Update,Liquid Media Provides Corporate Update ...
6205,https://www.benzinga.com/m-a/22/01/25220259/ko...,2022-01-25,en,Kornit Digital Ltd. (NYSE:KRNT) - Kornit Digit...,Kornit Digital Ltd. (NYSE:KRNT) - Kornit Digi...
1367,https://federalnewsnetwork.com/u-s-news/2022/0...,2022-01-21,en,Prosecutor: Alex Murdaugh now faces 75 charges...,Prosecutor: Alex Murdaugh now faces 75 charge...


In [4]:
pd.set_option('display.max_colwidth', None)
news_df["text"].iloc[0]

'Father mourns son after shelling on Ukraine soccer field | The Seattle Times  Skip to contentCoronavirusLocal NewsTraffic LabLaw & JusticeLocal PoliticsEducationEducation LabEastsideEnvironmentHealthDataMental HealthProject HomelessTimes WatchdogBusiness & TechBoeing & AerospaceAmazonMicrosoftTechnologyReal EstateEconomyArtificial IntelligenceNation & WorldNation & World PoliticsOdditiesSportsSeahawksHuskiesCougarsMarinersSoundersStormKrakenHigh School SportsOn TV/RadioEntertainmentMoviesBooksMusicTheaterClassical MusicTV/StreamingComicsGames & PuzzlesHoroscopesLifeFood & DrinkTravel & OutdoorsWellnessPetsRant & RavePacific NW MagazineHomes & Real EstateOpinionEditorialsLetters to the EditorDavid HorseyFree PressVideoPhotographyObituariesNews ObituariesPaid ObituariesNewslettersPrint ReplicaToday’s PaperInside The TimesExploreJobsAutosNewslettersLog InSubscribeNation & WorldCoronavirusLocalBizSportsEntertainmentLifeHomesOpinion|JobsShopExploreNationWorldNation & World PoliticsOddities

In [None]:
def clean_news_article(text):
    if pd.isnull(text):
        return ""
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'(subscribe|sign in|home|events|advertise|contact|e-edition|privacy|terms of use|customer service|return to top|facebook|twitter|linkedin|email|more news|most popular|currently reading)', '', text, flags=re.IGNORECASE)
    text = re.sub(r'(written by|view comments|submit an obituary|print archives|careers|about|©\d{4}.*?|all rights reserved)', '', text, flags=re.IGNORECASE)
    text = re.sub(r'(read more|read the full story|read more on this story|read more about this topic|read more from this author|read more from this publication)', '', text, flags=re.IGNORECASE)
    text = re.sub(r'(user|cookie|website|description available)', '', text, flags=re.IGNORECASE)
    text = re.sub(r'(privacy policy|terms of service|advertising|contact us|about us|site map|accessibility)', '', text, flags=re.IGNORECASE)
    text = re.sub(r'(jpg|video|resize|com|https|media|quality|link|new tab|tab|com|subscription)', '', text, flags=re.IGNORECASE)
    text = re.sub(r'(td|margin|color|package|width|font|menu|linux|padding|max width|privacy policy|terms of service|advertising|about us|site map)', '', text, flags=re.IGNORECASE)
    text = re.sub(r'[•|→|«|»]', '', text)
    text = re.sub(r'\b[A-Z][a-z]{2,10}( [A-Z][a-z]{2,10}){1,4}\b(?=\s)', '', text)
    text = re.sub(r'\s{2,}', ' ', text)
    return text.strip()

In [6]:
news_df['cleaned_text'] = news_df['text'].apply(clean_news_article)

news_df["cleaned_text"].iloc[0]

'Father mourns son after shelling on Ukraine soccer field to contentCoronavirusTraffic LabLaw & JusticeLocal PoliticsEducationEducation LabEastsideEnvironmentHealthDataMental HealthProject lessTimes WatchdogBusiness & TechBoeing & AerospaceAmazonMicrosoftTechnologyEconomyArtificial IntelligenceNation & WorldNation & World PoliticsOdditiesSeahawksHuskiesCougarsMarinersSoundersStormKrakenHigh School On TV/RadioEntertainmentMoviesBooksMusicTheaterClassical MusicTV/StreamingComicsGames & PuzzlesHoroscopesLifeFood & DrinkTravel & OuoorsWellnessPetsRant & RavePacific NW Magazines & EditorialsLetters to the EditorDavid HorseyFree PressVideoPhotography Paid sPrint ReplicaToday’s PaperInside The TimesExploreJobsAutossLog InNation & WorldCoronavirusLocalBizEntertainmentLifesJobsShopExploreNationWorldNation & World PoliticsOdditiesAll SectionsLocalCoronavirusTraffic LabProject lessLaw & JusticeLocal PoliticsEducationEducation LabEastsideEnvironmentNorthwesataHealthTimes WatchdogMental HealthInsid

In [7]:
device = "cuda" if torch.cuda.is_available() else "cpu"
embedding_model = SentenceTransformer("thenlper/gte-small", device=device)
umap_model = UMAP(n_components=20, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=50, metric='euclidean', cluster_selection_method='eom')
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 2))
ctfidf_model = ClassTfidfTransformer()

In [8]:
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
)

In [9]:
topics, probs = topic_model.fit_transform(news_df["cleaned_text"].tolist())

In [10]:
#topic_model.visualize_topics(topics=topics)

In [11]:
pd.reset_option('display.max_colwidth')
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1440,-1_2022_video_com_jpg,"[2022, video, com, jpg, resize, 07, https, med...",[2022 #NBAAllStar captured on film 🎞📸 Oakland ...
1,0,482,0_new_iphone_pro_apple,"[new, iphone, pro, apple, best, windows, 2022,...",[sales vs Day: deals to buy now and what to wa...
2,1,473,1_shares_nasdaq_stock_nyse,"[shares, nasdaq, stock, nyse, company, equitie...",[Inc. in Inc. (NYSE:HWM) - ETF to main content...
3,2,349,2_said_musictv_musictv streamingcomicsgames_st...,"[said, musictv, musictv streamingcomicsgames, ...",[Head of US has COVID-19 to contentCoronavirus...
4,3,343,3_2022_pm_game_games,"[2022, pm, game, games, new, pc, gaming, xbox,...",[manages to get 30GB/$50 due to new UsSyrupCas...
5,4,330,4_data_cloud_security_new,"[data, cloud, security, new, prwire, 2022, bus...",[DocuSign 2023 Financial ResultsSkip to conten...
6,5,259,5_inflation_fed_canada_bloomberg,"[inflation, fed, canada, bloomberg, rates, mar...","[On Crypto, - BNN BloombergAre you looking for..."
7,6,259,6_nbcchicago com_nbcchicago_jpg_resize,"[nbcchicago com, nbcchicago, jpg, resize, medi...",[kept two foster dogs after crashing his plane...
8,7,171,7_india_2022_covid_said,"[india, 2022, covid, said, rs, live, delhi, ne...",[IAC aircraft warship carrier begins next of s...
9,8,163,8_link opens_link_opens_new tab,"[link opens, link, opens, new tab, tab, opens ...",[This Week's 10 PEOPLE.com Skip to content PEO...


In [12]:
pd.set_option('display.max_colwidth', None)

topic_info = topic_model.get_topic_info()
topic_info[["Topic", "Count", "Name", "Representation"]]

Unnamed: 0,Topic,Count,Name,Representation
0,-1,1440,-1_2022_video_com_jpg,"[2022, video, com, jpg, resize, 07, https, media, 2022 07, quality]"
1,0,482,0_new_iphone_pro_apple,"[new, iphone, pro, apple, best, windows, 2022, tab, opens new, opens]"
2,1,473,1_shares_nasdaq_stock_nyse,"[shares, nasdaq, stock, nyse, company, equities, quarter, rating, llc, price]"
3,2,349,2_said_musictv_musictv streamingcomicsgames_streamingcomicsgames,"[said, musictv, musictv streamingcomicsgames, streamingcomicsgames, justicelocal, politicseducationeducation, radioentertainmentmoviesbooksmusictheaterclassical musictv, radioentertainmentmoviesbooksmusictheaterclassical, justicelocal politicseducationeducation, tv radioentertainmentmoviesbooksmusictheaterclassical]"
4,3,343,3_2022_pm_game_games,"[2022, pm, game, games, new, pc, gaming, xbox, microsoft, 11]"
5,4,330,4_data_cloud_security_new,"[data, cloud, security, new, prwire, 2022, business, customers, technology, gaap]"
6,5,259,5_inflation_fed_canada_bloomberg,"[inflation, fed, canada, bloomberg, rates, market, oil, daily chase, ceo, stocks]"
7,6,259,6_nbcchicago com_nbcchicago_jpg_resize,"[nbcchicago com, nbcchicago, jpg, resize, media nbcchicago, star, 2022, media, quality, https]"
8,7,171,7_india_2022_covid_said,"[india, 2022, covid, said, rs, live, delhi, new, 19, ipl]"
9,8,163,8_link opens_link_opens_new tab,"[link opens, link, opens, new tab, tab, opens new, people, new, people com, subscription]"
