Refactoring current clustering to better group articles into topics on the daily news dashboard
Attempting to use BERTopic which relies on HDBSCAN

In [12]:
import pandas as pd
import json
from bertopic import BERTopic
import re
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

nltk.download('stopwords')
pd.set_option("display.max_colwidth", 200)
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jared\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
# load articles
with open('../data/articles_data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
    article_data = pd.DataFrame(data)
    print(f"Loaded {len(article_data)} articles.")

Loaded 1166 articles.


In [14]:
# drop duplicates from crawl code
article_data = article_data.drop_duplicates(subset='url', keep='first')
article_data = article_data.drop_duplicates(subset=['title', 'source'], keep='first')
print(f"Dropped duplicates. {len(article_data)} articles remaining.")

Dropped duplicates. 545 articles remaining.


In [15]:
# clean text
def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'\W+', ' ', text)
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

article_data['cleaned_content'] = article_data['content'].apply(clean_text)
article_data['snippet'] = article_data['title'] + " " + article_data['cleaned_content'].str[:300]


In [16]:
# now apply bertopic
vectorizer_model = CountVectorizer(ngram_range=(1, 2))
topic_model = BERTopic(
    calculate_probabilities=True,
    min_topic_size=5,
    vectorizer_model=vectorizer_model
)

topics, probs = topic_model.fit_transform(article_data['snippet'])
article_data["topic"] = topics

In [17]:
topic_labels = topic_model.generate_topic_labels(nr_words=5, separator=" ")
topic_model.set_topic_labels(topic_labels)
topic_info = topic_model.get_topic_info()

In [18]:
print(topic_info)

    Topic  Count                                              Name  \
0      -1    114                   -1_new_trump_the_administration   
1       0     53                    0_newsletter_book_the_atlantic   
2       1     50                         1_time_dear_life_question   
3       2     43                  2_ukraine_russia_trump_president   
4       3     32                     3_musk_elon_elon musk_federal   
5       4     28               4_court_supreme_supreme court_judge   
6       5     22             5_trump_donald_donald trump_president   
7       6     19           6_germany_merz_friedrich_friedrich merz   
8       7     17                           7_award_actor_sag_oscar   
9       8     16                        8_police_mexico_said_texas   
10      9     14                              9_bird_flu_egg_virus   
11     10     13                  10_election_the_america_woodruff   
12     11     13  11_park_national park_yosemite_yosemite national   
13     12     11    

In [19]:
# def remove_articles_by_source(input_file, output_file, source_to_remove):
#     # Read the JSON data
#     with open(input_file, 'r', encoding='utf-8') as f:
#         articles_data = json.load(f)
    
#     # Filter out articles with the specified source
#     filtered_articles = [
#         article for article in articles_data if article.get('source') != source_to_remove
#     ]
    
#     # Save the cleaned data back to a new file (or overwrite the original file)
#     with open(output_file, 'w', encoding='utf-8') as f:
#         json.dump(filtered_articles, f, indent=4, ensure_ascii=False)
    
#     print(f"Removed articles from source: {source_to_remove}")
#     print(f"Remaining articles: {len(filtered_articles)}")

# path = r"C:\Users\jared\OneDrive\Desktop\bitewise\python-api\data\articles_data.json"
# source_to_remove = 'https://arstechnica.com'
# remove_articles_by_source(path, path, source_to_remove)

In [34]:
top_15_topics = topic_info.sort_values(by="Count", ascending=False).index

filtered_topics = []
crossword_words = ['newsletter', 'book', 'atlantic', 'best', 'the', 'weekday', 'puzzle', 'new york', 'crossword']   
ad_words = ['ad', 'video', 'content', 'video content', 'loading video', 'ad audio', 'relevant ad', 'advertisement']
advice_words = ['time', 'dear', 'life', 'question', 'advice', 'prudence']
bad_sets = [crossword_words, ad_words, advice_words]
for topic in top_15_topics:

    # eliminate topics where most articles are from same source
    articles_in_topic = article_data[article_data["topic"] == topic]
    source_counts = articles_in_topic["source"].value_counts(normalize=True)
    if source_counts.max() >= 0.5:
        print(f"Removed topic: {topic} due to same source bias")
        continue

    # eliminate topics with garbage key words
    topic_representation = topic_info[topic_info['Topic'] == topic]['Representation'].values
    if len(topic_representation) > 0:
        topic_words = set(topic_representation[0])
        remove_topic = False
        for bad_set in bad_sets:
            unwanted_count = len(topic_words.intersection(bad_set))
            if unwanted_count >= 4:
                remove_topic = True
                print(f"Removed topic {topic}: {topic_words}, due to garbage key words")
                break

        if remove_topic:
            continue
        

    filtered_topics.append(topic)
print("Filtered Topics:", filtered_topics)

Removed topic 0: {'atlantic', 'the', 'life', 'see', 'morning', 'new', 'sign', 'book', 'newsletter', 'best'}, due to garbage key words
Removed topic 1: {'life', 'question', 'time', 'slate', 'prudence', 'dear', 'cancer', 'my', 'advice', 'year'}, due to garbage key words
Removed topic: 12 due to same source bias
Removed topic: 17 due to same source bias
Removed topic: 18 due to same source bias
Removed topic: 22 due to same source bias
Removed topic: 23 due to same source bias
Removed topic: 24 due to same source bias
Filtered Topics: [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 15, 13, 14, 16, 19, 20, 21, 25]


In [35]:
representative_articles = {}
# Retrieve representative articles for each top 15 topic
for topic in filtered_topics[:15]:
    representative_docs = topic_model.get_representative_docs(topic)
    rep_articles_info = article_data[article_data['snippet'].isin(representative_docs)][['title', 'url', 'source', 'topic']]
    representative_articles[topic] = rep_articles_info

# Print representative articles and their count for each topic
for topic, articles_in_topic in representative_articles.items():
    print(f"\n🔹 Topic {topic}")
    print(articles_in_topic[['title', 'url']].head())


🔹 Topic 2
                                                                                                title  \
175         Ukraine marks 3 years since Russia's invasion amid concern about Trump's stand on the war   
179  U.N. rejects U.S. resolution that urged end to Ukraine war without mentioning Russian aggression   
186             US joins Russia to vote against UN resolution condemning Russia’s war against Ukraine   

                                                                                     url  
175      https://www.cbsnews.com//news/ukraine-3-year-russia-invasion-anniversary-trump/  
179                    https://www.cbsnews.com/news/un-resolution-us-ukraine-russia-war/  
186  https://www.cnn.com//2025/02/24/politics/us-joins-russia-ukraine-un-vote/index.html  

🔹 Topic 3
                                                                                                    title  \
11                   Federal workers sue over Musk's threat to fire if they don't ex

In [None]:
rep_article_urls = {
    article["url"]
    for articles in representative_articles.values()
    for _, article in articles.iterrows()
}
rep_article_urls

In [41]:
# Define the fields to keep
fields_to_keep = {"url", "title", "source", "content", "imageUrl", "authors", "time"}

cluster_groups = {}

# Organize articles by cluster
for _, article in article_data.iterrows():
    cluster_id = article["topic"]
    if cluster_id not in cluster_groups:
        cluster_groups[cluster_id] = []
    
    # Filter article fields
    filtered_article = {key: article[key] for key in fields_to_keep if key in article}
    cluster_groups[cluster_id].append(filtered_article)

# Ensure the first three articles per cluster are representatives
for cluster_id, articles in cluster_groups.items():
    rep_for_cluster = [article for article in articles if article["url"] in rep_article_urls]
    non_rep_articles = [article for article in articles if article["url"] not in rep_article_urls]

    # Keep representatives first, followed by others
    cluster_groups[cluster_id] = rep_for_cluster[:3] + non_rep_articles

# Return only clustered_articles
response = {
    "clustered_articles": cluster_groups
}
cluster_groups


{5: [{'imageUrl': 'https://www.vice.com/wp-content/uploads/sites/2/2025/01/donald-trump-pulls-the-united-states-out-of-who.jpg?w=1200',
   'authors': ['Kyle Phillippi',
    'Stephen Andrew Galiher',
    'Sammi Caramela',
    'Shaun Cichacki',
    'Luis Prada',
    '.Wp-Block-Savage-Platform-Post-Byline Font-Family Var --Wp--Custom--Typography--Font-Family--Secondary',
    'Font-Size Var --Wp--Custom--Typography--Font-Size--Sm',
    'Font-Weight Var --Wp--Custom--Typography--Font-Weight--Bold',
    'Line-Height Var --Wp--Custom--Typography--Line-Height--Xxs',
    '.Wp-Block-Savage-Platform-Post-Byline A Font-Weight Var --Wp--Custom--Typography--Font-Weight--Black'],
   'source': 'https://news.vice.com/',
   'url': 'https://www.vice.com/en/article/donald-trump-announces-us-withdrawal-from-world-health-organization/',
   'title': 'Donald Trump Announces US Withdrawal From World Health Organization',
   'time': '2025-01-21T15:28:15',
   'content': 'President Donald Trump utilized his first