Refactoring current clustering to better group articles into topics on the daily news dashboard
Attempting to use BERTopic which relies on HDBSCAN

In [157]:
import pandas as pd
import json
from bertopic import BERTopic
import re
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

nltk.download('stopwords')
pd.set_option("display.max_colwidth", 200)
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jared\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [158]:
# load articles
with open('../data/articles_data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
    article_data = pd.DataFrame(data)
    print(f"Loaded {len(article_data)} articles.")

Loaded 1166 articles.


In [159]:
# drop duplicates from crawl code
article_data = article_data.drop_duplicates(subset='url', keep='first')
article_data = article_data.drop_duplicates(subset=['title', 'source'], keep='first')
print(f"Dropped duplicates. {len(article_data)} articles remaining.")

Dropped duplicates. 545 articles remaining.


In [160]:
# clean text
def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'\W+', ' ', text)
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

article_data['cleaned_content'] = article_data['content'].apply(clean_text)
article_data['snippet'] = article_data['title'] + " " + article_data['cleaned_content'].str[:300]


In [161]:
# now apply bertopic
vectorizer_model = CountVectorizer(ngram_range=(1, 2))
topic_model = BERTopic(
    calculate_probabilities=True,
    min_topic_size=5,
    vectorizer_model=vectorizer_model
)

topics, probs = topic_model.fit_transform(article_data['snippet'])
article_data["topic"] = topics

In [162]:
topic_labels = topic_model.generate_topic_labels(nr_words=5, separator=" ")
topic_model.set_topic_labels(topic_labels)
topic_info = topic_model.get_topic_info()

In [163]:
print(topic_info)

    Topic  Count                                       Name  \
0      -1    151                        -1_the_new_time_one   
1       0     42           0_ukraine_russia_trump_president   
2       1     34              1_musk_elon_federal_elon musk   
3       2     30      2_trump_donald_donald trump_president   
4       3     28                   3_dear_my_question_slate   
5       4     23       4_newsletter_atlantic_weekday_puzzle   
6       5     17              5_police_mexico_texas_newborn   
7       6     16                    6_award_actor_sag_oscar   
8       7     16    7_germany_merz_friedrich_friedrich merz   
9       8     14            8_america_the_election_woodruff   
10      9     14     9_abortion_supreme_supreme court_court   
11     10     13                 10_tv_show_baldwin_reality   
12     11     12                    11_ai_robot_human_model   
13     12     11          12_flack_roberta_roberta flack_88   
14     13     11                   13_flu_egg_bird_bird

In [164]:
# def remove_articles_by_source(input_file, output_file, source_to_remove):
#     # Read the JSON data
#     with open(input_file, 'r', encoding='utf-8') as f:
#         articles_data = json.load(f)
    
#     # Filter out articles with the specified source
#     filtered_articles = [
#         article for article in articles_data if article.get('source') != source_to_remove
#     ]
    
#     # Save the cleaned data back to a new file (or overwrite the original file)
#     with open(output_file, 'w', encoding='utf-8') as f:
#         json.dump(filtered_articles, f, indent=4, ensure_ascii=False)
    
#     print(f"Removed articles from source: {source_to_remove}")
#     print(f"Remaining articles: {len(filtered_articles)}")

# path = r"C:\Users\jared\OneDrive\Desktop\bitewise\python-api\data\articles_data.json"
# source_to_remove = 'https://arstechnica.com'
# remove_articles_by_source(path, path, source_to_remove)

In [165]:
print(topic_info['Representation'])

0                                                                           [the, new, time, one, to, of, year, trump, news, is]
1                                         [ukraine, russia, trump, president, war, ukrainian, invasion, putin, kyiv, resolution]
2                                              [musk, elon, federal, elon musk, employee, email, week, worker, last week, trump]
3               [trump, donald, donald trump, president, house, president donald, executive order, executive, order, washington]
4                                      [dear, my, question, slate, dear prudence, prudence, advice column, advice, column, year]
5            [newsletter, atlantic, weekday, puzzle, see, sign, morning, newsletter subscription, every weekday, privacy policy]
6                                                        [police, mexico, texas, newborn, said, migrant, guard, boat, state, in]
7                                               [award, actor, sag, oscar, film, screen actor, de

In [None]:
top_15_topics = topic_info.sort_values(by="Count", ascending=False).index

filtered_topics = []
crossword_words = ['newsletter', 'book', 'atlantic', 'best', 'the', 'weekday', 'puzzle', 'new york', 'crossword']   
ad_words = ['ad', 'video', 'content', 'video content', 'loading video', 'ad audio', 'relevant ad', 'advertisement']
bad_sets = [crossword_words, ad_words]
for topic in top_15_topics:

    # eliminate topics where most articles are from same source
    articles_in_topic = article_data[article_data["topic"] == topic]
    source_counts = articles_in_topic["source"].value_counts(normalize=True)
    if source_counts.max() >= 0.5:
        print(f"Removed topic {topic}: {topic_words}, due to same source bias")
        continue

    # eliminate topics with garbage key words
    topic_representation = topic_info[topic_info['Topic'] == topic]['Representation'].values
    if len(topic_representation) > 0:
        topic_words = set(topic_representation[0])
        remove_topic = False
        for bad_set in bad_sets:
            unwanted_count = len(topic_words.intersection(bad_set))
            if unwanted_count >= 4:
                remove_topic = True
                print(f"Removed topic {topic}: {topic_words}, due to garbage key words")
            break

        if remove_topic:
            continue

    filtered_topics.append(topic)
    print("Filtered Topics:", filtered_topics)

Filtered Topics: [0]
Filtered Topics: [0, 1]
Filtered Topics: [0, 1, 2]
Filtered Topics: [0, 1, 2, 3]
Removed topic 4: {'question', 'dear', 'prudence', 'advice column', 'dear prudence', 'advice', 'my', 'column', 'year', 'slate'}, due to same source bias
Filtered Topics: [0, 1, 2, 3, 5]
Filtered Topics: [0, 1, 2, 3, 5, 6]
Filtered Topics: [0, 1, 2, 3, 5, 6, 7]
Filtered Topics: [0, 1, 2, 3, 5, 6, 7, 8]
Filtered Topics: [0, 1, 2, 3, 5, 6, 7, 8, 9]
Filtered Topics: [0, 1, 2, 3, 5, 6, 7, 8, 9, 10]
Filtered Topics: [0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11]
Filtered Topics: [0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12]
Removed topic 15: {'88', 'roberta flack', 'killing', 'softly', 'singer', 'roberta', 'died', 'grammy', 'song', 'flack'}, due to same source bias
Filtered Topics: [0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 16]


In [172]:
representative_articles = {}
# Retrieve representative articles for each top 15 topic
for topic in filtered_topics[:15]:
    representative_docs = topic_model.get_representative_docs(topic)
    rep_articles_info = article_data[article_data['snippet'].isin(representative_docs)][['title', 'url', 'source', 'topic']]
    representative_articles[topic] = rep_articles_info

# Print representative articles and their count for each topic
for topic, articles_in_topic in representative_articles.items():
    print(f"\n🔹 Topic {topic}")
    print(articles_in_topic[['title', 'url']].head())


🔹 Topic 0
                                                                                                     title  \
179       U.N. rejects U.S. resolution that urged end to Ukraine war without mentioning Russian aggression   
186                  US joins Russia to vote against UN resolution condemning Russia’s war against Ukraine   
499  UN rejects U.S. resolution urging an end to the war in Ukraine without mentioning Moscow’s aggression   

                                                                                                                                        url  
179                                                                       https://www.cbsnews.com/news/un-resolution-us-ukraine-russia-war/  
186                                                     https://www.cnn.com//2025/02/24/politics/us-joins-russia-ukraine-un-vote/index.html  
499  https://www.pbs.org/newshour/world/un-rejects-u-s-resolution-urging-an-end-to-the-war-in-ukraine-without-mentioning-m