1. Initialize the libraries

In [41]:
import os
from dotenv import load_dotenv
from defeed_index import Registry, ActivityRepository, ActivityRepositoryConfig

# Load variables from a local .env file (if present) into the process environment
# so the DB_* lookups below can find them.
load_dotenv()

# DB_PORT is the only setting that is converted here. A missing or blank value
# would otherwise surface as an opaque int() failure, so stop early with a
# message that says what to fix.
db_port = os.getenv('DB_PORT')
if db_port is None or not db_port.strip():
    raise RuntimeError(
        "DB_PORT is not set; define it in the environment or in the .env file"
    )

# Connection settings come from the environment only; nothing is hardcoded.
# The remaining values are passed through as-is (None when unset).
repository = ActivityRepository(ActivityRepositoryConfig(
    host=os.getenv('DB_HOST'),
    port=int(db_port),
    database=os.getenv('DB_NAME'),
    user=os.getenv('DB_USER'),
    password=os.getenv('DB_PASSWORD')
))

# The registry is the object every later cell works through.
registry = Registry(repository)

2. Seed the registry with the latest activities

In [42]:
# Seed the registry. Judging by the recorded output of this cell, this appears to
# encode the documents in batches and fit a BERTopic model.
registry.seed()
# Last expression of the cell: displays the topic model's topic-info table.
registry.topic_model.get_topic_info()

Batches: 100%|██████████| 71/71 [00:09<00:00,  7.27it/s]
2025-09-23 12:29:14,058 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-09-23 12:29:20,830 - BERTopic - Dimensionality - Completed ✓
2025-09-23 12:29:20,831 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-09-23 12:29:20,907 - BERTopic - Cluster - Completed ✓
2025-09-23 12:29:20,907 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-09-23 12:29:21,015 - BERTopic - Representation - Completed ✓
2025-09-23 12:29:21,016 - BERTopic - Topic reduction - Reducing number of topics
2025-09-23 12:29:21,025 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-09-23 12:29:22,000 - BERTopic - Representation - Completed ✓
2025-09-23 12:29:22,001 - BERTopic - Topic reduction - Reduced number of topics from 3 to 3


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1195,-1_reinforcement learning_optimization_adaptiv...,"[reinforcement learning, optimization, adaptiv...",[Bayesian Ego-graph inference for Networked Mu...
1,0,621,0_3d gaussian_gaussian splatting_multimodal_sp...,"[3d gaussian, gaussian splatting, multimodal, ...",[MedGS: Gaussian Splatting for Multi-Modal 3D ...
2,1,450,1_linux_os_ubuntu_nvidia,"[linux, os, ubuntu, nvidia, windows, macos, cr...",[End of 10 event success - Both.org: End of 10...


In [47]:

# Encode all registry documents with the registry's embedding model and keep the
# result in `embeddings` so the next cell can pass it to fit_transform.
embeddings = registry.embedding_model.encode(registry.documents, show_progress_bar=True)

Batches: 100%|██████████| 71/71 [00:07<00:00,  9.33it/s]


In [52]:
# Fit the registry's topic model on the documents, passing the embeddings
# computed in the previous cell.
# NOTE(review): this refits registry.topic_model directly; whether registry.topics
# is refreshed by this call cannot be seen from here -- confirm before relying on it.
topics, probabilities = registry.topic_model.fit_transform(registry.documents, embeddings)
# Last expression of the cell: the distinct topic ids returned by this fit.
set(topics)

2025-09-23 12:34:06,974 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-09-23 12:34:13,668 - BERTopic - Dimensionality - Completed ✓
2025-09-23 12:34:13,669 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-09-23 12:34:13,720 - BERTopic - Cluster - Completed ✓
2025-09-23 12:34:13,721 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-09-23 12:34:13,795 - BERTopic - Representation - Completed ✓
2025-09-23 12:34:13,796 - BERTopic - Topic reduction - Reducing number of topics
2025-09-23 12:34:13,800 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-09-23 12:34:15,269 - BERTopic - Representation - Completed ✓
2025-09-23 12:34:15,270 - BERTopic - Topic reduction - Reduced number of topics from 3 to 3


{-1, 0, 1}

In [53]:
# Display the topic-info table again after the manual refit in the previous cell,
# for comparison with the table shown right after seeding.
registry.topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1195,-1_reinforcement learning_optimization_adaptiv...,"[reinforcement learning, optimization, adaptiv...",[Bayesian Ego-graph inference for Networked Mu...
1,0,621,0_3d gaussian_gaussian splatting_multimodal_sp...,"[3d gaussian, gaussian splatting, multimodal, ...",[MedGS: Gaussian Splatting for Multi-Modal 3D ...
2,1,450,1_linux_os_ubuntu_nvidia,"[linux, os, ubuntu, nvidia, windows, macos, cr...",[End of 10 event success - Both.org: End of 10...


3. List and visualize topics

In [40]:
import matplotlib.pyplot as plt
import datamapplot  # Required for BERTopic visualization

# Distinct topic ids currently held by the registry, computed once and reused below.
topic_ids = set(registry.topics)
print(f"Number of topics found: {len(topic_ids)}")
print(f"Topic distribution: {topic_ids}")

# Topic -1 is the noise/outlier bucket, so it is left out of the plot.
# Sorting gives the selection a stable order between runs.
actual_topics = sorted(t for t in topic_ids if t != -1)
print(f"Non-noise topics: {len(actual_topics)}")

if actual_topics:
    # NOTE(review): the NameError recorded under this cell is raised inside this
    # library call, not by the cell's own `import datamapplot`. Presumably BERTopic
    # could not import datamapplot when it was first loaded in this kernel; restart
    # the kernel after installing datamapplot and re-run from the top -- confirm.
    fig = registry.topic_model.visualize_document_datamap(
        registry.documents,
        topics=actual_topics,  # plot only the topics that actually exist
        width=1200,
        height=800
    )
    # Save through the returned figure instead of pyplot's implicit "current
    # figure", so the file always contains the datamap that was just created.
    fig.savefig("datamapplot.png", dpi=300, bbox_inches='tight')
    plt.show()
else:
    print("No valid topics found for visualization")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Number of topics found: 5
Topic distribution: {0, 1, 2, 3, -1}
Non-noise topics: 4


NameError: name 'datamapplot' is not defined