In [5]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from hdbscan import HDBSCAN
from umap import UMAP

# --- 1. PREPROCESSING SETUP ---
print("Setting up cleaning tools...")
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
custom_stops = {'page', 'http', 'https', 'www', 'com', 'paragraph', 'table', 'figure', 'section'} 
stop_words.update(custom_stops)

def clean_text(text):
    if not isinstance(text, str): return ""
    # Lowercase & remove non-text characters
    text = text.lower()
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenize, remove stopwords, lemmatize
    words = [lemmatizer.lemmatize(w) for w in text.split() if w not in stop_words and len(w) > 2]
    return " ".join(words)

# --- 2. LOAD DATA ---
csv_path = "output.csv"
print(f"Loading {csv_path}...")
df = pd.read_csv(csv_path)
df = df.dropna(subset=['text'])

# Apply cleaning
df['clean_text'] = df['text'].apply(clean_text)
df = df[df['clean_text'].str.len() > 5] # Remove empty results
docs = df['clean_text'].tolist()

print(f"Processing {len(docs)} documents...")

# --- 3. CONFIGURE "HYPER-GRANULAR" SETTINGS ---

# UMAP: Local Focus
# n_neighbors=2 forces the model to only look at very immediate similarities.
# This prevents it from grouping vaguely similar ideas together.
umap_model = UMAP(
    n_neighbors=2,       # EXTREME GRANULARITY (Default is 15)
    n_components=5, 
    min_dist=0.0, 
    metric='cosine'
)

# HDBSCAN: Tiny Clusters
# min_cluster_size=3 means a topic can be created with just 3 paragraphs.
# min_samples=1 ensures almost NO data is treated as noise/outliers.
hdbscan_model = HDBSCAN(
    min_cluster_size=3,  # EXTREME GRANULARITY (Default is 10)
    min_samples=1,       # PREVENTS OUTLIERS (Default is usually larger)
    metric='euclidean', 
    cluster_selection_method='eom', 
    prediction_data=True
)

# Vectorizer: Allow rare words to define topics
vectorizer_model = CountVectorizer(stop_words="english", min_df=2)

# --- 4. RUN MODEL ---
print("Fitting Hyper-Granular Model (this may generate hundreds of topics)...")

topic_model = BERTopic(
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    language="english",
    verbose=True,
    nr_topics=None # CRITICAL: Do not merge topics
)

topics, probs = topic_model.fit_transform(docs)

# --- 5. REDUCE REMAINING OUTLIERS ---
# With min_samples=1, outliers should be low, but this forces 100% assignment
try:
    new_topics = topic_model.reduce_outliers(docs, topics)
    topic_model.update_topics(docs, topics=new_topics)
    print("Outliers reduced successfully.")
except Exception as e:
    print("Skipping outlier reduction (likely almost 0 outliers exists).")

# --- 6. EXPORT RESULTS ---
freq = topic_model.get_topic_info()
print(f"\nTotal Topics Found: {len(freq) - 1}")

# Export to CSV
freq.to_csv("hyper_granular_topics.csv", index=False)
print("Saved to 'hyper_granular_topics.csv'")

# Visualize top 50 (since you will have many)
fig = topic_model.visualize_barchart(top_n_topics=50)
fig.write_html("hyper_granular_chart.html")
print("Visualization saved.")

Setting up cleaning tools...
Loading output.csv...


2026-01-15 11:10:14,351 - BERTopic - Embedding - Transforming documents to embeddings.


Processing 528 documents...
Fitting Hyper-Granular Model (this may generate hundreds of topics)...


Batches:   0%|          | 0/17 [00:00<?, ?it/s]

2026-01-15 11:10:25,844 - BERTopic - Embedding - Completed ✓
2026-01-15 11:10:25,847 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-15 11:10:27,032 - BERTopic - Dimensionality - Completed ✓
2026-01-15 11:10:27,034 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-15 11:10:27,129 - BERTopic - Cluster - Completed ✓
2026-01-15 11:10:27,139 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-15 11:10:27,269 - BERTopic - Representation - Completed ✓
100%|██████████| 1/1 [00:00<00:00, 126.51it/s]


Outliers reduced successfully.

Total Topics Found: 107
Saved to 'hyper_granular_topics.csv'
Visualization saved.


In [3]:
# --- 5. Get All Topic Information ---

# Get the full dataframe of topic info
topic_info = topic_model.get_topic_info()

# Method A: Save to CSV (Recommended for viewing all topics easily)
topic_info.to_csv("all_topics_list.csv", index=False)
print(f"Success! Exported {len(topic_info)} topics to 'all_topics_list.csv'.")

# Method B: Print all topics to the console
# We change pandas settings to ensure rows aren't hidden
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

print("\n--- ALL TOPICS ---")
print(topic_info[['Topic', 'Count', 'Name']])

# Reset pandas options (optional)
pd.reset_option('display.max_rows')

Success! Exported 14 topics to 'all_topics_list.csv'.

--- ALL TOPICS ---
    Topic  Count                                               Name
0      -1    206             -1_quantum_technology_research_science
1       0     56               0_canada_canadian_quantum_government
2       1     49                  1_talent_skill_workforce_training
3       2     39            2_quantum_technology_computing_computer
4       3     28    3_postquantum_cryptography_communication_secure
5       4     23                  4_quantum_regulatory_business_use
6       5     23           5_computing_computer_quantum_application
7       6     22  6_international_collaboration_partner_partnership
8       7     20               7_quantum_bmwk_communication_imaging
9       8     16                       8_government_set_qi_strategy
10      9     13                   9_security_risk_national_control
11     10     13                      10_program_million_year_grant
12     11     10         11_direct_availab