# Text Clustering Notebook

This template loads search text data, embeds it with **`sentence-transformers/all-MiniLM-L6-v2`**, clusters the embeddings using **HDBSCAN** _and_ **KMeans**, derives an intuitive name for each cluster (using an LLM) and finally visualizes everything with interactive **Plotly** scatter‑plots.

## Step 0 – Install dependencies (uncomment if needed)

In [56]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import hdbscan
import numpy as np
import umap
import plotly.express as px
import duckdb
from transformers import pipeline
from openai import OpenAI
from collections import defaultdict
import tiktoken
import random
import textwrap
from dotenv import load_dotenv
import os
load_dotenv()

True

## Step 1 – Load data

In [None]:
df = pd.read_csv('ccbd-data.csv')
df = df[df['originalTimestamp'] >= '2025-01-01']
print(df.shape)
print(df.columns)
df.head()

(9771, 44)
Index(['anonymousId', 'messageId', 'userId', 'originalTimestamp', 'userAgent',
       'page.path', 'page.referrer', 'page.search', 'page.title', 'page.url',
       'userAgentData.brands', 'userAgentData.mobile',
       'userAgentData.platform', 'business_id', 'sponsored_listing',
       'category', 'name', 'path', 'referrer', 'search', 'title', 'url',
       'category_properties', 'name_properties', 'button_name', 'location',
       'city', 'search_text', 'parish', 'traits', 'input_num', 'input_name',
       'email', 'input_value', 'address', 'other_category', 'description',
       'display_email', 'business_name', 'main_category', 'phone',
       'sub_category', 'website', 'event_type'],
      dtype='object')


Unnamed: 0,anonymousId,messageId,userId,originalTimestamp,userAgent,page.path,page.referrer,page.search,page.title,page.url,...,address,other_category,description,display_email,business_name,main_category,phone,sub_category,website,event_type
0,e484246f-8684-4f92-ab30-1d093a3f5f60,ajs-next-1745859266475-8684ff92-2b30-4d09-ba3f...,,2025-04-28 16:54:26.475000+00:00,Mozilla/5.0 (Linux; Android 12; Pixel 6 Build/...,/,https://www.facebook.com/,?fbclid=IwZXh0bgNhZW0CMTEAAR58oxCxVDihSoLzwMP_...,Colorado Catholic Business Directory,https://coloradocatholicbusinessdirectory.com/...,...,,,,,,,,,,impression
1,e484246f-8684-4f92-ab30-1d093a3f5f60,ajs-next-1745859265303-246f8684-ff92-4b30-9d09...,,2025-04-28 16:54:25.303000+00:00,Mozilla/5.0 (Linux; Android 12; Pixel 6 Build/...,/,https://www.facebook.com/,?fbclid=IwZXh0bgNhZW0CMTEAAR58oxCxVDihSoLzwMP_...,Colorado Catholic Business Directory,https://coloradocatholicbusinessdirectory.com/...,...,,,,,,,,,,impression
2,e484246f-8684-4f92-ab30-1d093a3f5f60,ajs-next-1745859265295-13e48424-6f86-44ff-922b...,,2025-04-28 16:54:25.295000+00:00,Mozilla/5.0 (Linux; Android 12; Pixel 6 Build/...,/,https://www.facebook.com/,?fbclid=IwZXh0bgNhZW0CMTEAAR58oxCxVDihSoLzwMP_...,Colorado Catholic Business Directory,https://coloradocatholicbusinessdirectory.com/...,...,,,,,,,,,,page
3,39b9e148-9871-4dae-8fe9-b1f5845d3932,ajs-next-1745860775776-e2d11c37-72f4-449e-bbf8...,1743547621069x279645472884266780,2025-04-28 17:19:35.776000+00:00,Mozilla/5.0 (iPhone; CPU iPhone OS 18_4_1 like...,/,https://bit.ly/,,Colorado Catholic Business Directory,https://coloradocatholicbusinessdirectory.com/,...,,,,,,,,,,impression
4,39b9e148-9871-4dae-8fe9-b1f5845d3932,ajs-next-1745860775813-1c3772f4-d49e-4bf8-b69d...,1743547621069x279645472884266780,2025-04-28 17:19:35.813000+00:00,Mozilla/5.0 (iPhone; CPU iPhone OS 18_4_1 like...,/,https://bit.ly/,,Colorado Catholic Business Directory,https://coloradocatholicbusinessdirectory.com/,...,,,,,,,,,,impression


In [58]:
query = """
SELECT lower(search_text) as search_text
FROM df
WHERE event_type = 'search'
and search_text is not null
"""
search_counts = duckdb.sql(query).df()
print(search_counts.shape)
search_counts.head(10)

(129, 1)


Unnamed: 0,search_text
0,sports
1,accounting
2,accounting
3,accounting
4,indoor painting
5,senior helpers
6,slider
7,decks
8,decks
9,painting


## Step 2 – Embed text with MiniLM

In [59]:
TEXT_COL = 'search_text'
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embeddings = model.encode(
    search_counts[TEXT_COL].tolist(),
    batch_size=64,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True
)


`encoder_attention_mask` is deprecated and will be removed in version 4.55.0 for `BertSdpaSelfAttention.forward`.

Batches: 100%|██████████| 3/3 [00:00<00:00, 32.95it/s]


## Step 3 – Cluster embeddings

In [60]:
# --- KMeans ---
NUM_CLUSTERS = 7               # change this!
km = KMeans(n_clusters=NUM_CLUSTERS, random_state=42, n_init='auto')
km_labels = km.fit_predict(embeddings)

# --- HDBSCAN ---
hdb = hdbscan.HDBSCAN(min_cluster_size=NUM_CLUSTERS, metric='euclidean')
hdb_labels = hdb.fit_predict(embeddings)

# Attach labels to dataframe
search_counts['kmeans_label']   = km_labels
search_counts['hdbscan_label']  = hdb_labels


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.



## Step 4 – Name clusters by their most central query

In [61]:
# Aggregate by KMeans labels
kmeans_counts = search_counts.groupby(['kmeans_label', 'search_text']).size().reset_index(name='count')
kmeans_counts = kmeans_counts.sort_values(['kmeans_label', 'count'], ascending=[True, False])

# Aggregate by HDBSCAN labels 
hdbscan_counts = search_counts.groupby(['hdbscan_label', 'search_text']).size().reset_index(name='count')
hdbscan_counts = hdbscan_counts.sort_values(['hdbscan_label', 'count'], ascending=[True, False])

print(kmeans_counts.head(10))
print(hdbscan_counts.head(10))

    kmeans_label          search_text  count
6              0            painting       8
5              0             painting      5
0              0                decks      4
2              0          embroidery       3
1              0           embroidery      1
3              0      indoor painting      1
4              0              painter      1
8              1              dentist      7
11             1               kcraft      4
10             1  general contractor       3
    hdbscan_label              search_text  count
15             -1                    decks      4
30             -1                   kcraft      4
0              -1                       ac      3
19             -1                     hair      3
34             -1              mechanical       3
5              -1                     bees      2
16             -1  emmaus catholic hospice      2
18             -1               fireplace       2
22             -1                homeopath      2
26   

In [None]:
MODEL = "gpt-3.5-turbo"            # cheapest chat model
client = OpenAI()

# Helper: ensure prompt stays in model’s token limit
enc = tiktoken.encoding_for_model(MODEL)
def too_long(rows, max_tokens=4000):
    prompt = ' '.join(rows)
    return (len(enc.encode(prompt)) + 30) > max_tokens

def get_cluster_name(rows):
    if too_long(rows):
        prompt_rows = rows[:20] + rows[-20:]
    else:
        prompt_rows = rows

    user_prompt = f"""Given this data {str(prompt_rows)},
    what is a good one or two word category for a search term scatter plot to understand what users on a business directory are searching for.
    Only return the category name."""
    
    response = client.responses.create(
        model=MODEL,
        instructions="You are a coding assistant that only gives one or two word answers.",
        input=user_prompt
    )

    return response.output_text

In [63]:
# Get AI-generated names for each cluster
kmeans_ai_names = {}
hdbscan_ai_names = {}

# Get KMeans cluster names
for label in kmeans_counts['kmeans_label'].unique():
    if label == -1:  # Skip noise cluster if present
        continue
    cluster_searches = kmeans_counts[kmeans_counts['kmeans_label'] == label]['search_text'].tolist()
    kmeans_ai_names[label] = get_cluster_name(cluster_searches)

# Get HDBSCAN cluster names  
for label in hdbscan_counts['hdbscan_label'].unique():
    if label == -1:  # Skip noise cluster
        continue
    cluster_searches = hdbscan_counts[hdbscan_counts['hdbscan_label'] == label]['search_text'].tolist()
    hdbscan_ai_names[label] = get_cluster_name(cluster_searches)

# Map AI names back to main dataframe
search_counts['kmeans_name'] = search_counts['kmeans_label'].map(kmeans_ai_names).fillna('Noise')
search_counts['hdbscan_name'] = search_counts['hdbscan_label'].map(hdbscan_ai_names).fillna('Noise')

print('\nKMeans AI cluster names:')
for cid, name in kmeans_ai_names.items():
    print(f'  {cid}: {name}')

print('\nHDBSCAN AI cluster names:') 
for cid, name in hdbscan_ai_names.items():
    print(f'  {cid}: {name}')



KMeans AI cluster names:
  0: Keywords
  1: Services
  2: Healthcare
  3: finance
  4: Products
  5: Services
  6: Construction

HDBSCAN AI cluster names:
  0: Art.
  1: Services
  2: Business departments


## Step 5 – Reduce to 2‑D with UMAP & visualize

In [None]:
reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, metric='cosine', random_state=42)
proj = reducer.fit_transform(embeddings)
search_counts['x'], search_counts['y'] = proj[:,0], proj[:,1]

# Define a distinct color palette
colors = ['#e41a1c', '#377eb8', '#4daf4a', '#984ea3', '#ff7f00', '#ffff33', '#a65628', '#f781bf']



# --- HDBSCAN plot ---
fig_hdb = px.scatter(
    search_counts, x='x', y='y',
    color='hdbscan_name',
    hover_data=[TEXT_COL, 'hdbscan_name'],
    title='HDBSCAN Clusters',
    color_discrete_sequence=colors
)
fig_hdb.show()

# --- KMeans plot ---
fig_km = px.scatter(
    search_counts, x='x', y='y',
    color='kmeans_name',
    hover_data=[TEXT_COL, 'kmeans_name'],
    title='KMeans Clusters',
    color_discrete_sequence=colors
)
fig_km.show()


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.

