## Preliminaries

In [1]:
# Imports

# ============ General ============
import json
import pandas as pd
import numpy as np
import time
from datetime import datetime, timezone, timedelta
from typing import Optional, List
import math
from itertools import chain

# ============ Plotting ============
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.renderers.default = "browser"

# ============ Text Preprocessing  ============
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import ngrams
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

# spaCy for lemmatization/POS filtering
try:
    import spacy
    _SPACY_AVAILABLE = True
except ImportError:
    _SPACY_AVAILABLE = False

# ============ BERTopic stack ============
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN

# Repro + warnings
import random
import warnings
warnings.filterwarnings("ignore")
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Ensure NLTK resources are available
# for _res in ["stopwords", "wordnet", "omw-1.4", "punkt"]:
#     try:
#         nltk.data.find(f"corpora/{_res}")
#     except LookupError:
#         nltk.download(_res)


In [2]:
# Set user's data path

PATH = f"C:/Users/emshe/Desktop/BRAINSTATION/LULULEMON/DATA"

In [3]:
# Download NLTK files (run once)

# nltk.download('stopwords')

## Helper functions 

In [4]:
# Function to clean text

def clean_text(s: str | None) -> str | None:
    
    '''
    Clean string by substituting spaces for problematic characters
    '''
    
    if s is None:
        return None
    s = re.sub(r"\s+", " ", s).strip()
    return s

In [5]:
# Function to get datetime from UTC timestamp

def dt_from_epoch(ts: Optional[int]):

    """
    Convert timestamp to pd.datetime format
    """

    
    if ts is None:
        return None
    return pd.to_datetime(ts, unit="s", utc=True)

In [6]:
# Function to examine dataframes

def examine_df(name,df,
               include_stats = True,
               include_sample = True):
    
    """
    Check basic info about a dataframe df
    """
    
    print(f"\n\nNumber of records in the {name} is: {len(df)}\n")
    print(f"\nNumber of features in the {name} is: {len(df.columns)}\n")
    print(f"The columns in the {name} are: {df.columns}\n")
    print(f"\n Other info about {name}:\n")
    display(df.info())
    if include_stats == True:
        print(f'\n Basic statistical info about {name}:\n')
        display(df.describe())
    if include_sample == True:
        print(f"\n\nSample of records in the {name}:")
        display(df.head(5))

In [7]:
# Function to get sample from text column

def get_text_samples(df: pd.DataFrame, text_col: str, n: int) -> None:

    '''
    Print n samples from a text column in a dataframe
    '''

    # Ensure pandas doesn't truncate text
    pd.set_option('display.max_colwidth', None)
    
    # Sample and print 5 full negative reviews
    print("Sample text data:\n\n")
    sample = df[text_col].sample(n)
    for i, description in enumerate(sample, 1):
        print(f"Text sample {i}:\n\n\n{description}\n\n\n")

In [8]:
# Function for categorical bar graph

def bar_graph(df: pd.DataFrame, col: str) -> None:

    """
    Generate bar graph for categorical column in a dataframe
    """

    if col not in df.columns:
        raise ValueError(f"Column '{col}' not found in dataframe")

    counts = posts_df[col].value_counts()
    
    plt.figure(figsize=(10,6))
    counts.plot(kind="bar")
    plt.title(f"Distribution of {col.title()}")
    plt.xlabel(f"{col.title()}")
    plt.ylabel("Count")
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.show()

In [9]:
# Define function to plot histogram for numeric columns

def histogram(df: pd.DataFrame, 
             col: str,
            bins: int = 30,
             log: bool = False) -> None:
    
    """
    Generate a histogram for a numeric column in a dataframe.
    """
    
    if col not in df.columns:
        raise ValueError(f"Column '{col}' not found in dataframe")
    
    plt.figure(figsize=(8, 5))
    df[col].dropna().hist(bins=bins, edgecolor="black", log=log)
    plt.title(f"Histogram of {col.title()}")
    plt.xlabel(col.title())
    plt.ylabel("Log(Frequency)" if log else "Frequency")
    plt.tight_layout()
    plt.show()

In [10]:
# Function to load ndjson

def load_plain_ndjson(path: str, limit: Optional[int] = None) -> pd.DataFrame:
    
    """
    Load a plain-text NDJSON file line by line into a DataFrame.
    """
    
    rows = []
    with open(path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f, start=1):
            if not line.strip():
                continue
            obj = json.loads(line)

            rows.append({
                "post_id": obj.get("id"),
                "timestamp": dt_from_epoch(obj.get("created_utc")),
                "author": obj.get("author"),
                "title": obj.get("title"),
                "text": obj.get("selftext"),
                "score": obj.get("score"),
                "num_comments": obj.get("num_comments"),
                "permalink": obj.get("permalink"),
                "subreddit": obj.get("subreddit"),
            })

            if limit and i >= limit:
                break

    return pd.DataFrame(rows)

## Load and Inspect Data

In [11]:
# Load clean data

lulu_df = pd.read_parquet(f"{PATH}/lululemon_submissions_clean.parquet", engine = 'fastparquet')

In [35]:
# Examine data

examine_df('lulu dataframe', lulu_df)



Number of records in the lulu dataframe is: 57984


Number of features in the lulu dataframe is: 6

The columns in the lulu dataframe are: Index(['post_id', 'timestamp', 'title', 'text', 'score', 'num_comments'], dtype='object')


 Other info about lulu dataframe:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57984 entries, 0 to 57983
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype              
---  ------        --------------  -----              
 0   post_id       57984 non-null  object             
 1   timestamp     57984 non-null  datetime64[ns, UTC]
 2   title         57984 non-null  object             
 3   text          57984 non-null  object             
 4   score         57984 non-null  int64              
 5   num_comments  57984 non-null  int64              
dtypes: datetime64[ns, UTC](1), int64(2), object(3)
memory usage: 2.7+ MB


None


 Basic statistical info about lulu dataframe:



Unnamed: 0,score,num_comments
count,57984.0,57984.0
mean,23.446071,14.705126
std,87.240166,40.279924
min,0.0,0.0
25%,1.0,2.0
50%,3.0,6.0
75%,13.0,13.0
max,11864.0,1987.0




Sample of records in the lulu dataframe:


Unnamed: 0,post_id,timestamp,title,text,score,num_comments
0,eielly,2020-01-01 05:33:25+00:00,Monthly Sales Post- January,FS: Aligns sz 4,1,7
1,eii06s,2020-01-01 12:46:35+00:00,Major problem falling down leggings?,"Hello, over the last year I have been ordering lululemon stuff online as there is no store nearby. I have been trying different sizes and overall I feel their leggings are pretty bad at holding up. Especially wunder under and all the right places. I have wu in luon which is fine, and aligns are ok, new in movement seem to be ok. But align and luon get pilling issues, in movement fabric gets dust/feather sticking to it after 2 wears. In the same the luxtreme fabric in wunder under.. impossible. I don't feel i will get in the smaller size, as my thighs are huge (smaller waist). Meanwhile I tried leggings from other brands... Eg Alo yoga, very similar model (extreme high waist airlift vs super high waist wu ) and the others are performing great. In alo i have leggings in s,m and l size,none is falling. I think i am getting really disappointed. It's very addictive to shop from lululemon and they look great in the mirror, but at the same time I cannot see the worth in comfort and performance during practice (yoga, both classical and more powerful,fitness) . Am I doing something wrong? I am afraid to invest even more $$, to try more models and risk having the same flows... I am also considering redesigning WU, adding a stich or elastic band to their waist..anyone having experience with that?",0,6
2,eijtca,2020-01-01 16:00:56+00:00,Tops for yoga,I have a couple swiftly tech racerbacks for hot yoga and just ordered a swiftly breeze tank because I’m wanting something with a bit more coverage (higher neckline). I find I’m adjusting my racerbacks more than I’d like in yoga but will keep wearing for hot yoga since they wick sweat so well. \n\nWhat are everyone’s favorite tops/tanks to wear for yoga? I’m a 34D so I worry about having enough coverage and not falling out of bras and tops. I have the free to be serene bra but I don’t like wearing it for yoga for this reason. Would love to find a great top (preferably a tank) that I don’t need to adjust or worry about with all the forward folds and down dogs. I knew lulu makes high neck bras but I don’t like the idea of something tight across my collarbone.,3,4
3,eikiew,2020-01-01 16:59:27+00:00,ABC Pants - Sizing,"Hey all,\n\nI recently received ABC pants (size 32) from the store, and they fit well. However, I’ve heard from multiple friends that they lose their shape and stretch out/can become baggy after a couple of weeks-months of wear. Should I go back to the store and try a size 31 in anticipation of this? \n\nIf not, will lululemon replace these pants should they lose their shape? \n\nThanks!",1,6
4,eil4bb,2020-01-01 17:46:20+00:00,Certain Aligns colours with thicker fabric?,"Hi lemonheads :D\n\nI was wondering if anyone has bought a pair of aligns that seems to be made from a thicker fabric than the usual Nulu. I think I saw a post a while ago where a few people thought their aligns were thicker, they were all dark red colour but I don't remember the exact name (not Garnet!)\n\nI recently purchased a pair of full length Aligns off Poshmark in the colour Graphite, but they also seem to be thicker, almost luon. I thought maybe they were WUs, but don't believe WUs come in that colour? I've attached a photo of the size dot (it looks real to me?), though I'm not sure how to read it or where I would check. Any advice or thoughts would be great! \n\nhttps://preview.redd.it/8zq5q04lf7841.jpg?width=3024&amp;format=pjpg&amp;auto=webp&amp;s=acaa777817edaeacc94b1effefd42b6f3d4f8c18",3,11


In [13]:
# Copy original dataframe

og_lulu_df = lulu_df.copy()

## Preprocessing

In [36]:
# Reset dataframe

lulu_df = og_lulu_df.copy()

In [37]:
# Define stop word list, lemmatizer, and regex

# Stopwords

custom_stop_words = [
    # brand/boilerplate
    "lululemon", "lulu", "amp", "xx", "lol",
    "like", "get", "got", "would", "anyone", "one",

    # deletion/removal artifacts
    "deleted", "remove", "removed", "removal",
    "deleted_view", "removed_view", "view_poll", "poll_view",
    "deleted_view_poll", "removed_view_poll",
    "view", "poll", "results", "result", "vote", "votes",
    "thread", "post", "posting", "posted", "comment", "comments",

    # generic low-information Reddit junk
    "http", "https", "www", "com",
    "imgur", "jpg", "png", "gif",
    "subreddit", "reddit", "mod", "mods",
    "link", "links",

    # Scraped filler 
    "user", "account", "profile",
    "page", "site", "website",
    "viewed", "views", "seen"
]

base_stops = set(stopwords.words("english"))
base_stops -= {"no", "nor", "not", "never"}       # Keep negations

stop_words = list(base_stops.union(custom_stop_words))

lemmatizer = WordNetLemmatizer()

# Precompile regex
_link = re.compile(r'https?://\S+|www\.\S+')
_nonalpha = re.compile(r'[^a-z\s]')
_spaces = re.compile(r'\s+')

In [38]:
# Define text preprocessor

def preprocess(text: str) -> str:

    """
    Preprocess text before modeling
    """
    
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = _link.sub(" ", text)         # Remove links
    text = _nonalpha.sub(" ", text)     # Keep only letters/spaces
    tokens = []
    for t in text.split():
        if t in stop_words or len(t) < 3:
            continue
        t = lemmatizer.lemmatize(t)
        tokens.append(t)
    return _spaces.sub(" ", " ".join(tokens)).strip()

In [39]:
# Apply text preprocessing

lulu_df["clean_text"] = lulu_df["title"].fillna("") + " " + lulu_df["text"].fillna("")
lulu_df["clean_text"] = lulu_df["clean_text"].apply(preprocess)

# drop docs with <5 tokens to reduce noise
lulu_df = lulu_df[lulu_df["clean_text"].str.split().str.len() >= 5].reset_index(drop=True)

In [40]:
# Check some examples

get_text_samples(lulu_df, 'clean_text', 5)

Sample text data:


Text sample 1:


experienced pilling high rise stretch jogger thicker thigh obviously leg constantly rubbing together trying see worth exchanging adapt state jogger appreciate thought



Text sample 2:


group interview interview tommorw morning educator position local store credential following bachelor science nutrition science currently first year second semester master public health degree specilization dietetics program combine supervised practice hour become registered dietitican graduation shift supervisor last three year high volume cannabis dispensary see upwards people per day sale per day main question type question prepared know entry level retail position appreciate run expect never group interview thanks much



Text sample 3:


fianc fit looking fire description item featured photo rain chaser jacket black never lost keychains black grey grey sage colour align tights black swiftly top orange soda



Text sample 4:


gilroy outlet haul made stop gilroy

In [41]:
# Collect tokenized docs

tokenized_docs = [doc.split() for doc in lulu_df["clean_text"]]

In [42]:
# Function to merge bigrams

def merge_bigrams(doc, bigram_set):
    
    """
    Merge bigrams
    """
    
    merged = []
    i = 0
    while i < len(doc):
        if i < len(doc)-1 and (doc[i], doc[i+1]) in bigram_set:
            merged.append(f"{doc[i]}_{doc[i+1]}")
            i += 2
        else:
            merged.append(doc[i])
            i += 1
    return merged

In [43]:
# Apply bigram detector

bigrams = [list(ngrams(doc, 2)) for doc in tokenized_docs]
flat_bigrams = [bg for doc in bigrams for bg in doc]
bigram_counts = Counter(flat_bigrams)
common_bigrams = {bg for bg, count in bigram_counts.items() if count >= 10}

tokenized_bigrams = [merge_bigrams(doc, common_bigrams) for doc in tokenized_docs]

In [44]:
# Add bigram text back to dataframe

lulu_df["clean_text_bigram"] = [" ".join(doc) for doc in tokenized_bigrams]

In [45]:
# Check some examples

get_text_samples(lulu_df, 'clean_text_bigram', 5)

Sample text data:


Text sample 1:


cloud wear_size regular_length



Text sample 2:


nomad perfect spring khaki scored wunder_train racer_back tank_nomad wmtm_love style weightlift find_fit much_comfortable align_tank thought nomad brown hued based pic pull khaki



Text sample 3:


lil reveal mesh mid_rise reveal mid_rise mesh posy design black_size cloud_bra black_size legging released mid_rise considered high



Text sample 4:


align_bra discontinued new_color drop regular_align bra_not shoulder neck know_discontinued sad neck cute low_cut liking not_even part big tittee committee



Text sample 5:


style amalfi coast lounge_wear cute_outfit recs beach sightseeing love_wunder train_fast free_legging feel athletic casual





In [46]:
# Get list of docs

docs = lulu_df["clean_text_bigram"].tolist()

## Modeling with BERTopic

In [47]:
# Load baseline sentence embedding model

EMB_NAME = "all-MiniLM-L6-v2"
st_model = SentenceTransformer(EMB_NAME)

In [48]:
# Generate embeddings

embs = st_model.encode(
    docs,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True  # Normalizes vectors for cosine sim
)

Batches:   0%|          | 0/1768 [00:00<?, ?it/s]

In [179]:
# Configure UMAP model

umap_model = UMAP(
    n_neighbors = 30,
    n_components=5,
    min_dist = 0.20,
    metric="cosine",
    random_state=42,
    verbose=False,
)

In [180]:
# Configure DBSCAN model

N = len(docs)

min_cluster_size = max(200, math.floor(0.015 * N))  # ≈0.5% of corpus, at least 50

hdbscan_model = HDBSCAN(
    min_cluster_size=min_cluster_size,
    min_samples= 35,  # defaults to min_cluster_size; set e.g. 10–30 to merge a bit more
    metric="euclidean",
    cluster_selection_method="eom",
    prediction_data=True,
    cluster_selection_epsilon = 0.00
)




In [181]:
# Define vectorizer to keep unigrams and bigrams

vectorizer_model = CountVectorizer(
    ngram_range=(1, 2),
    token_pattern=r"(?u)\b[\w_]{3,}\b",
    stop_words = None,
        max_df = .9)


In [182]:
# Build BERTopic model

topic_model = BERTopic(
    embedding_model=st_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    language="english",
    calculate_probabilities=True,
    verbose=True,
    top_n_words=15
)

In [183]:
# Fit BERTopic model

start = time.time()

topics, probs = topic_model.fit_transform(docs, embs)

end = time.time()

runtime = (end-start)/60

print(f"Full runtime: {runtime:.2f} minutes.")

2025-09-06 17:29:27,475 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-09-06 17:31:08,347 - BERTopic - Dimensionality - Completed ✓
2025-09-06 17:31:08,350 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-09-06 17:31:19,041 - BERTopic - Cluster - Completed ✓
2025-09-06 17:31:19,054 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-09-06 17:31:29,220 - BERTopic - Representation - Completed ✓


Full runtime: 2.03 minutes.


In [184]:
# Examine basic topic info

# Topic table and basic stats
topic_info = topic_model.get_topic_info()
topic_info.head(10)

# Number of discovered topics (exclude -1 = outliers)
n_topics = int((topic_info["Topic"] != -1).sum())
n_docs = len(docs)
outlier_share = (topics.count(-1) / n_docs) if n_docs else 0.0

print(f"Discovered topics (excl. -1): {n_topics}")
print(f"Outlier docs (-1): {topics.count(-1)} / {n_docs} = {outlier_share:.1%}")

# Topic size distribution
topic_info[["Topic", "Count"]].head(20)

Discovered topics (excl. -1): 3
Outlier docs (-1): 2958 / 56574 = 5.2%


Unnamed: 0,Topic,Count
0,-1,2958
1,0,50478
2,1,1686
3,2,1452


In [225]:
# Check top words for each topic

# Refresh topic_info from the current model
topic_info = topic_model.get_topic_info().copy()

# Get all non-noise topics
topic_ids = topic_info.loc[topic_info["Topic"] != -1, "Topic"].tolist()

for t in topic_ids:
    # safe size lookup
    size = int(topic_info.loc[topic_info["Topic"] == t, "Count"].iloc[0])
    print("="*80)
    print(f"\nTopic {t} | size={size}")

    # top words (guard if topic exists but empty)
    words = topic_model.get_topic(t) or []
    top_terms = ", ".join([w for w, _ in words[:15]]) if words else "(no terms)"
    print("\nTop words:", top_terms)

    # representative examples (guard empty)
    reps = (topic_model.get_representative_docs(t) or [])[:3]
    for i, doc in enumerate(reps, 1):
        preview = doc[:300].replace("\n", " ")
        suffix = "..." if len(doc) > 300 else ""
        print(f"\n--- Ex{i}: {preview}{suffix}")

print("\nNon-noise topics:", topic_ids)



Topic 0 | size=50478

Top words: high_rise, bra, jogger, energy_bra, long_sleeve, hoodie, gift_card, hotty_hot, dance_studio, align_short, sport_bra, swiftly_tech, ebb_street, restock, leg

--- Ex1: usa new_item align_short diamond_dye vista_green saddle_brown align_short saddle_brown see_horizon tank_dune black swift_speed bra_black stroll_sundown short_cherry tint carnation_red true_navy black femme force crop_tank white_black trip_taker skirt carnation_red beech_wood black muscle_love crop_t...

--- Ex2: new_item drop_super helpful_format thanks_advance item_name color_color linkhere energy_bra blue_black bra_energy bra_pink blossom cloud_longline bra_pink blossom top_ebb street_tank top_pink blossom cool_racerback short aero_blue align_tank waist_length pink_blossom love_tank soft_denim legging_ali...

--- Ex3: drop_happy drop_day please_format item_name color_color energy_bra black_blue bra_energy bra_medium support_cup scream_green light free_serene bra_light support_cup poolsid

In [186]:
# Check topic diversity

TOP_N = 10
words_per_topic = {
    t: [w for w, _ in topic_model.get_topic(t)[:TOP_N]]
    for t in topic_info["Topic"].tolist() if t != -1
}

all_top_words = list(chain.from_iterable(words_per_topic.values()))
diversity = len(set(all_top_words)) / max(1, len(all_top_words))
print(f"Topic diversity (for top {TOP_N} words for each topic): {diversity:.3f}")

Topic diversity (for top 10 words for each topic): 1.000


## Subclustering

In [190]:
# Choose biggest topic 

big_tid = 0 # index for big topic is 0

mask = (np.array(topics) == big_tid)
docs_big  = [d for d, m in zip(docs,  mask) if m]
embs_big  = embs[mask]
idx_big   = np.where(mask)[0]  # to map back later

In [218]:
# Set up new UMAP model

umap_sub = UMAP(
    n_neighbors=30,  
    n_components=5,
    min_dist=0.20,
    metric="cosine",
    random_state=42
)

In [219]:
# Set up new HDBSCAN model

M = len(docs_big)

hdb_sub = HDBSCAN(
    min_cluster_size=max(150, int(0.015 * M)),
    min_samples=45,
    metric="euclidean",
    cluster_selection_method="eom",
    prediction_data=True,
    cluster_selection_epsilon=0.02   # small tolerance for merges
)

In [220]:
# Set up vectorizer

vec_sub = CountVectorizer(
    ngram_range=(1, 2),
    token_pattern=r"(?u)\b[\w_]{3,}\b",
    stop_words=None,
    max_df=.95
)


In [221]:
# Set up new BERTopic model

sub_model = BERTopic(
    umap_model=umap_sub,
    hdbscan_model=hdb_sub,
    vectorizer_model=vec_sub,
    calculate_probabilities=True,
    verbose=True
)

In [222]:
# Fit submodel

start = time.time()

sub_topics, sub_probs = sub_model.fit_transform(docs_big, embs_big)

end = time.time()

runtime = (end - start)/60

print(f"Full runtime: {runtime}")

2025-09-06 19:15:04,339 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-09-06 19:16:21,122 - BERTopic - Dimensionality - Completed ✓
2025-09-06 19:16:21,126 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-09-06 19:16:29,284 - BERTopic - Cluster - Completed ✓
2025-09-06 19:16:29,306 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-09-06 19:16:38,612 - BERTopic - Representation - Completed ✓


Full runtime: 1.5750168601671855


In [223]:
# Examine subtopics

# Point the inspectors at the SUB model/run
MODEL  = sub_model
DOCS   = docs_big
TOPICS = list(sub_topics)   # ensure list so .count(-1) works

# ---------- 1) Basic topic info ----------
topic_info_sub = MODEL.get_topic_info().copy()

# Number of discovered topics (exclude -1 = outliers)
n_topics_sub   = int((topic_info_sub["Topic"] != -1).sum())
n_docs_sub     = len(DOCS)
outlier_share  = (TOPICS.count(-1) / n_docs_sub) if n_docs_sub else 0.0

print(f"Discovered subtopics (excl. -1): {n_topics_sub}")
print(f"Outlier docs (-1): {TOPICS.count(-1)} / {n_docs_sub} = {outlier_share:.1%}")

# Subtopic size distribution
display(topic_info_sub[["Topic", "Count"]].head(20))

Discovered subtopics (excl. -1): 2
Outlier docs (-1): 16964 / 50478 = 33.6%


Unnamed: 0,Topic,Count
0,-1,16964
1,0,31059
2,1,2455


In [227]:
# Check top words for each subtopic

# Refresh subtopic_info from the current model
subtopic_info =sub_model.get_topic_info().copy()

# Get all non-noise subtopics
subtopic_ids = subtopic_info.loc[topic_info["Topic"] != -1, "Topic"].tolist()

for t in subtopic_ids:
    # safe size lookup
    size = int(subtopic_info.loc[subtopic_info["Topic"] == t, "Count"].iloc[0])
    print("="*80)
    print(f"\nTopic {t} | size={size}")

    # top words (guard if subtopic exists but empty)
    words = sub_model.get_topic(t) or []
    top_terms = ", ".join([w for w, _ in words[:15]]) if words else "(no terms)"
    print("\nTop words:", top_terms)

    # representative examples (guard empty)
    reps = (sub_model.get_representative_docs(t) or [])[:3]
    for i, doc in enumerate(reps, 1):
        preview = doc[:300].replace("\n", " ")
        suffix = "..." if len(doc) > 300 else ""
        print(f"\n--- Ex{i}: {preview}{suffix}")

print("\nNon-noise topics:", subtopic_ids)



Topic 0 | size=31059

Top words: refund, sweatcollective, employee, highneck, interview, courtrival, code, policy, scubahalf, priceadjustment

--- Ex1: exchanging boxing_day sale_think saw_someone wanted_check wider audience policy allowing exchange boxing_day sale_item actually went_store today otf_jogger savannah changing_room local_store closed educator mentioned still_exchange sale_item different_size needed thru holiday return period not_sure ...

--- Ex2: tracking stuck mercari sale_item shipped item_sold mercari using usps label day_ago tracking updated week say package_arrive later expected currently transit next facility happen long_wait filing claim feel_bad buyer probably really_want short

--- Ex3: customer_support scam tldr ordered_item different address billing due helping brother health situation order_cancelled next_day tell unblock card unless submit credit_card statement info big_fan not_sure support longer recent_experience unfortunately brother bad_news serious hea

## Grid Searches

In [208]:
# Grid search on epsilon and min sample says for HDBSCAN

# reuse docs_big, embs_big, umap_sub, M (len(docs_big))
min_cluster_size = max(200, int(0.015* M))  # ~1% of subset size

ms_values  = [25,30,35,40]
eps_values = [0.02]

results = []

for ms in ms_values:
    for eps in eps_values:
        hdb_sub = HDBSCAN(
            min_cluster_size=min_cluster_size,
            min_samples=ms,
            metric="euclidean",
            cluster_selection_method="eom",
            prediction_data=True,
            cluster_selection_epsilon=eps
        )

        vec_sub = CountVectorizer(
            ngram_range=(1, 2),
            token_pattern=r"(?u)\b[\w_]{3,}\b",
            stop_words=None,
            min_df=1,
            max_df=1.0
        )

        sub_model = BERTopic(
            umap_model=umap_sub,     # keep your existing UMAP config
            hdbscan_model=hdb_sub,
            vectorizer_model=vec_sub,
            calculate_probabilities=True,
            verbose=False
        )

        sub_topics, sub_probs = sub_model.fit_transform(docs_big, embs_big)
        sub_info = sub_model.get_topic_info()

        n_subtopics = (sub_info.Topic != -1).sum()
        noise_share = (np.array(sub_topics) == -1).mean()

        results.append((ms, eps, n_subtopics, noise_share))
        print(f"min_samples={ms:2d}, eps={eps:.2f} "
              f"-> subtopics={n_subtopics}, noise={noise_share:.1%}")


min_samples=25, eps=0.02 -> subtopics=10, noise=58.1%
min_samples=30, eps=0.02 -> subtopics=6, noise=52.0%
min_samples=35, eps=0.02 -> subtopics=9, noise=59.7%
min_samples=40, eps=0.02 -> subtopics=7, noise=51.2%


min_samples=20, eps=0.00 -> subtopics=12, noise=55.5%
min_samples=20, eps=0.02 -> subtopics=12, noise=55.5%
min_samples=20, eps=0.05 -> subtopics=12, noise=55.5%
min_samples=25, eps=0.00 -> subtopics=11, noise=56.9% min_samples=25, eps=0.02 -> subtopics=10, noise=58.1%
min_samples=30, eps=0.02 -> subtopics=6, noise=52.0%
min_samples=35, eps=0.02 -> subtopics=9, noise=59.7%
min_samples=40, eps=0.02 -> subtopics=7, noise=51.2%

In [211]:
# Function for grid searches

def run_once(mcs, ms, eps=0.02):
    hdb = HDBSCAN(
        min_cluster_size=mcs,
        min_samples=ms,
        metric="euclidean",
        cluster_selection_method="eom",
        prediction_data=True,
        cluster_selection_epsilon=eps
    )
    vec = CountVectorizer(
        ngram_range=(1, 2),
        token_pattern=r"(?u)\b[\w_]{3,}\b",
        stop_words=None,
        min_df=1,
        max_df=1.0
    )
    mdl = BERTopic(
        umap_model=umap_sub,
        hdbscan_model=hdb,
        vectorizer_model=vec,
        calculate_probabilities=True,
        verbose=False
    )
    t, _ = mdl.fit_transform(docs_big, embs_big)
    info = mdl.get_topic_info()
    k = int((info.Topic != -1).sum())
    noise = float((np.array(t) == -1).mean())
    print(f"mcs={mcs:4d}, min_samples={ms:2d}, eps={eps:.2f}  →  subtopics={k:2d}, noise={noise:.1%}")
    return (k, noise, mdl, t, info)

In [212]:
# Grid search 00

M = len(docs_big)

# SMALL grid (aiming to reduce both cluster count and noise)
mcs_fracs = [0.010, 0.012, 0.015, 0.02]      # ≈ 1.0%, 1.2%, 1.5% of subset
ms_values = [40, 45, 50]           # around your previous best
epsilon   = 0.02                       # mild merge tolerance

best = {"noise": 1.0, "k": 10**9, "cfg": None, "model": None, "topics": None, "info": None}

for frac in mcs_fracs:
    mcs = max(120, int(frac * M))
    for ms in ms_values:
        k, noise, mdl, t, info = run_once(mcs, ms, epsilon)
        # choose best by: lowest noise, then fewer clusters
        if (noise < best["noise"]) or (noise == best["noise"] and k < best["k"]):
            best.update({"noise": noise, "k": k, "cfg": (mcs, ms, epsilon),
                         "model": mdl, "topics": t, "info": info})

print("\nBest by (min noise, then fewer clusters):")
print(f"mcs={best['cfg'][0]}, min_samples={best['cfg'][1]}, eps={best['cfg'][2]:.2f}  "
      f"→  subtopics={best['k']}, noise={best['noise']:.1%}")

# Keep best handy if you want to inspect words/examples next:
best_sub_model  = best["model"]
best_sub_topics = best["topics"]
best_sub_info   = best["info"]

mcs= 504, min_samples=40, eps=0.02  →  subtopics=12, noise=56.1%
mcs= 504, min_samples=45, eps=0.02  →  subtopics=13, noise=62.8%
mcs= 504, min_samples=50, eps=0.02  →  subtopics=12, noise=55.4%
mcs= 605, min_samples=40, eps=0.02  →  subtopics=10, noise=58.3%
mcs= 605, min_samples=45, eps=0.02  →  subtopics=11, noise=65.0%
mcs= 605, min_samples=50, eps=0.02  →  subtopics=10, noise=57.6%
mcs= 757, min_samples=40, eps=0.02  →  subtopics= 7, noise=51.2%
mcs= 757, min_samples=45, eps=0.02  →  subtopics= 2, noise=33.6%
mcs= 757, min_samples=50, eps=0.02  →  subtopics=10, noise=57.6%
mcs=1009, min_samples=40, eps=0.02  →  subtopics= 7, noise=51.2%
mcs=1009, min_samples=45, eps=0.02  →  subtopics= 7, noise=58.7%
mcs=1009, min_samples=50, eps=0.02  →  subtopics= 7, noise=46.9%

Best by (min noise, then fewer clusters):
mcs=757, min_samples=45, eps=0.02  →  subtopics=2, noise=33.6%


mcs= 504, min_samples=40, eps=0.02  →  subtopics=12, noise=56.1%
mcs= 504, min_samples=45, eps=0.02  →  subtopics=13, noise=62.8%
mcs= 504, min_samples=50, eps=0.02  →  subtopics=12, noise=55.4%
mcs= 605, min_samples=40, eps=0.02  →  subtopics=10, noise=58.3%
mcs= 605, min_samples=45, eps=0.02  →  subtopics=11, noise=65.0%
mcs= 605, min_samples=50, eps=0.02  →  subtopics=10, noise=57.6%
mcs= 757, min_samples=40, eps=0.02  →  subtopics= 7, noise=51.2%
mcs= 757, min_samples=45, eps=0.02  →  subtopics= 2, noise=33.6%
mcs= 757, min_samples=50, eps=0.02  →  subtopics=10, noise=57.6%
mcs=1009, min_samples=40, eps=0.02  →  subtopics= 7, noise=51.2%
mcs=1009, min_samples=45, eps=0.02  →  subtopics= 7, noise=58.7%
mcs=1009, min_samples=50, eps=0.02  →  subtopics= 7, noise=46.9%

Best by (min noise, then fewer clusters):
mcs=757, min_samples=45, eps=0.02  →  subtopics=2, noise=33.6%

## Topic Visualizations

In [135]:
# Overall map of topics

topic_model.visualize_topics()

TypeError: argument of type 'NoneType' is not iterable

In [84]:
# Visualize top words per topic (bar charts)

topic_model.visualize_barchart(top_n_topics=20)

In [75]:
# Visualizie topic similarity heatmap

topic_model.visualize_heatmap()

In [76]:
# Visualize hierarchy (dendrogram)

topic_model.visualize_hierarchy()

## Save topics

## Random extras

In [228]:
# Fit BERTopic model

for min_dist in [0.185,0.19,0.195]:

    print(f"min_dist is set to: {min_dist}")
    start = time.time()
    
    topic_model, topics, probs = run_bertopic(docs, embs, st_model, min_dist = min_dist)
    
    end = time.time()
    
    runtime = (end-start)/60
    
    print(f"Full runtime: {runtime:.2f} minutes.\n\n\n")

min_dist is set to: 0.185


NameError: name 'run_bertopic' is not defined

2025-09-06 21:05:09,931 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
min_dist is set to: 0.185
2025-09-06 21:06:58,259 - BERTopic - Dimensionality - Completed ✓
2025-09-06 21:06:58,262 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-09-06 21:07:12,363 - BERTopic - Cluster - Completed ✓
2025-09-06 21:07:12,388 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-09-06 21:07:23,953 - BERTopic - Representation - Completed ✓
Discovered topics (excl. -1): 2
Outlier docs (-1): 8600 / 56574 = 15.2%
Topic	Count
0	-1	8600
1	0	46303
2	1	1671
Full runtime: 2.24 minutes.



min_dist is set to: 0.19
2025-09-06 21:07:24,387 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-09-06 21:09:08,456 - BERTopic - Dimensionality - Completed ✓
2025-09-06 21:09:08,460 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-09-06 21:09:19,490 - BERTopic - Cluster - Completed ✓
2025-09-06 21:09:19,505 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-09-06 21:09:30,357 - BERTopic - Representation - Completed ✓
Discovered topics (excl. -1): 3
Outlier docs (-1): 12082 / 56574 = 21.4%
Topic	Count
0	-1	12082
1	0	41480
2	1	1704
3	2	1308
Full runtime: 2.11 minutes.



min_dist is set to: 0.195
2025-09-06 21:09:30,877 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-09-06 21:11:09,888 - BERTopic - Dimensionality - Completed ✓
2025-09-06 21:11:09,908 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-09-06 21:11:22,767 - BERTopic - Cluster - Completed ✓
2025-09-06 21:11:22,781 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-09-06 21:11:34,157 - BERTopic - Representation - Completed ✓
Discovered topics (excl. -1): 2
Outlier docs (-1): 3776 / 56574 = 6.7%
Topic	Count
0	-1	3776
1	0	51092
2	1	1706
Full runtime: 2.06 minutes.

Discovered topics (excl. -1): 2
Outlier docs (-1): 9566 / 56574 = 16.9%
Topic	Count
0	-1	9566
1	0	45320
2	1	1688
Number of neighbors is set to 5
Full runtime: 1.03 minutes.



Discovered topics (excl. -1): 2
Outlier docs (-1): 2905 / 56574 = 5.1%
Topic	Count
0	-1	2905
1	0	51880
2	1	1789
Number of neighbors is set to 15
Full runtime: 1.42 minutes.



Discovered topics (excl. -1): 3
Outlier docs (-1): 7356 / 56574 = 13.0%
Topic	Count
0	-1	7356
1	0	46053
2	1	1744
3	2	1421
Number of neighbors is set to 18
Full runtime: 1.54 minutes.



Discovered topics (excl. -1): 2
Outlier docs (-1): 4988 / 56574 = 8.8%
Topic	Count
0	-1	4988
1	0	49844
2	1	1742
Number of neighbors is set to 20
Full runtime: 1.61 minutes.



Discovered topics (excl. -1): 10
Outlier docs (-1): 31751 / 56574 = 56.1%
Topic	Count
0	-1	31751
1	0	6610
2	1	4385
3	2	2491
4	3	2204
5	4	1907
6	5	1744
7	6	1697
8	7	1672
9	8	1180
10	9	933
Number of neighbors is set to 25
Full runtime: 1.78 minutes.



Discovered topics (excl. -1): 11
Outlier docs (-1): 32210 / 56574 = 56.9%
Topic	Count
0	-1	32210
1	0	4203
2	1	3162
3	2	2414
4	3	2397
5	4	2395
6	5	2184
7	6	1922
8	7	1686
9	8	1600
10	9	1289
11	10	1112
Number of neighbors is set to 27
Full runtime: 1.84 minutes.



Discovered topics (excl. -1): 2
Outlier docs (-1): 5402 / 56574 = 9.5%
Topic	Count
0	-1	5402
1	0	49465
2	1	1707
Number of neighbors is set to 35
Full runtime: 2.08 minutes.
