# Topic Modeling with BERTopic

In this notebook, we undergo topic modeling with the state of the art language model BERTopic. After hyperparameter tuning, we select an optimal choice of topics.

## Preliminaries

In [1]:
# Imports

# ============ General ============
import json
import pandas as pd
import numpy as np
import time
from datetime import datetime, timezone, timedelta
from typing import Optional, List
import math
from itertools import chain

# ============ Plotting ============
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.renderers.default = "browser"

# ============ Text Preprocessing  ============
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import ngrams
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

# spaCy for lemmatization/POS filtering
try:
    import spacy
    _SPACY_AVAILABLE = True
except ImportError:
    _SPACY_AVAILABLE = False

# ============ BERTopic stack ============
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN

# Repro + warnings
import random
import warnings
warnings.filterwarnings("ignore")
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Ensure NLTK resources are available
# for _res in ["stopwords", "wordnet", "omw-1.4", "punkt"]:
#     try:
#         nltk.data.find(f"corpora/{_res}")
#     except LookupError:
#         nltk.download(_res)


In [2]:
# Set user's data path

PATH = f"C:/Users/emshe/Desktop/BRAINSTATION/LULULEMON/DATA"

In [3]:
# Download NLTK files (run once)

# nltk.download('stopwords')

## Helper functions 

In [4]:
# Function to clean text

def clean_text(s: str | None) -> str | None:
    
    '''
    Clean string by substituting spaces for problematic characters
    '''
    
    if s is None:
        return None
    s = re.sub(r"\s+", " ", s).strip()
    return s

In [5]:
# Function to get datetime from UTC timestamp

def dt_from_epoch(ts: Optional[int]):

    """
    Convert timestamp to pd.datetime format
    """

    
    if ts is None:
        return None
    return pd.to_datetime(ts, unit="s", utc=True)

In [6]:
# Function to examine dataframes

def examine_df(name,df,
               include_stats = True,
               include_sample = True):
    
    """
    Check basic info about a dataframe df
    """
    
    print(f"\n\nNumber of records in the {name} is: {len(df)}\n")
    print(f"\nNumber of features in the {name} is: {len(df.columns)}\n")
    print(f"The columns in the {name} are: {df.columns}\n")
    print(f"\n Other info about {name}:\n")
    display(df.info())
    if include_stats == True:
        print(f'\n Basic statistical info about {name}:\n')
        display(df.describe())
    if include_sample == True:
        print(f"\n\nSample of records in the {name}:")
        display(df.head(5))

In [7]:
# Function to get sample from text column

def get_text_samples(df: pd.DataFrame, text_col: str, n: int) -> None:

    '''
    Print n samples from a text column in a dataframe
    '''

    # Ensure pandas doesn't truncate text
    pd.set_option('display.max_colwidth', None)
    
    # Sample and print 5 full negative reviews
    print("Sample text data:\n\n")
    sample = df[text_col].sample(n)
    for i, description in enumerate(sample, 1):
        print(f"Text sample {i}:\n\n\n{description}\n\n\n")

In [8]:
# Function for categorical bar graph

def bar_graph(df: pd.DataFrame, col: str) -> None:

    """
    Generate bar graph for categorical column in a dataframe
    """

    if col not in df.columns:
        raise ValueError(f"Column '{col}' not found in dataframe")

    counts = posts_df[col].value_counts()
    
    plt.figure(figsize=(10,6))
    counts.plot(kind="bar")
    plt.title(f"Distribution of {col.title()}")
    plt.xlabel(f"{col.title()}")
    plt.ylabel("Count")
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.show()

In [9]:
# Define function to plot histogram for numeric columns

def histogram(df: pd.DataFrame, 
             col: str,
            bins: int = 30,
             log: bool = False) -> None:
    
    """
    Generate a histogram for a numeric column in a dataframe.
    """
    
    if col not in df.columns:
        raise ValueError(f"Column '{col}' not found in dataframe")
    
    plt.figure(figsize=(8, 5))
    df[col].dropna().hist(bins=bins, edgecolor="black", log=log)
    plt.title(f"Histogram of {col.title()}")
    plt.xlabel(col.title())
    plt.ylabel("Log(Frequency)" if log else "Frequency")
    plt.tight_layout()
    plt.show()

In [10]:
# Function to load ndjson

def load_plain_ndjson(path: str, limit: Optional[int] = None) -> pd.DataFrame:
    
    """
    Load a plain-text NDJSON file line by line into a DataFrame.
    """
    
    rows = []
    with open(path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f, start=1):
            if not line.strip():
                continue
            obj = json.loads(line)

            rows.append({
                "post_id": obj.get("id"),
                "timestamp": dt_from_epoch(obj.get("created_utc")),
                "author": obj.get("author"),
                "title": obj.get("title"),
                "text": obj.get("selftext"),
                "score": obj.get("score"),
                "num_comments": obj.get("num_comments"),
                "permalink": obj.get("permalink"),
                "subreddit": obj.get("subreddit"),
            })

            if limit and i >= limit:
                break

    return pd.DataFrame(rows)

## Load and Inspect Data

In [11]:
# Load clean data

lulu_df = pd.read_parquet(f"{PATH}/lululemon_submissions_clean.parquet", engine = 'fastparquet')

In [12]:
# Examine data

examine_df('lulu dataframe', lulu_df)



Number of records in the lulu dataframe is: 57984


Number of features in the lulu dataframe is: 6

The columns in the lulu dataframe are: Index(['post_id', 'timestamp', 'title', 'text', 'score', 'num_comments'], dtype='object')


 Other info about lulu dataframe:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57984 entries, 0 to 57983
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype              
---  ------        --------------  -----              
 0   post_id       57984 non-null  object             
 1   timestamp     57984 non-null  datetime64[ns, UTC]
 2   title         57984 non-null  object             
 3   text          57984 non-null  object             
 4   score         57984 non-null  int64              
 5   num_comments  57984 non-null  int64              
dtypes: datetime64[ns, UTC](1), int64(2), object(3)
memory usage: 2.7+ MB


None


 Basic statistical info about lulu dataframe:



Unnamed: 0,score,num_comments
count,57984.0,57984.0
mean,23.446071,14.705126
std,87.240166,40.279924
min,0.0,0.0
25%,1.0,2.0
50%,3.0,6.0
75%,13.0,13.0
max,11864.0,1987.0




Sample of records in the lulu dataframe:


Unnamed: 0,post_id,timestamp,title,text,score,num_comments
0,eielly,2020-01-01 05:33:25+00:00,Monthly Sales Post- January,FS: Aligns sz 4,1,7
1,eii06s,2020-01-01 12:46:35+00:00,Major problem falling down leggings?,"Hello, over the last year I have been ordering...",0,6
2,eijtca,2020-01-01 16:00:56+00:00,Tops for yoga,I have a couple swiftly tech racerbacks for ho...,3,4
3,eikiew,2020-01-01 16:59:27+00:00,ABC Pants - Sizing,"Hey all,\n\nI recently received ABC pants (siz...",1,6
4,eil4bb,2020-01-01 17:46:20+00:00,Certain Aligns colours with thicker fabric?,Hi lemonheads :D\n\nI was wondering if anyone ...,3,11


In [13]:
# Copy original dataframe

og_lulu_df = lulu_df.copy()

## Preprocessing

In [14]:
# Reset dataframe

lulu_df = og_lulu_df.copy()

In [15]:
# Define stop word list, lemmatizer, and regex

# Stopwords

custom_stop_words = [
    # brand/boilerplate
    "lululemon", "lulu", "amp", "xx", "lol",
    "like", "get", "got", "would", "anyone", "one",

    # deletion/removal artifacts
    "deleted", "remove", "removed", "removal",
    "deleted_view", "removed_view", "view_poll", "poll_view",
    "deleted_view_poll", "removed_view_poll",
    "view", "poll", "results", "result", "vote", "votes",
    "thread", "post", "posting", "posted", "comment", "comments",

    # generic low-information Reddit junk
    "http", "https", "www", "com",
    "imgur", "jpg", "png", "gif",
    "subreddit", "reddit", "mod", "mods",
    "link", "links",

    # Scraped filler 
    "user", "account", "profile",
    "page", "site", "website",
    "viewed", "views", "seen"
]

base_stops = set(stopwords.words("english"))
base_stops -= {"no", "nor", "not", "never"}       # Keep negations

stop_words = list(base_stops.union(custom_stop_words))

lemmatizer = WordNetLemmatizer()

# Precompile regex
_link = re.compile(r'https?://\S+|www\.\S+')
_nonalpha = re.compile(r'[^a-z\s]')
_spaces = re.compile(r'\s+')

In [16]:
# Define text preprocessor

def preprocess(text: str) -> str:

    """
    Preprocess text before modeling
    """
    
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = _link.sub(" ", text)         # Remove links
    text = _nonalpha.sub(" ", text)     # Keep only letters/spaces
    tokens = []
    for t in text.split():
        if t in stop_words or len(t) < 3:
            continue
        t = lemmatizer.lemmatize(t)
        tokens.append(t)
    return _spaces.sub(" ", " ".join(tokens)).strip()

In [17]:
# Apply text preprocessing

lulu_df["clean_text"] = lulu_df["title"].fillna("") + " " + lulu_df["text"].fillna("")
lulu_df["clean_text"] = lulu_df["clean_text"].apply(preprocess)

# drop docs with <5 tokens to reduce noise
lulu_df = lulu_df[lulu_df["clean_text"].str.split().str.len() >= 5].reset_index(drop=True)

In [18]:
# Check some examples

get_text_samples(lulu_df, 'clean_text', 5)

Sample text data:


Text sample 1:


aligned midi dress recommendation size work appropriate anybody aligned midi dress wmtm canada brunch back dress wore work hooked love dress another cut colour wear work slit back crazy high anybody love tried purchase



Text sample 2:


align short restock alert align short black back stock size look restocked size



Text sample 3:


cat owner furry friend snag life little rant try not hold cat wearing anything snag easily swiftlys aligns define jacket name piece wear cuddle cat scuba since thicker guess putting zip scuba today noticed many snag snag actually ripped meaning loop happens get snagged get ripped half make sense love cat death anywhere near wearing lmao picture little destroyer



Text sample 4:


dark lavender best colour flattering skin tone vibrant still classic received softstreme set poshmark add collection excited never coloured legging not look double lined dark enough not show much texture love sweetheart bra aligns align sho

In [19]:
# Collect tokenized docs

tokenized_docs = [doc.split() for doc in lulu_df["clean_text"]]

In [20]:
# Function to merge bigrams

def merge_bigrams(doc, bigram_set):
    
    """
    Merge bigrams
    """
    
    merged = []
    i = 0
    while i < len(doc):
        if i < len(doc)-1 and (doc[i], doc[i+1]) in bigram_set:
            merged.append(f"{doc[i]}_{doc[i+1]}")
            i += 2
        else:
            merged.append(doc[i])
            i += 1
    return merged

In [21]:
# Apply bigram detector

bigrams = [list(ngrams(doc, 2)) for doc in tokenized_docs]
flat_bigrams = [bg for doc in bigrams for bg in doc]
bigram_counts = Counter(flat_bigrams)
common_bigrams = {bg for bg, count in bigram_counts.items() if count >= 10}

tokenized_bigrams = [merge_bigrams(doc, common_bigrams) for doc in tokenized_docs]

In [22]:
# Add bigram text back to dataframe

lulu_df["clean_text_bigram"] = [" ".join(doc) for doc in tokenized_bigrams]

In [23]:
# Check some examples

get_text_samples(lulu_df, 'clean_text_bigram', 5)

Sample text data:


Text sample 1:


know totally_different could_keep considering versatility item_work weather justification welcome



Text sample 2:


loving black_fleece ebb gold really pop black great every_day bag



Text sample 3:


question dude male athletic_build love clothing seems woman_pant tights greater quality men seem thicker nicer city_sweat discipline surge pant_fit great discipline seem borderline resemble quality yoga_pant kinda wishing take woman_line add fabric_not tight_also feel_way



Text sample 4:


wonder chrome_aligns price_drop since one buying sorry another_align



Text sample 5:


ootd twilight_rose java half sleeve close_body shelf shirt java_size cinchable_waist high_rise woven_short size twilight_rose shirt soooo_soft love shelf_bra beautiful open_back short comfy





In [24]:
# Get list of docs

docs = lulu_df["clean_text_bigram"].tolist()

## Modeling with BERTopic

In [33]:
# Function to examine basic topic info

def examine_topics(topic_model, topics, probs):
    
    # Topic table and basic stats
    topic_info = topic_model.get_topic_info()
    topic_info.head(10)
    
    # Number of discovered topics (exclude -1 = outliers)
    n_topics = int((topic_info["Topic"] != -1).sum())
    n_docs = len(docs)
    outlier_share = (topics.count(-1) / n_docs) if n_docs else 0.0
    
    print(f"Discovered topics (excl. -1): {n_topics}")
    print(f"Outlier docs (-1): {topics.count(-1)} / {n_docs} = {outlier_share:.1%}")
    
    # Topic size distribution
    display(topic_info[["Topic", "Count"]].head(20))

In [35]:
# Function to check top words for each topic


def check_top_words(topic_model, topics, probs):
    # Refresh topic_info from the current model
    topic_info = topic_model.get_topic_info().copy()
    
    # Get all non-noise topics
    topic_ids = topic_info.loc[topic_info["Topic"] != -1, "Topic"].tolist()
    
    for t in topic_ids:
        # safe size lookup
        size = int(topic_info.loc[topic_info["Topic"] == t, "Count"].iloc[0])
        print("="*80)
        print(f"\nTopic {t} | size={size}")
    
        # top words (guard if topic exists but empty)
        words = topic_model.get_topic(t) or []
        top_terms = ", ".join([w for w, _ in words[:15]]) if words else "(no terms)"
        print("\nTop words:", top_terms)
    
        # representative examples (guard empty)
        reps = (topic_model.get_representative_docs(t) or [])[:3]
        for i, doc in enumerate(reps, 1):
            preview = doc[:300].replace("\n", " ")
            suffix = "..." if len(doc) > 300 else ""
            print(f"\n--- Ex{i}: {preview}{suffix}")
    
    print("\nNon-noise topics:", topic_ids)


In [25]:
# Load baseline sentence embedding model

EMB_NAME = "all-MiniLM-L6-v2"
st_model = SentenceTransformer(EMB_NAME)

In [26]:
# Generate embeddings

embs = st_model.encode(
    docs,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True  # Normalizes vectors for cosine sim
)

Batches:   0%|          | 0/1768 [00:00<?, ?it/s]

In [49]:
# Configure UMAP model

umap_model = UMAP(
    n_neighbors = 30,
    n_components=5,
    min_dist = 0.20,
    metric="cosine",
    random_state=42,
    verbose=False,
)

In [50]:
# Configure DBSCAN model

N = len(docs)

min_cluster_size = max(200, math.floor(0.015 * N))  # ≈0.5% of corpus, at least 50

hdbscan_model = HDBSCAN(
    min_cluster_size=min_cluster_size,
    min_samples= 35,  # defaults to min_cluster_size; set e.g. 10–30 to merge a bit more
    metric="euclidean",
    cluster_selection_method="eom",
    prediction_data=True,
    cluster_selection_epsilon = 0.00
)




In [51]:
# Define vectorizer to keep unigrams and bigrams

vectorizer_model = CountVectorizer(
    ngram_range=(1, 2),
    token_pattern=r"(?u)\b[\w_]{3,}\b",
    stop_words = None,
        max_df = .9)


In [52]:
# Build BERTopic model

topic_model = BERTopic(
    embedding_model=st_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    language="english",
    calculate_probabilities=True,
    verbose=True,
    top_n_words=15
)

In [57]:
# Fit BERTopic model

start = time.time()
    
topic_model, topics, probs = run_bertopic(docs, embs, st_model)
    
end = time.time()
    
runtime = (end-start)/60
print(f"Full runtime: {runtime:.2f} minutes.\n\n\n")

Discovered topics (excl. -1): 11
Outlier docs (-1): 36449 / 56574 = 64.4%


Unnamed: 0,Topic,Count
0,-1,36449
1,0,3354
2,1,2206
3,2,2140
4,3,1998
5,4,1852
6,5,1825
7,6,1686
8,7,1604
9,8,1452


Full runtime: 2.14 minutes.





In [59]:
# Check top words

check_top_words(topic_model, topics, probs)


Topic 0 | size=3354

Top words: bra, sport_bra, energy_bra, cup, ebb_street, high_neck, boob, bra_size, strap, tank_top, cloud_bra, long_line, chest, free_serene, pad

--- Ex1: cup lady high_neck energy_bra tit absolutely none interested high_neck energy_bra cup lady could talk fit_usually size_bra top_wondering run_big cup thanks flat_chested friend advance

--- Ex2: prefer bra removable_cup built cup honestly prefer built cup worry adjusting usually see_outline pad either since built think_bra also combat uni boob providing shape structure wore bra built cup year_however brand mainly offer bra removable_pad give

--- Ex3: difference nonsale energy_bra longline cup wmtm cup energy_bra longline bought floral patterned energy_longline bra_wmtm today didnt pay_much attention noticing say_cup swear_saw bra yesterday full_price listed cup think error worried cup energy_bra longline know_fit definitely cup_version

Topic 1 | size=2206

Top words: gift_card, card, fedex, refund, shipped, sh

## Hyperparameter Tuning and Grid Searches

In [211]:
# Function for grid searches

def run_once(mcs, ms, eps=0.02):
    hdb = HDBSCAN(
        min_cluster_size=mcs,
        min_samples=ms,
        metric="euclidean",
        cluster_selection_method="eom",
        prediction_data=True,
        cluster_selection_epsilon=eps
    )
    vec = CountVectorizer(
        ngram_range=(1, 2),
        token_pattern=r"(?u)\b[\w_]{3,}\b",
        stop_words=None,
        min_df=1,
        max_df=1.0
    )
    mdl = BERTopic(
        umap_model=umap_sub,
        hdbscan_model=hdb,
        vectorizer_model=vec,
        calculate_probabilities=True,
        verbose=False
    )
    t, _ = mdl.fit_transform(docs_big, embs_big)
    info = mdl.get_topic_info()
    k = int((info.Topic != -1).sum())
    noise = float((np.array(t) == -1).mean())
    print(f"mcs={mcs:4d}, min_samples={ms:2d}, eps={eps:.2f}  →  subtopics={k:2d}, noise={noise:.1%}")
    return (k, noise, mdl, t, info)

In [58]:
# Hyper parameter tuning function

def run_bertopic(
    docs, embs, st_model,
    n_neighbors: int = 30,
    min_dist: float = 0.20,
    min_cluster_size: int | None = None,
    min_samples: int = 35,
    cluster_selection_epsilon: float = 0.00,
    random_state: int = 42,
    top_n_words: int = 15,
    method: str = 'leaf'
):
    """
    Build & fit a BERTopic model with simple, optional overrides for UMAP/HDBSCAN.
    Returns: topic_model, topics, probs
    """
    N = len(docs)
    if min_cluster_size is None:
        # your current heuristic (≈1.5% of corpus, at least 200)
        min_cluster_size = max(200, math.floor(0.015 * N))

    umap_model = UMAP(
        n_neighbors=n_neighbors,
        n_components=5,
        min_dist=min_dist,
        metric="cosine",
        random_state=random_state,
        verbose=False,
    )

    hdbscan_model = HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        metric="euclidean",
        cluster_selection_method= method,
        prediction_data=True,
        cluster_selection_epsilon=cluster_selection_epsilon,
    )

    vectorizer_model = CountVectorizer(
        ngram_range=(1, 2),
        token_pattern=r"(?u)\b[\w_]{3,}\b",
        stop_words=None,
        max_df=0.9,
    )

    topic_model = BERTopic(
        embedding_model=st_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        language="english",
        calculate_probabilities=True,
        verbose=True,
        top_n_words=top_n_words,
    )

    topics, probs = topic_model.fit_transform(docs, embs)

    examine_topics(topic_model, topics, probs)
    
    return topic_model, topics, probs

In [208]:
# Grid search on epsilon and min sample says for HDBSCAN

# reuse docs_big, embs_big, umap_sub, M (len(docs_big))
min_cluster_size = max(200, int(0.015* M))  # ~1% of subset size

ms_values  = [25,30,35,40]
eps_values = [0.02]

results = []

for ms in ms_values:
    for eps in eps_values:
        hdb_sub = HDBSCAN(
            min_cluster_size=min_cluster_size,
            min_samples=ms,
            metric="euclidean",
            cluster_selection_method="eom",
            prediction_data=True,
            cluster_selection_epsilon=eps
        )

        vec_sub = CountVectorizer(
            ngram_range=(1, 2),
            token_pattern=r"(?u)\b[\w_]{3,}\b",
            stop_words=None,
            min_df=1,
            max_df=1.0
        )

        sub_model = BERTopic(
            umap_model=umap_sub,     # keep your existing UMAP config
            hdbscan_model=hdb_sub,
            vectorizer_model=vec_sub,
            calculate_probabilities=True,
            verbose=False
        )

        sub_topics, sub_probs = sub_model.fit_transform(docs_big, embs_big)
        sub_info = sub_model.get_topic_info()

        n_subtopics = (sub_info.Topic != -1).sum()
        noise_share = (np.array(sub_topics) == -1).mean()

        results.append((ms, eps, n_subtopics, noise_share))
        print(f"min_samples={ms:2d}, eps={eps:.2f} "
              f"-> subtopics={n_subtopics}, noise={noise_share:.1%}")


min_samples=25, eps=0.02 -> subtopics=10, noise=58.1%
min_samples=30, eps=0.02 -> subtopics=6, noise=52.0%
min_samples=35, eps=0.02 -> subtopics=9, noise=59.7%
min_samples=40, eps=0.02 -> subtopics=7, noise=51.2%


Grid Search Results:

min_samples=20, eps=0.00 -> subtopics=12, noise=55.5%
min_samples=20, eps=0.02 -> subtopics=12, noise=55.5%
min_samples=20, eps=0.05 -> subtopics=12, noise=55.5%
min_samples=25, eps=0.00 -> subtopics=11, noise=56.9% min_samples=25, eps=0.02 -> subtopics=10, noise=58.1%
min_samples=30, eps=0.02 -> subtopics=6, noise=52.0%
min_samples=35, eps=0.02 -> subtopics=9, noise=59.7%
min_samples=40, eps=0.02 -> subtopics=7, noise=51.2%

In [1]:
# Grid search on mcs_fracs, ms_values, and epsilon

M = len(docs_big)

# SMALL grid (aiming to reduce both cluster count and noise)
mcs_fracs = [0.010, 0.012, 0.015, 0.02]      # ≈ 1.0%, 1.2%, 1.5% of subset
ms_values = [40, 45, 50]           # around your previous best
epsilon   = 0.02                       # mild merge tolerance

best = {"noise": 1.0, "k": 10**9, "cfg": None, "model": None, "topics": None, "info": None}

for frac in mcs_fracs:
    mcs = max(120, int(frac * M))
    for ms in ms_values:
        k, noise, mdl, t, info = run_once(mcs, ms, epsilon)
        # choose best by: lowest noise, then fewer clusters
        if (noise < best["noise"]) or (noise == best["noise"] and k < best["k"]):
            best.update({"noise": noise, "k": k, "cfg": (mcs, ms, epsilon),
                         "model": mdl, "topics": t, "info": info})

print("\nBest by (min noise, then fewer clusters):")
print(f"mcs={best['cfg'][0]}, min_samples={best['cfg'][1]}, eps={best['cfg'][2]:.2f}  "
      f"→  subtopics={best['k']}, noise={best['noise']:.1%}")

# Keep best handy if you want to inspect words/examples next:
best_sub_model  = best["model"]
best_sub_topics = best["topics"]
best_sub_info   = best["info"]

NameError: name 'docs_big' is not defined

Grid Search Results:

mcs= 504, min_samples=40, eps=0.02  →  subtopics=12, noise=56.1%
mcs= 504, min_samples=45, eps=0.02  →  subtopics=13, noise=62.8%
mcs= 504, min_samples=50, eps=0.02  →  subtopics=12, noise=55.4%
mcs= 605, min_samples=40, eps=0.02  →  subtopics=10, noise=58.3%
mcs= 605, min_samples=45, eps=0.02  →  subtopics=11, noise=65.0%
mcs= 605, min_samples=50, eps=0.02  →  subtopics=10, noise=57.6%
mcs= 757, min_samples=40, eps=0.02  →  subtopics= 7, noise=51.2%
mcs= 757, min_samples=45, eps=0.02  →  subtopics= 2, noise=33.6%
mcs= 757, min_samples=50, eps=0.02  →  subtopics=10, noise=57.6%
mcs=1009, min_samples=40, eps=0.02  →  subtopics= 7, noise=51.2%
mcs=1009, min_samples=45, eps=0.02  →  subtopics= 7, noise=58.7%
mcs=1009, min_samples=50, eps=0.02  →  subtopics= 7, noise=46.9%

Best by (min noise, then fewer clusters):
mcs=757, min_samples=45, eps=0.02  →  subtopics=2, noise=33.6%

#### Optimized Run Through

In [60]:
# Fit BERTopic model 00

start = time.time()
    
topic_model_00, topics_00, probs_00 = run_bertopic(
    docs, embs, st_model,
    n_neighbors=15,          # tighter neighborhoods
    min_dist=0.08,           # spread clusters out just a bit
    min_cluster_size=int(0.01 * len(docs)),  # clusters absorb more points
    min_samples=20,          # relax noise labeling
    method="leaf"            # keep fine granularity
)
    
end = time.time()
    
runtime = (end-start)/60
print(f"Full runtime: {runtime:.2f} minutes.\n\n\n")

2025-09-06 22:06:53,121 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-09-06 22:08:04,096 - BERTopic - Dimensionality - Completed ✓
2025-09-06 22:08:04,101 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-09-06 22:08:18,447 - BERTopic - Cluster - Completed ✓
2025-09-06 22:08:18,466 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-09-06 22:08:30,283 - BERTopic - Representation - Completed ✓


Discovered topics (excl. -1): 19
Outlier docs (-1): 31075 / 56574 = 54.9%


Unnamed: 0,Topic,Count
0,-1,31075
1,0,2988
2,1,2374
3,2,2363
4,3,1990
5,4,1838
6,5,1784
7,6,1556
8,7,1307
9,8,1262


Full runtime: 1.64 minutes.





In [61]:
# Check top words

check_top_words(topic_model_00, topics_00, probs_00)


Topic 0 | size=2988

Top words: wunder_unders, hotty_hot, liner, waist, high_rise, sizing_help, thigh, waistband, leg, speed_ups, hotty_hots, hip, waist_hip, speed_short, wunder

--- Ex1: wunder_unders flattering flattering_make booty pop aligns_wunder train_love want_legging ancient_copper color_look ancient_copper wunder_unders size_fit wunder_unders flattering_legging thank_much advance

--- Ex2: hotty_hot high_rise usually_size low_rise hotty_hots recently_bought pair_high rise one felt extremely big happen_else size_size high_rise hotty_hots scared small

--- Ex3: hotty_hot brief liner size newbie question_people experience hotty_hot short_true size_sizing hotty_hot short_size despite sizing_find liner tends ride give_wedgie quite_often running_working problem larger butt_hip waist ratio wondering liner brief inch_hotty hots different worth_try tend_prefer hi...

Topic 1 | size=2374

Top words: gift_card, package, card, refund, fedex, email, shipped, shipping, gec, receipt, deliv

In [62]:
# Check topic diversity

TOP_N = 10
words_per_topic = {
    t: [w for w, _ in topic_model.get_topic(t)[:TOP_N]]
    for t in topic_info["Topic"].tolist() if t != -1
}

all_top_words = list(chain.from_iterable(words_per_topic.values()))
diversity = len(set(all_top_words)) / max(1, len(all_top_words))
print(f"Topic diversity (for top {TOP_N} words for each topic): {diversity:.3f}")

Topic diversity (for top 10 words for each topic): 1.000


## Topic Visualizations

In [66]:
# Overall map of topics

topic_model_00.visualize_topics()

In [67]:
# Visualize top words per topic (bar charts)

topic_model_00.visualize_barchart(top_n_topics=20)

In [75]:
# Visualizie topic similarity heatmap

topic_model_00.visualize_heatmap()

In [76]:
# Visualize hierarchy (dendrogram)

topic_model_00.visualize_hierarchy()

## Save topics

In [64]:
# Add topic labels and probabilities to original dataframe 

lulu_df["topic_00"] = topics_00
lulu_df["topic_prob_00"] = probs_00.max(axis=1)

In [65]:
# Save as parquet

lulu_df.to_parquet(f"{PATH}/lulu_df_with_topics_00.parquet", index=False)