## Preliminaries

In [72]:
# Imports

# ============ General ============
import json
import pandas as pd
import numpy as np
import time
from datetime import datetime, timezone, timedelta
from typing import Optional, List
import math
from itertools import chain

# ============ Plotting ============
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.renderers.default = "browser"

# ============ Text Preprocessing  ============
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import ngrams
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

# spaCy for lemmatization/POS filtering
try:
    import spacy
    _SPACY_AVAILABLE = True
except ImportError:
    _SPACY_AVAILABLE = False

# ============ BERTopic stack ============
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN

# Repro + warnings
import random
import warnings
warnings.filterwarnings("ignore")
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Ensure NLTK resources are available
# for _res in ["stopwords", "wordnet", "omw-1.4", "punkt"]:
#     try:
#         nltk.data.find(f"corpora/{_res}")
#     except LookupError:
#         nltk.download(_res)


In [24]:
# Set user's data path

PATH = f"C:/Users/emshe/Desktop/BRAINSTATION/LULULEMON/DATA"

In [3]:
# Download NLTK files (run once)

# nltk.download('stopwords')

## Helper functions 

In [4]:
# Function to clean text

def clean_text(s: str | None) -> str | None:
    
    '''
    Clean string by substituting spaces for problematic characters
    '''
    
    if s is None:
        return None
    s = re.sub(r"\s+", " ", s).strip()
    return s

In [5]:
# Function to get datetime from UTC timestamp

def dt_from_epoch(ts: Optional[int]):

    """
    Convert timestamp to pd.datetime format
    """

    
    if ts is None:
        return None
    return pd.to_datetime(ts, unit="s", utc=True)

In [6]:
# Function to examine dataframes

def examine_df(name,df,
               include_stats = True,
               include_sample = True):
    
    """
    Check basic info about a dataframe df
    """
    
    print(f"\n\nNumber of records in the {name} is: {len(df)}\n")
    print(f"\nNumber of features in the {name} is: {len(df.columns)}\n")
    print(f"The columns in the {name} are: {df.columns}\n")
    print(f"\n Other info about {name}:\n")
    display(df.info())
    if include_stats == True:
        print(f'\n Basic statistical info about {name}:\n')
        display(df.describe())
    if include_sample == True:
        print(f"\n\nSample of records in the {name}:")
        display(df.head(5))

In [7]:
# Function to get sample from text column

def get_text_samples(df: pd.DataFrame, text_col: str, n: int) -> None:

    '''
    Print n samples from a text column in a dataframe
    '''

    # Ensure pandas doesn't truncate text
    pd.set_option('display.max_colwidth', None)
    
    # Sample and print 5 full negative reviews
    print("Sample text data:\n\n")
    sample = df[text_col].sample(n)
    for i, description in enumerate(sample, 1):
        print(f"Text sample {i}:\n\n\n{description}\n\n\n")

In [8]:
# Function for categorical bar graph

def bar_graph(df: pd.DataFrame, col: str) -> None:

    """
    Generate bar graph for categorical column in a dataframe
    """

    if col not in df.columns:
        raise ValueError(f"Column '{col}' not found in dataframe")

    counts = posts_df[col].value_counts()
    
    plt.figure(figsize=(10,6))
    counts.plot(kind="bar")
    plt.title(f"Distribution of {col.title()}")
    plt.xlabel(f"{col.title()}")
    plt.ylabel("Count")
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.show()

In [9]:
# Define function to plot histogram for numeric columns

def histogram(df: pd.DataFrame, 
             col: str,
            bins: int = 30,
             log: bool = False) -> None:
    
    """
    Generate a histogram for a numeric column in a dataframe.
    """
    
    if col not in df.columns:
        raise ValueError(f"Column '{col}' not found in dataframe")
    
    plt.figure(figsize=(8, 5))
    df[col].dropna().hist(bins=bins, edgecolor="black", log=log)
    plt.title(f"Histogram of {col.title()}")
    plt.xlabel(col.title())
    plt.ylabel("Log(Frequency)" if log else "Frequency")
    plt.tight_layout()
    plt.show()

In [10]:
# Function to load ndjson

def load_plain_ndjson(path: str, limit: Optional[int] = None) -> pd.DataFrame:
    
    """
    Load a plain-text NDJSON file line by line into a DataFrame.
    """
    
    rows = []
    with open(path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f, start=1):
            if not line.strip():
                continue
            obj = json.loads(line)

            rows.append({
                "post_id": obj.get("id"),
                "timestamp": dt_from_epoch(obj.get("created_utc")),
                "author": obj.get("author"),
                "title": obj.get("title"),
                "text": obj.get("selftext"),
                "score": obj.get("score"),
                "num_comments": obj.get("num_comments"),
                "permalink": obj.get("permalink"),
                "subreddit": obj.get("subreddit"),
            })

            if limit and i >= limit:
                break

    return pd.DataFrame(rows)

## Load and Inspect Data

In [13]:
# Load clean data

lulu_df = pd.read_parquet(f"{PATH}/lululemon_submissions_clean.parquet", engine = 'fastparquet')

In [89]:
# Examine data

examine_df('lulu dataframe', lulu_df)



Number of records in the lulu dataframe is: 57984


Number of features in the lulu dataframe is: 6

The columns in the lulu dataframe are: Index(['post_id', 'timestamp', 'title', 'text', 'score', 'num_comments'], dtype='object')


 Other info about lulu dataframe:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57984 entries, 0 to 57983
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype              
---  ------        --------------  -----              
 0   post_id       57984 non-null  object             
 1   timestamp     57984 non-null  datetime64[ns, UTC]
 2   title         57984 non-null  object             
 3   text          57984 non-null  object             
 4   score         57984 non-null  int64              
 5   num_comments  57984 non-null  int64              
dtypes: datetime64[ns, UTC](1), int64(2), object(3)
memory usage: 2.7+ MB


None


 Basic statistical info about lulu dataframe:



Unnamed: 0,score,num_comments
count,57984.0,57984.0
mean,23.446071,14.705126
std,87.240166,40.279924
min,0.0,0.0
25%,1.0,2.0
50%,3.0,6.0
75%,13.0,13.0
max,11864.0,1987.0




Sample of records in the lulu dataframe:


Unnamed: 0,post_id,timestamp,title,text,score,num_comments
0,eielly,2020-01-01 05:33:25+00:00,Monthly Sales Post- January,FS: Aligns sz 4,1,7
1,eii06s,2020-01-01 12:46:35+00:00,Major problem falling down leggings?,"Hello, over the last year I have been ordering lululemon stuff online as there is no store nearby. I have been trying different sizes and overall I feel their leggings are pretty bad at holding up. Especially wunder under and all the right places. I have wu in luon which is fine, and aligns are ok, new in movement seem to be ok. But align and luon get pilling issues, in movement fabric gets dust/feather sticking to it after 2 wears. In the same the luxtreme fabric in wunder under.. impossible. I don't feel i will get in the smaller size, as my thighs are huge (smaller waist). Meanwhile I tried leggings from other brands... Eg Alo yoga, very similar model (extreme high waist airlift vs super high waist wu ) and the others are performing great. In alo i have leggings in s,m and l size,none is falling. I think i am getting really disappointed. It's very addictive to shop from lululemon and they look great in the mirror, but at the same time I cannot see the worth in comfort and performance during practice (yoga, both classical and more powerful,fitness) . Am I doing something wrong? I am afraid to invest even more $$, to try more models and risk having the same flows... I am also considering redesigning WU, adding a stich or elastic band to their waist..anyone having experience with that?",0,6
2,eijtca,2020-01-01 16:00:56+00:00,Tops for yoga,I have a couple swiftly tech racerbacks for hot yoga and just ordered a swiftly breeze tank because I’m wanting something with a bit more coverage (higher neckline). I find I’m adjusting my racerbacks more than I’d like in yoga but will keep wearing for hot yoga since they wick sweat so well. \n\nWhat are everyone’s favorite tops/tanks to wear for yoga? I’m a 34D so I worry about having enough coverage and not falling out of bras and tops. I have the free to be serene bra but I don’t like wearing it for yoga for this reason. Would love to find a great top (preferably a tank) that I don’t need to adjust or worry about with all the forward folds and down dogs. I knew lulu makes high neck bras but I don’t like the idea of something tight across my collarbone.,3,4
3,eikiew,2020-01-01 16:59:27+00:00,ABC Pants - Sizing,"Hey all,\n\nI recently received ABC pants (size 32) from the store, and they fit well. However, I’ve heard from multiple friends that they lose their shape and stretch out/can become baggy after a couple of weeks-months of wear. Should I go back to the store and try a size 31 in anticipation of this? \n\nIf not, will lululemon replace these pants should they lose their shape? \n\nThanks!",1,6
4,eil4bb,2020-01-01 17:46:20+00:00,Certain Aligns colours with thicker fabric?,"Hi lemonheads :D\n\nI was wondering if anyone has bought a pair of aligns that seems to be made from a thicker fabric than the usual Nulu. I think I saw a post a while ago where a few people thought their aligns were thicker, they were all dark red colour but I don't remember the exact name (not Garnet!)\n\nI recently purchased a pair of full length Aligns off Poshmark in the colour Graphite, but they also seem to be thicker, almost luon. I thought maybe they were WUs, but don't believe WUs come in that colour? I've attached a photo of the size dot (it looks real to me?), though I'm not sure how to read it or where I would check. Any advice or thoughts would be great! \n\nhttps://preview.redd.it/8zq5q04lf7841.jpg?width=3024&amp;format=pjpg&amp;auto=webp&amp;s=acaa777817edaeacc94b1effefd42b6f3d4f8c18",3,11


In [15]:
# Copy original dataframe

og_lulu_df = lulu_df.copy()

## Preprocessing

In [176]:
# Reset dataframe

lulu_df = og_lulu_df.copy()

In [177]:
# Define stop word list, lemmatizer, and regex

# Stopwords
custom_stop_words = [
    # brand/boilerplate
    "lululemon", "lulu", "amp", "xx", "lol",
    "like", "get", "got", "would", "anyone", "one",

    # deletion/removal artifacts
    "deleted", "remove", "removed", "removal",
    "deleted_view", "removed_view", "view_poll", "poll_view",
    "deleted_view_poll", "removed_view_poll",
    "view", "poll", "results", "result", "vote", "votes",
    "thread", "post", "posting", "posted", "comment", "comments",

    # generic low-information Reddit junk
    "http", "https", "www", "com",
    "imgur", "jpg", "png", "gif",
    "subreddit", "reddit", "mod", "mods",
    "link", "links",

    # Scraped filler 
    "user", "account", "profile",
    "page", "site", "website",
    "viewed", "views", "seen"
]

base_stops = set(stopwords.words("english"))
base_stops -= {"no", "nor", "not", "never"}       # Keep negations

stop_words = list(base_stops.union(custom_stop_words))

lemmatizer = WordNetLemmatizer()

# Precompile regex
_link = re.compile(r'https?://\S+|www\.\S+')
_nonalpha = re.compile(r'[^a-z\s]')
_spaces = re.compile(r'\s+')

In [178]:
# Define text preprocessor

def preprocess(text: str) -> str:

    """
    Preprocess text before modeling
    """
    
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = _link.sub(" ", text)         # Remove links
    text = _nonalpha.sub(" ", text)     # Keep only letters/spaces
    tokens = []
    for t in text.split():
        if t in stop_words or len(t) < 3:
            continue
        t = lemmatizer.lemmatize(t)
        tokens.append(t)
    return _spaces.sub(" ", " ".join(tokens)).strip()

In [179]:
# Apply text preprocessing

lulu_df["clean_text"] = lulu_df["title"].fillna("") + " " + lulu_df["text"].fillna("")
lulu_df["clean_text"] = lulu_df["clean_text"].apply(preprocess)

# drop docs with <5 tokens to reduce noise
lulu_df = lulu_df[lulu_df["clean_text"].str.split().str.len() >= 5].reset_index(drop=True)

In [162]:
# Check some examples

get_text_samples(lulu_df, 'clean_text', 5)

Sample text data:


Text sample 1:


iso asia fit angel foot tall fellow petite addict know hemming regular fit not work loose around ankle live sydney australia asia fit available look inch wunder unders longer stocked dying hand couple pair hoping someone help thank



Text sample 2:


intent jogger luon thinking picking pair



Text sample 3:


dollar giftcard contacted gec wanting exchange pair legging agent super precious decided offer egiftcard maybe inconvenience fee smt sent form fill normal never heard



Text sample 4:


sizing advice scuba full zip hoodies oversized regular fit long time lurker finally decided want hooded jacket autumn weather asking sizing advice full zip oversized full zip regular hoodie pretty short usually wear top also ideally hoodie cover waistband pant presentation might wanna throw top room cold regular fit look cover well proportion look throwing seems slim hold weight lower half muscly arm well not sure look good tight arm bottom near hip loose els

In [180]:
# Function to merge bigrams

def merge_bigrams(doc, bigram_set, stop_words = stop_words):
    """
    Merge bigrams, then remove any that are still in stop_words
    """
    merged = []
    i = 0
    while i < len(doc):
        if i < len(doc)-1 and (doc[i], doc[i+1]) in bigram_set:
            candidate = f"{doc[i]}_{doc[i+1]}"
            if candidate not in stop_words:
                merged.append(candidate)
            i += 2
        else:
            token = doc[i]
            if token not in stop_words:
                merged.append(token)
            i += 1
    return merged

In [181]:
# Collect tokenized docs

tokenized_docs = [doc.split() for doc in lulu_df["clean_text"]]

In [182]:
# Apply bigram detector

bigrams = [list(ngrams(doc, 2)) for doc in tokenized_docs]
flat_bigrams = [bg for doc in bigrams for bg in doc]
bigram_counts = Counter(flat_bigrams)

common_bigrams = {bg for bg, count in bigram_counts.items() if count >= 10}
tokenized_bigrams = [merge_bigrams(doc, common_bigrams) for doc in tokenized_docs]

In [184]:
# Add bigram text back to dataframe

lulu_df["clean_text_bigram"] = [" ".join(doc) for doc in tokenized_bigrams]

In [185]:
# Check some examples

get_text_samples(lulu_df, 'clean_text_bigram', 5)

Sample text data:


Text sample 1:


need_help fit_comparison ready_rulu jogger_align jogger_align jogger_size think_fit pretty_well not_much room butt ready_rulus compare align_jogger size_well size



Text sample 2:


visa card getting denied poking around sub determine possible sign pointed yes could_use visa_gift debit_card buy_stuff however keep_getting declined despite purchasing card today activated trying troubleshoot seems need assign billing_address visa card however went_back card add zip_code button nowhere_found guessing made stupid update regardless know workaround card accepted card purchased target back_card mybalancenow



Text sample 3:


lounge legging_curious else_disappointed current selection legging exception aligns_seem workout type_legging fairly similar hoping_find cotton lounge legging_maybe luon_material seems right everything slick spandexy material_aligns great super delicate everyday_wear moved away cotton loungy legging_seems almost_exclusively geared to

In [187]:
# Text used for embeddings: light cleaning only (keep function words, no underscores)

embed_docs = (
    lulu_df["title"].fillna("") + " " + lulu_df["text"].fillna("")
).str.replace(r'https?://\S+|www\.\S+', ' ', regex=True).str.strip()

# Text used for c-TF-IDF (vectorizer): your bigram-merged tokens

vectorizer_docs = lulu_df["clean_text_bigram"].tolist()

# Compute embeddings on embed_docs, but fit BERTopic on vectorizer_docs

embs = st_model.encode(embed_docs.tolist(),
                       batch_size=64, show_progress_bar=True, normalize_embeddings=True)

Batches: 100%|███████████████████████████████████████████████████████████████████████| 884/884 [11:22<00:00,  1.29it/s]


## Modeling with BERTopic: Phase One

In [233]:
# Load baseline sentence embedding model

EMB_NAME = "all-MiniLM-L6-v2"
st_model = SentenceTransformer(EMB_NAME)

In [273]:
# Configure backend UMAP + HDBSCAN models

# Larger min_cluster_size = merge small clusters
min_cluster_size = max(100, int(0.012 * N))  # 1% of corpus

umap_model = UMAP(
    n_neighbors=40,     # ↑ = broader neighborhood → merge clusters
    n_components=2,     # compress more tightly
    min_dist=0.05,      # slight spread so clusters don’t overfragment
    metric="cosine",
    random_state=42,
)

hdbscan_model = HDBSCAN(
    min_cluster_size=min_cluster_size,
    min_samples=35,     # ↑ = stricter cluster acceptance, fewer outliers
    metric="euclidean",
    cluster_selection_method="eom",
    cluster_selection_epsilon=0.1,  # gentle merging of border clusters
    prediction_data=True,
)




In [274]:
# Define vectorizer to keep unigrams and bigrams

vectorizer_model = CountVectorizer(
    ngram_range=(1, 2),
    token_pattern=r"(?u)\b\w+\b",
    stop_words=stop_words,
    strip_accents="unicode",
    min_df=1,      # integer 1 is always safe when n_topics is small
    max_df=.95,    # or 0.95 if you still want to cap very common terms
    max_features=30000,
)


In [275]:
# Build BERTopic model

topic_model = BERTopic(
    embedding_model=st_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    language="english",
    calculate_probabilities=True,
    top_n_words=15,
    verbose=True,
    nr_topics=None    # let clustering decide
)

In [276]:
# Fit BERTopic model

start = time.time()

topics, probs = topic_model.fit_transform(vectorizer_docs, embs)

end = time.time()

runtime = (end-start)/60

print(f"Full runtime: {runtime:.2f} minutes.")

2025-09-06 14:31:07,838 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-09-06 14:32:46,541 - BERTopic - Dimensionality - Completed ✓
2025-09-06 14:32:46,544 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-09-06 14:32:54,307 - BERTopic - Cluster - Completed ✓
2025-09-06 14:32:54,324 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-09-06 14:33:02,628 - BERTopic - Representation - Completed ✓


Full runtime: 1.92 minutes.


In [277]:
# Examine basic topic info

# Topic table and basic stats
topic_info = topic_model.get_topic_info()
topic_info.head(10)

# Number of discovered topics (exclude -1 = outliers)
n_topics = int((topic_info["Topic"] != -1).sum())
n_docs = len(docs)
outlier_share = (topics.count(-1) / n_docs) if n_docs else 0.0

print(f"Discovered topics (excl. -1): {n_topics}")
print(f"Outlier docs (-1): {topics.count(-1)} / {n_docs} = {outlier_share:.1%}")

# Topic size distribution
topic_info[["Topic", "Count"]].head(20)

Discovered topics (excl. -1): 2
Outlier docs (-1): 0 / 56574 = 0.0%


Unnamed: 0,Topic,Count
0,0,55739
1,1,835


In [222]:
# Check topic diversity

TOP_N = 10
words_per_topic = {
    t: [w for w, _ in topic_model.get_topic(t)[:TOP_N]]
    for t in topic_info["Topic"].tolist() if t != -1
}

all_top_words = list(chain.from_iterable(words_per_topic.values()))
diversity = len(set(all_top_words)) / max(1, len(all_top_words))
print(f"Topic diversity (for top {TOP_N} words for each topic): {diversity:.3f}")

Topic diversity (for top 10 words for each topic): 1.000


In [221]:
# Check top words for each topic

# Top-k topics by size (excluding -1)
topk = topic_info.query("Topic != -1").head(10)["Topic"].tolist()

for t in topk:
    print("="*80)
    print(f"\nTopic {t} | size={int(topic_info.loc[topic_info['Topic']==t, 'Count'])}")
    print("\nTop words:", ", ".join([w for w, _ in topic_model.get_topic(t)[:15]]))
    
    # Representative examples
    reps = topic_model.get_representative_docs(t)[:3]
    for i, doc in enumerate(reps, 1):
        preview = doc[:300].replace("\n", " ")
        suffix = "..." if len(doc) > 300 else ""
        print(f"\n--- Ex{i}: {preview}{suffix}")



Topic 0 | size=50714

Top words: legging, pant, high_rise, aligns, jacket, bra, fast_free, tank, jogger, shirt, sizing, energy_bra, scuba, long_sleeve, white

--- Ex1: usa_drop drop_start time_today pst bra_align reversible_bra black tigre_camo deep_coal multi_ebb street_bra green_fern energy_bra chroma multi_flow bra green_fern free_serene bra_lavender dew free_serene inflorescence_multi free_wild lavender_dew run_time bra desert_sun top tee ripened_raspberry bac...

--- Ex2: collection review_wunder train instill align round collection step low_buy thought_share favourite attribute pro_con legging ive lucky acquire appreciating hopefully_help deciding style_wunder train recent legging_already became holy_grail material equally soft_align provides much_support not move w...

--- Ex3: wonder woman_wear legging took pace_breaker hemmed feel dont give men train leg shorter short besides fast_free deciding wear hottie_hots hiking leg_day thinking winter utah men compression_legging annoy

## Topic Visualizations

In [135]:
# Overall map of topics

topic_model.visualize_topics()

TypeError: argument of type 'NoneType' is not iterable

In [84]:
# Visualize top words per topic (bar charts)

topic_model.visualize_barchart(top_n_topics=20)

In [75]:
# Visualizie topic similarity heatmap

topic_model.visualize_heatmap()

In [76]:
# Visualize hierarchy (dendrogram)

topic_model.visualize_hierarchy()

In [85]:
# Save topics

df_out = lulu_df.copy()
df_out["topic"] = topics
df_out["topic_prob_max"] = [float(p.max()) if p is not None else None for p in probs]

# Optional: keep a cleaner subset for inspection
cols_show = ["topic", "topic_prob_max"]
if "title" in df_out:
    cols_show = ["title"] + cols_show
if "subreddit" in df_out.columns:
    cols_show = ["subreddit"] + cols_show
if "created_utc" in df_out.columns:
    cols_show = ["created_utc"] + cols_show

df_out[cols_show].head(10)

# Save artifacts
topic_info.to_csv(f"{PATH}/bertopic_topic_info_00.csv", index=False)
df_out.to_csv(f"{PATH}/reddit_posts_with_topics_00.csv", index=False)
print("Saved: bertopic_topic_info.csv, reddit_posts_with_topics.csv")

NameError: name 'df' is not defined