## Preliminaries

In [1]:
# Imports

# General
import json
import zstandard as zstd
import pandas as pd
import numpy as np
import time
from datetime import datetime, timezone, timedelta
from typing import Optional

# Plotting
import matplotlib.pyplot as plt


# NLP
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.models import Phrases, CoherenceModel
from gensim.models.phrases import Phraser
from gensim.corpora import Dictionary

In [2]:
# Set user's data path

PATH = f"C:/Users/emshe/Desktop/BRAINSTATION/LULULEMON/DATA"

In [52]:
# Download NLTK files (run once)

# nltk.download('stopwords')

## Helper functions 

In [3]:
# Function to clean text

def clean_text(s: str | None) -> str | None:
    
    '''
    Clean string by substituting spaces for problematic characters
    '''
    
    if s is None:
        return None
    s = re.sub(r"\s+", " ", s).strip()
    return s

In [4]:
# Function to get datetime from UTC timestamp

def dt_from_epoch(ts: Optional[int]):

    """
    Convert timestamp to pd.datetime format
    """

    
    if ts is None:
        return None
    return pd.to_datetime(ts, unit="s", utc=True)

In [5]:
# Function to examine dataframes

def examine_df(name,df,
               include_stats = True,
               include_sample = True):
    
    """
    Check basic info about a dataframe df
    """
    
    print(f"\n\nNumber of records in the {name} is: {len(df)}\n")
    print(f"\nNumber of features in the {name} is: {len(df.columns)}\n")
    print(f"The columns in the {name} are: {df.columns}\n")
    print(f"\n Other info about {name}:\n")
    display(df.info())
    if include_stats == True:
        print(f'\n Basic statistical info about {name}:\n')
        display(df.describe())
    if include_sample == True:
        print(f"\n\nSample of records in the {name}:")
        display(df.head(5))

In [6]:
# Function to get sample from text column

def get_text_samples(df: pd.DataFrame, text_col: str, n: int) -> None:

    '''
    Print n samples from a text column in a dataframe
    '''

    # Ensure pandas doesn't truncate text
    pd.set_option('display.max_colwidth', None)
    
    # Sample and print 5 full negative reviews
    print("Sample text data:\n\n")
    sample = df[text_col].sample(n)
    for i, description in enumerate(sample, 1):
        print(f"Text sample {i}:\n\n\n{description}\n\n\n")

In [7]:
# Function for categorical bar graph

def bar_graph(df: pd.DataFrame, col: str) -> None:

    """
    Generate bar graph for categorical column in a dataframe
    """

    if col not in df.columns:
        raise ValueError(f"Column '{col}' not found in dataframe")

    counts = posts_df[col].value_counts()
    
    plt.figure(figsize=(10,6))
    counts.plot(kind="bar")
    plt.title(f"Distribution of {col.title()}")
    plt.xlabel(f"{col.title()}")
    plt.ylabel("Count")
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.show()

In [8]:
# Define function to plot histogram for numeric columns

def histogram(df: pd.DataFrame, 
             col: str,
            bins: int = 30,
             log: bool = False) -> None:
    
    """
    Generate a histogram for a numeric column in a dataframe.
    """
    
    if col not in df.columns:
        raise ValueError(f"Column '{col}' not found in dataframe")
    
    plt.figure(figsize=(8, 5))
    df[col].dropna().hist(bins=bins, edgecolor="black", log=log)
    plt.title(f"Histogram of {col.title()}")
    plt.xlabel(col.title())
    plt.ylabel("Log(Frequency)" if log else "Frequency")
    plt.tight_layout()
    plt.show()

In [9]:
# Function to load ndjson

def load_plain_ndjson(path: str, limit: Optional[int] = None) -> pd.DataFrame:
    
    """
    Load a plain-text NDJSON file line by line into a DataFrame.
    """
    
    rows = []
    with open(path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f, start=1):
            if not line.strip():
                continue
            obj = json.loads(line)

            rows.append({
                "post_id": obj.get("id"),
                "timestamp": dt_from_epoch(obj.get("created_utc")),
                "author": obj.get("author"),
                "title": obj.get("title"),
                "text": obj.get("selftext"),
                "score": obj.get("score"),
                "num_comments": obj.get("num_comments"),
                "permalink": obj.get("permalink"),
                "subreddit": obj.get("subreddit"),
            })

            if limit and i >= limit:
                break

    return pd.DataFrame(rows)

## Load and Inspect Data

In [10]:
# Load clean data

lulu_df = pd.read_parquet(f"{PATH}/lululemon_submissions_clean.parquet")

In [11]:
# Examine data

examine_df('lulu dataframe', lulu_df)



Number of records in the lulu dataframe is: 57984


Number of features in the lulu dataframe is: 6

The columns in the lulu dataframe are: Index(['post_id', 'timestamp', 'title', 'text', 'score', 'num_comments'], dtype='object')


 Other info about lulu dataframe:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57984 entries, 0 to 57983
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype              
---  ------        --------------  -----              
 0   post_id       57984 non-null  object             
 1   timestamp     57984 non-null  datetime64[ns, UTC]
 2   title         57984 non-null  object             
 3   text          57984 non-null  object             
 4   score         57984 non-null  int64              
 5   num_comments  57984 non-null  int64              
dtypes: datetime64[ns, UTC](1), int64(2), object(3)
memory usage: 2.7+ MB


None


 Basic statistical info about lulu dataframe:



Unnamed: 0,score,num_comments
count,57984.0,57984.0
mean,23.446071,14.705126
std,87.240166,40.279924
min,0.0,0.0
25%,1.0,2.0
50%,3.0,6.0
75%,13.0,13.0
max,11864.0,1987.0




Sample of records in the lulu dataframe:


Unnamed: 0,post_id,timestamp,title,text,score,num_comments
0,eielly,2020-01-01 05:33:25+00:00,Monthly Sales Post- January,FS: Aligns sz 4,1,7
1,eii06s,2020-01-01 12:46:35+00:00,Major problem falling down leggings?,"Hello, over the last year I have been ordering...",0,6
2,eijtca,2020-01-01 16:00:56+00:00,Tops for yoga,I have a couple swiftly tech racerbacks for ho...,3,4
3,eikiew,2020-01-01 16:59:27+00:00,ABC Pants - Sizing,"Hey all,\n\nI recently received ABC pants (siz...",1,6
4,eil4bb,2020-01-01 17:46:20+00:00,Certain Aligns colours with thicker fabric?,Hi lemonheads :D\n\nI was wondering if anyone ...,3,11


In [11]:
# Copy original dataframe

og_lulu_df = lulu_df.copy()

## Preprocessing

In [12]:
# Define stop word list, lemmatizer, and regex

# Stopwords
custom_stop_words = ['lululemon', 'lulu','amp','xx','lol', "like", "get", "got", "would", "anyone", "one"]  # Add any brand boilerplate tokens here

base_stops = set(stopwords.words("english"))
base_stops -= {"no", "nor", "not", "never"}       # Keep negations
stop_words = base_stops.union(custom_stop_words)

lemmatizer = WordNetLemmatizer()

# Precompile regex
_link = re.compile(r'https?://\S+|www\.\S+')
_nonalpha = re.compile(r'[^a-z\s]')
_spaces = re.compile(r'\s+')

In [13]:
# Define text preprocessor

def preprocess(text: str) -> str:

    """
    Preprocess text before modeling
    """
    
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = _link.sub(" ", text)         # Remove links
    text = _nonalpha.sub(" ", text)     # Keep only letters/spaces
    tokens = []
    for t in text.split():
        if t in stop_words or len(t) < 3:
            continue
        t = lemmatizer.lemmatize(t)
        tokens.append(t)
    return _spaces.sub(" ", " ".join(tokens)).strip()

In [14]:
# Apply text preprocessing

lulu_df["clean_text"] = lulu_df["title"].fillna("") + " " + lulu_df["text"].fillna("")
lulu_df["clean_text"] = lulu_df["clean_text"].apply(preprocess)

# drop docs with <5 tokens to reduce noise
lulu_df = lulu_df[lulu_df["clean_text"].str.split().str.len() >= 5].reset_index(drop=True)

In [73]:
# Check some examples

get_text_samples(lulu_df, 'clean_text', 5)

Sample text data:


Text sample 1:


hemming wunder train short fix sausage leg issue know hem bring store hemming tights top read leg opening squeeze even ppl skinny leg wondering hemming cut higher wider point solve issue tried checked pair sale



Text sample 2:


exchange policy lemon change policy exchange gave hard time exchangeinf pair pant anyway made show receipt



Text sample 3:


hat golf fianc birthday coming mentioned needing new hat golf currently lll suggestion thanks



Text sample 4:


define hooded jacket nulu question define hooded nulu jacket length wise regular define jacket found size regular define size cropped define



Text sample 5:


otf crop otf short weird sizing otf crop short luxtreme arrived today loved style colour fit super baggy front area looked camel toe extra baggy material front crotch area size style normally bottom may order instead





## Build Dictionary and Corpus

In [15]:
# Collect tokenized docs

tokenized_docs = [doc.split() for doc in lulu_df["clean_text"]]

In [16]:
# Build bigram detector

bigram = Phrases(tokenized_docs, min_count=10, threshold=10)  
bigram_mod = Phraser(bigram)

# Apply the trained model
tokenized_bigrams = [bigram_mod[doc] for doc in tokenized_docs]

In [17]:
# Add bigram text back to dataframe

lulu_df["clean_text_bigram"] = [" ".join(doc) for doc in tokenized_bigrams]

In [78]:
# Check some examples

get_text_samples(lulu_df, 'clean_text_bigram', 5)

Sample text data:


Text sample 1:


desert_teal come view_poll



Text sample 2:


cropped full_length deleted_view poll



Text sample 3:


wear date_brown fell_love date_brown wunder_train store idea color pair dark blue navy night_sea black horrible color coordinating unless easy color color pair date_brown



Text sample 4:


align keyhole strappy racerback mesh_panel tight obsessed item



Text sample 5:


lay employee following growth slowdown fear weighing stock two consecutive year increasing annual sale management see sale growing current fiscal year largely thanks challenging consumer environment shift consumer behavior late navigating slower start year market ceo calvin mcdonald said march earnings call





In [18]:
# Collect tokenized docs with bigrams

tokenized_docs = [doc.split() for doc in lulu_df["clean_text_bigram"]]

In [19]:
# Create dictionary mapping

dictionary = Dictionary(tokenized_docs)

In [20]:
# Filter extremes to keep vocab manageable

dictionary.filter_extremes(no_below=30, no_above=0.5)  

# no_below = n keep words that appear in at least n docs (too rare)
# no_above = m → drop words that appear in > m% of docs (too common)

print(f"Dictionary size: {len(dictionary)} unique tokens")

Dictionary size: 4754 unique tokens


In [21]:
# Convert docs to Bag-of-Words using the dictionary

corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]


In [58]:
# Inspect the first document’s representation

print("First doc BoW:", corpus[0])
print("Decoded:", [(dictionary[id], freq) for id, freq in corpus[0]])


First doc BoW: [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)]
Decoded: [('aligns', 1), ('january', 1), ('monthly', 1), ('post', 1), ('sale', 1)]


## Modeling with Latent Dirichlet Allocation

In [22]:
# Function to test coherence for different numbers of topics

def coherence_test(dictionary, corpus, texts, limit, start=5, step=5):

    """
    Define function to evaluate LDA model with coherence metric
    """

    full_start = time.time()
    
    results = []
    for num_topics in range(start, limit+1, step):

        start = time.time()
        
        lda = LdaModel(corpus=corpus,
                       id2word=dictionary,
                       num_topics=num_topics,
                       passes=10,
                       random_state=42)
        coherence_model = CoherenceModel(model=lda, texts=texts,
                                         dictionary=dictionary, coherence="c_v")
        
        coherence = coherence_model.get_coherence()
        
        results.append((num_topics, coherence))

        end = time.time()
        runtime = (end - start)/60
        
        print(f"Topics: {num_topics}, Coherence: {coherence:.4f}, Runtime: {runtime:.2f} minutes\n\n")
    
    full_end = time.time()
    runtime = (full_end - full_start)/60

    print(f"\nFull coherence test completed in {runtime:.2f} minutes.")

    return results

In [23]:
# Test different topic counts in terms of coherence

results = coherence_test(dictionary, corpus, tokenized_bigrams, start= 5 , limit=25, step=1)

Topics: 5, Coherence: 0.5289, Runtime: 3.27 minutes


Topics: 6, Coherence: 0.5501, Runtime: 3.39 minutes


Topics: 7, Coherence: 0.5097, Runtime: 3.32 minutes


Topics: 8, Coherence: 0.5351, Runtime: 3.36 minutes


Topics: 9, Coherence: 0.5346, Runtime: 3.39 minutes


Topics: 10, Coherence: 0.6016, Runtime: 3.30 minutes


Topics: 11, Coherence: 0.5253, Runtime: 3.19 minutes


Topics: 12, Coherence: 0.5793, Runtime: 3.20 minutes


Topics: 13, Coherence: 0.5608, Runtime: 3.39 minutes


Topics: 14, Coherence: 0.5491, Runtime: 3.19 minutes


Topics: 15, Coherence: 0.5608, Runtime: 3.30 minutes


Topics: 16, Coherence: 0.5716, Runtime: 3.29 minutes


Topics: 17, Coherence: 0.5345, Runtime: 3.27 minutes


Topics: 18, Coherence: 0.5287, Runtime: 3.36 minutes


Topics: 19, Coherence: 0.5541, Runtime: 3.23 minutes


Topics: 20, Coherence: 0.5130, Runtime: 3.29 minutes


Topics: 21, Coherence: 0.5550, Runtime: 3.27 minutes


Topics: 22, Coherence: 0.5125, Runtime: 3.28 minutes


Topics: 23, Coh

Results:

Topics: 5, Coherence: 0.5289, Runtime: 3.27 minutes


Topics: 6, Coherence: 0.5501, Runtime: 3.39 minutes


Topics: 7, Coherence: 0.5097, Runtime: 3.32 minutes


Topics: 8, Coherence: 0.5351, Runtime: 3.36 minutes


Topics: 9, Coherence: 0.5346, Runtime: 3.39 minutes


Topics: 10, Coherence: 0.6016, Runtime: 3.30 minutes


Topics: 11, Coherence: 0.5253, Runtime: 3.19 minutes


Topics: 12, Coherence: 0.5793, Runtime: 3.20 minutes


Topics: 13, Coherence: 0.5608, Runtime: 3.39 minutes


Topics: 14, Coherence: 0.5491, Runtime: 3.19 minutes


Topics: 15, Coherence: 0.5608, Runtime: 3.30 minutes


Topics: 16, Coherence: 0.5716, Runtime: 3.29 minutes


Topics: 17, Coherence: 0.5345, Runtime: 3.27 minutes


Topics: 18, Coherence: 0.5287, Runtime: 3.36 minutes


Topics: 19, Coherence: 0.5541, Runtime: 3.23 minutes


Topics: 20, Coherence: 0.5130, Runtime: 3.29 minutes


Topics: 21, Coherence: 0.5550, Runtime: 3.27 minutes


Topics: 22, Coherence: 0.5125, Runtime: 3.28 minutes


Topics: 23, Coherence: 0.5115, Runtime: 3.31 minutes


Topics: 24, Coherence: 0.5296, Runtime: 3.30 minutes


Topics: 25, Coherence: 0.5327, Runtime: 3.32 minutes



Full coherence test completed in 69.20 minutes.

In [88]:
# Test different topic counts in terms of coherence

results = coherence_test(dictionary, corpus, tokenized_bigrams, start= 10 , limit=20, step=5)

Topics: 10, Coherence: 0.6016, Runtime: 3.74 minutes


Topics: 15, Coherence: 0.5608, Runtime: 3.75 minutes


Topics: 20, Coherence: 0.5123, Runtime: 3.71 minutes



Full coherence test completed in 11.20 minutes.


In [None]:
# Test other different topic counts in terms of coherence

results_01 = coherence_test(dictionary, corpus, tokenized_bigrams, start= 7 , limit=13, step=2)

Topics: 7, Coherence: 0.5097, Runtime: 3.23 minutes


Topics: 9, Coherence: 0.5346, Runtime: 3.23 minutes




In [60]:
# Fit LDA Model

start = time.time()

lda = LdaModel(
    corpus = corpus,
    id2word = dictionary,
    num_topics = 10,   # Tune
    passes = 10,
    random_state = 2025
)

end = time.time()

runtime = (end - start)/60

print(f"LDA model fit in {runtime:.2f} minutes.")

LDA model fit in 3.414894700050354 minutes.


In [62]:
# Show topics

for idx, topic in lda.print_topics(num_topics=10, num_words=10):
    print(f"Topic {idx}: {topic}\n\n")

Topic 0: 0.012*"work" + 0.011*"wear" + 0.010*"not" + 0.010*"get" + 0.010*"wash" + 0.009*"would" + 0.009*"use" + 0.009*"like" + 0.008*"day" + 0.008*"need"


Topic 1: 0.045*"size" + 0.032*"legging" + 0.023*"fit" + 0.012*"like" + 0.011*"would" + 0.011*"wunder_train" + 0.011*"align" + 0.010*"wear" + 0.010*"get" + 0.010*"waist"


Topic 2: 0.041*"color" + 0.026*"anyone" + 0.018*"look" + 0.016*"like" + 0.015*"find" + 0.014*"see" + 0.014*"pic" + 0.013*"photo" + 0.013*"item" + 0.012*"picture"


Topic 3: 0.046*"black" + 0.036*"align" + 0.020*"white" + 0.015*"short" + 0.012*"high_rise" + 0.012*"skirt" + 0.012*"color" + 0.012*"bra" + 0.011*"wunder_train" + 0.011*"espresso"


Topic 4: 0.134*"size" + 0.038*"bra" + 0.031*"fit" + 0.026*"top" + 0.018*"tank" + 0.014*"shirt" + 0.014*"align" + 0.012*"sizing" + 0.011*"align_tank" + 0.010*"wear"


Topic 5: 0.141*"short" + 0.074*"pant" + 0.038*"men" + 0.026*"jogger" + 0.023*"softstreme" + 0.020*"woman" + 0.018*"logo" + 0.017*"shirt" + 0.015*"length" + 0.015*

## Modeling with BERTopic