## Preliminaries

In [48]:
# Imports

# General
import json
import zstandard as zstd
import pandas as pd
import numpy as np
import time
from datetime import datetime, timezone, timedelta
from typing import Optional

# Plotting
import matplotlib.pyplot as plt

# NLP
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.models import Phrases
from gensim.models.phrases import Phraser

In [2]:
# Set user's data path
# NOTE: this is a machine-specific absolute path; change it to wherever the
# lululemon submissions dump lives on your machine before running the notebook.

PATH = "C:/Users/emshe/Desktop/BRAINSTATION/LULULEMON/DATA/TORRENTED/reddit/subreddits24/lululemon_submissions"

In [16]:
# Download NLTK files (run once)
# "stopwords" backs stopwords.words("english"); "wordnet" is the corpus that
# WordNetLemmatizer looks words up in. Without it, lemmatizing raises a
# LookupError on a machine where the corpus has never been downloaded.

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\emshe\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

## Helper functions 

In [3]:
# Function to clean text

def clean_text(s: str | None) -> str | None:
    
    '''
    Clean string by substituting spaces for problematic characters
    '''
    
    if s is None:
        return None
    s = re.sub(r"\s+", " ", s).strip()
    return s

In [4]:
# Function to get datetime from UTC timestamp

def dt_from_epoch(ts: Optional[int]):
    """
    Turn an epoch timestamp (in seconds) into a UTC-aware pandas datetime.

    Returns None when ts is None; otherwise the result of
    pd.to_datetime(ts, unit="s", utc=True).
    """
    return None if ts is None else pd.to_datetime(ts, unit="s", utc=True)

In [5]:
# Function to examine dataframes

def examine_df(name, df,
               include_stats=True,
               include_sample=True):
    """
    Print a quick overview of a dataframe.

    Parameters
    ----------
    name : label used for the dataframe in the printed headings.
    df : the dataframe to inspect.
    include_stats : when truthy, also display df.describe().
    include_sample : when truthy, also display the first 5 rows.

    Returns
    -------
    None. Everything is written to the notebook output.
    """

    print(f"\n\nNumber of records in the {name} is: {len(df)}\n")
    print(f"\nNumber of features in the {name} is: {len(df.columns)}\n")
    print(f"The columns in the {name} are: {df.columns}\n")
    print(f"\n Other info about {name}:\n")
    # DataFrame.info() prints its report itself and returns None, so it is
    # called directly: wrapping it in display() rendered a stray "None".
    df.info()
    if include_stats:
        print(f'\n Basic statistical info about {name}:\n')
        display(df.describe())
    if include_sample:
        print(f"\n\nSample of records in the {name}:")
        display(df.head(5))

In [6]:
# Function to get sample from text column

def get_text_samples(df: pd.DataFrame, text_col: str, n: int,
                     random_state: Optional[int] = None) -> None:

    '''
    Print up to n randomly sampled values from a text column in a dataframe.

    Parameters
    ----------
    df : dataframe holding the text column.
    text_col : name of the column to sample from.
    n : number of samples requested; capped at the number of rows in df.
    random_state : optional seed passed to Series.sample so the same rows
        can be drawn again. The default (None) keeps the draw random.
    '''

    # Ensure pandas doesn't truncate text. This changes the global display
    # option, so it stays in effect for later cells as well.
    pd.set_option('display.max_colwidth', None)

    # Series.sample raises ValueError when asked for more rows than exist,
    # so never request more than the dataframe holds.
    n_samples = min(n, len(df))

    # Sample the column and print each selected text in full
    print("Sample text data:\n\n")
    sample = df[text_col].sample(n_samples, random_state=random_state)
    for i, description in enumerate(sample, 1):
        print(f"Text sample {i}:\n\n\n{description}\n\n\n")

In [7]:
# Function for categorical bar graph

def bar_graph(df: pd.DataFrame, col: str) -> None:

    """
    Generate a bar graph of value counts for a categorical column in a dataframe.

    Parameters
    ----------
    df : dataframe holding the column.
    col : name of the categorical column to plot.

    Raises
    ------
    ValueError : if col is not a column of df.
    """

    if col not in df.columns:
        raise ValueError(f"Column '{col}' not found in dataframe")

    # Count on the dataframe that was passed in (the previous version read a
    # global named posts_df and ignored the df argument).
    counts = df[col].value_counts()

    plt.figure(figsize=(10,6))
    counts.plot(kind="bar")
    plt.title(f"Distribution of {col.title()}")
    plt.xlabel(f"{col.title()}")
    plt.ylabel("Count")
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.show()

In [8]:
# Define function to plot histogram for numeric columns

def histogram(df: pd.DataFrame, col: str, bins: int = 30, log: bool = False) -> None:
    """
    Draw and show a histogram of one column of a dataframe, ignoring missing values.

    Parameters
    ----------
    df : dataframe holding the column.
    col : name of the column to plot.
    bins : number of histogram bins.
    log : passed through to the plot as its log flag; also switches the
        y-axis label to "Log(Frequency)".

    Raises
    ------
    ValueError : if col is not a column of df.
    """
    if col not in df.columns:
        raise ValueError(f"Column '{col}' not found in dataframe")

    pretty_name = col.title()
    if log:
        y_label = "Log(Frequency)"
    else:
        y_label = "Frequency"

    plt.figure(figsize=(8, 5))
    values = df[col].dropna()
    values.hist(bins=bins, edgecolor="black", log=log)
    plt.title(f"Histogram of {pretty_name}")
    plt.xlabel(pretty_name)
    plt.ylabel(y_label)
    plt.tight_layout()
    plt.show()

In [9]:
# Function to load ndjson

def load_plain_ndjson(path: str, limit: Optional[int] = None) -> pd.DataFrame:

    """
    Load a plain-text NDJSON file line by line into a DataFrame.

    Parameters
    ----------
    path : path of a UTF-8 text file holding one JSON object per line.
        Blank lines are skipped.
    limit : maximum number of records to load. None (or 0) loads the whole file.

    Returns
    -------
    DataFrame with one row per record and the columns post_id, timestamp,
    author, title, text, score, num_comments, permalink and subreddit.
    Fields missing from a record come through as None. The columns are
    present even when no records were read.
    """

    columns = [
        "post_id", "timestamp", "author", "title", "text",
        "score", "num_comments", "permalink", "subreddit",
    ]

    rows = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            obj = json.loads(line)

            rows.append({
                "post_id": obj.get("id"),
                "timestamp": dt_from_epoch(obj.get("created_utc")),
                "author": obj.get("author"),
                "title": obj.get("title"),
                "text": obj.get("selftext"),
                "score": obj.get("score"),
                "num_comments": obj.get("num_comments"),
                "permalink": obj.get("permalink"),
                "subreddit": obj.get("subreddit"),
            })

            # Count loaded records, not physical lines, so skipped blank
            # lines do not eat into the limit.
            if limit and len(rows) >= limit:
                break

    return pd.DataFrame(rows, columns=columns)

## Load and Inspect Data

In [21]:
# Load clean data
# The parquet file is given by relative path, so it is resolved against the
# notebook's current working directory.

lulu_df = pd.read_parquet("lululemon_submissions_clean.parquet")

In [11]:
# Examine data: record and feature counts, column info, summary statistics,
# and the first rows of the loaded dataframe

examine_df('lulu dataframe', lulu_df)



Number of records in the lulu dataframe is: 57984


Number of features in the lulu dataframe is: 6

The columns in the lulu dataframe are: Index(['post_id', 'timestamp', 'title', 'text', 'score', 'num_comments'], dtype='object')


 Other info about lulu dataframe:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57984 entries, 0 to 57983
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype              
---  ------        --------------  -----              
 0   post_id       57984 non-null  object             
 1   timestamp     57984 non-null  datetime64[ns, UTC]
 2   title         57984 non-null  object             
 3   text          57984 non-null  object             
 4   score         57984 non-null  int64              
 5   num_comments  57984 non-null  int64              
dtypes: datetime64[ns, UTC](1), int64(2), object(3)
memory usage: 2.7+ MB


None


 Basic statistical info about lulu dataframe:



Unnamed: 0,score,num_comments
count,57984.0,57984.0
mean,23.446071,14.705126
std,87.240166,40.279924
min,0.0,0.0
25%,1.0,2.0
50%,3.0,6.0
75%,13.0,13.0
max,11864.0,1987.0




Sample of records in the lulu dataframe:


Unnamed: 0,post_id,timestamp,title,text,score,num_comments
0,eielly,2020-01-01 05:33:25+00:00,Monthly Sales Post- January,FS: Aligns sz 4,1,7
1,eii06s,2020-01-01 12:46:35+00:00,Major problem falling down leggings?,"Hello, over the last year I have been ordering...",0,6
2,eijtca,2020-01-01 16:00:56+00:00,Tops for yoga,I have a couple swiftly tech racerbacks for ho...,3,4
3,eikiew,2020-01-01 16:59:27+00:00,ABC Pants - Sizing,"Hey all,\n\nI recently received ABC pants (siz...",1,6
4,eil4bb,2020-01-01 17:46:20+00:00,Certain Aligns colours with thicker fabric?,Hi lemonheads :D\n\nI was wondering if anyone ...,3,11


In [18]:
# Copy original dataframe
# Keeps an untouched snapshot, since the preprocessing cells below add columns
# to lulu_df and drop rows from it.

og_lulu_df = lulu_df.copy()

## Preprocessing

In [42]:
# Define stop word list, lemmatizer, and regex

# Stopwords: NLTK's English list without the negations, plus brand tokens
custom_stop_words = ['lululemon', 'lulu']  # Add any brand boilerplate tokens here
base_stops = set(stopwords.words("english")) - {"no", "nor", "not", "never"}  # Keep negations
stop_words = base_stops | set(custom_stop_words)

lemmatizer = WordNetLemmatizer()

# Precompile regex
_link = re.compile(r'https?://\S+|www\.\S+')   # http(s):// and www. links
_nonalpha = re.compile(r'[^a-z\s]')            # anything that is not a-z or whitespace
_spaces = re.compile(r'\s+')                   # runs of whitespace

In [43]:
# Define text preprocessor

def preprocess(text: str) -> str:
    """
    Turn raw text into a space-separated string of cleaned tokens for modeling.

    Steps: lowercase, replace links with a space, replace every character that
    is not a-z or whitespace with a space, drop stop words and tokens shorter
    than 3 characters, then lemmatize what is left.

    Returns an empty string when text is not a str.
    """
    if not isinstance(text, str):
        return ""

    lowered = text.lower()
    without_links = _link.sub(" ", lowered)
    letters_only = _nonalpha.sub(" ", without_links)

    # The length check runs on the token as written, before lemmatization.
    kept = [
        lemmatizer.lemmatize(word)
        for word in letters_only.split()
        if word not in stop_words and len(word) >= 3
    ]
    return _spaces.sub(" ", " ".join(kept)).strip()

In [44]:
# Apply text preprocessing to the title and body of each post, joined by a space

lulu_df["clean_text"] = (
    lulu_df["title"].fillna("") + " " + lulu_df["text"].fillna("")
).apply(preprocess)

# drop docs with <5 tokens to reduce noise
has_enough_tokens = lulu_df["clean_text"].str.split().str.len() >= 5
lulu_df = lulu_df[has_enough_tokens].reset_index(drop=True)

In [46]:
# Check some examples: print 5 sampled documents from the preprocessed text column

get_text_samples(lulu_df, 'clean_text', 5)

Sample text data:


Text sample 1:


fell gymsharks black friday sale nope soo ordered gymshark set black friday sale curious wanted try got adapt marl seamless legging bra top material itchy squat seam feel look like pulling apart followed size chart wont lie look cute quality not def overhyped imo feel like couple year ago think decent try back anyone else similar experience gymshark anyone actually like would love hear thought



Text sample 2:


store try softstreme pintuck midrise pant bone need good steam excuse wrinkle pick probably comfy importantly versatile back pocket make look like trouser



Text sample 3:


good morning leg day today track high rise short white free wild bra dark forest green



Text sample 4:


scuba hoodie define jacket actual hoodies sweater looking scuba hoodie scuba half zip black debating two like better tend prefer crop leaning towards zip want hear pro con



Text sample 5:


bone nulu set purchased back wmtm bone french terry shrug flow nulu nulu

## Build Dictionary and Corpus

In [47]:
# Collect tokenized docs: one list of whitespace-separated tokens per row of clean_text

tokenized_docs = [doc.split() for doc in lulu_df["clean_text"]]

In [49]:
# Build bigram detector
# min_count: word pairs seen fewer than this many times in total are ignored.
# threshold: score a pair must reach to be merged into a phrase (higher means fewer phrases).

bigram = Phrases(tokenized_docs, min_count=10, threshold=10)  
bigram_mod = Phraser(bigram)

# Apply the trained model: detected pairs come back as one token joined by "_"
# (e.g. "swiftly_tech" in the samples printed below)
tokenized_bigrams = [bigram_mod[doc] for doc in tokenized_docs]

In [50]:
# Add bigram text back to dataframe as space-joined strings, one per document.
# tokenized_bigrams was built from lulu_df["clean_text"] row by row, so the order matches.

lulu_df["clean_text_bigram"] = [" ".join(doc) for doc in tokenized_bigrams]

In [51]:
# Check some examples: print 5 sampled documents from the bigram text column

get_text_samples(lulu_df, 'clean_text_bigram', 5)

Sample text data:


Text sample 1:


faded_zap ebb available dublin stone ridge mall location stopped several faded_zap merlot golden_sand black tidewater_teal ebb call send_sale



Text sample 2:


true trouser crop wondering anyone knew inseam not regular_length cropped one



Text sample 3:


size size aligned_midi dress size size love see belly_button would sizing_help



Text sample 4:


morning run fit swiftly_tech short_sleeve shirt race_length speed



Text sample 5:


anyone_know style top found top social_medium similar beyond_yoga fit included release cannot_find much thanks



