# Visualizing Topic Seasonality

In this notebook, we visualize the topics produced by our BERTopic model and examine how topic frequencies shift over time and from season to season.

## Preliminaries

In [24]:
# Imports

# General

import json
import math
import os
import re
from typing import List, Dict, Tuple, Optional

import numpy as np
import pandas as pd

# Time / Grouping
from datetime import datetime, timezone, timedelta
import calendar

# BERTopic (load/use results)
from bertopic import BERTopic

# Plotting
import matplotlib.pyplot as plt             
import plotly.express as px           
import plotly.graph_objects as go
import plotly.io as pio
pio.renderers.default = "notebook"            # Set default plotly renderer

# ================== Utils ==================
from itertools import chain
from collections import Counter, defaultdict
import warnings, random
warnings.filterwarnings("ignore")
SEED = 42
random.seed(SEED)
np.random.seed(SEED)


In [2]:
# Set user's data path.
# Set the LULU_DATA_DIR environment variable to point at your own copy of the
# data; the original author's local directory is kept as the fallback.

PATH = os.environ.get("LULU_DATA_DIR", "C:/Users/emshe/Desktop/BRAINSTATION/LULULEMON/DATA")

## Helper functions 

In [6]:
# Function to clean text

def clean_text(s: Optional[str]) -> Optional[str]:

    '''
    Normalize the whitespace in a string.

    Every run of whitespace (spaces, tabs, newlines, ...) is collapsed into a
    single space, and leading/trailing whitespace is stripped.

    Parameters
    ----------
    s : str or None
        Text to normalize.

    Returns
    -------
    str or None
        The normalized text, or None when `s` is None.
    '''

    if s is None:
        return None
    # `re` comes from the notebook's import cell
    return re.sub(r"\s+", " ", s).strip()

In [7]:
# Function to get datetime from UTC timestamp

def dt_from_epoch(ts: Optional[int]):

    """
    Turn an epoch timestamp given in seconds into a UTC-aware pandas datetime.
    A value of None is passed straight through as None.
    """

    return None if ts is None else pd.to_datetime(ts, unit="s", utc=True)

In [8]:
# Function to examine dataframes

def examine_df(name, df,
               include_stats = True,
               include_sample = True):

    """
    Print a quick overview of a dataframe.

    Parameters
    ----------
    name : str
        Human-readable name of the dataframe, interpolated into the printed
        messages.
    df : pd.DataFrame
        Dataframe to inspect.
    include_stats : bool, default True
        Also display `df.describe()`.
    include_sample : bool, default True
        Also display the first five rows.

    Returns
    -------
    None
        Everything is printed / displayed as a side effect.
    """

    print(f"\n\nNumber of records in the {name} is: {len(df)}\n")
    print(f"\nNumber of features in the {name} is: {len(df.columns)}\n")
    print(f"The columns in the {name} are: {df.columns}\n")
    print(f"\n Other info about {name}:\n")
    # DataFrame.info() prints its report itself and returns None, so wrapping
    # it in display() only rendered a stray "None" below the report.
    df.info()
    if include_stats:
        print(f'\n Basic statistical info about {name}:\n')
        display(df.describe())
    if include_sample:
        print(f"\n\nSample of records in the {name}:")
        display(df.head(5))

In [9]:
# Function to get sample from text column

def get_text_samples(df: pd.DataFrame, text_col: str, n: int) -> None:

    '''
    Print up to n randomly sampled values from a text column of a dataframe.

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe holding the text column.
    text_col : str
        Name of the column to sample from.
    n : int
        Number of samples requested; capped at the number of rows in `df`.

    Raises
    ------
    ValueError
        If `text_col` is not a column of `df`.
    '''

    if text_col not in df.columns:
        raise ValueError(f"Column '{text_col}' not found in dataframe")

    # Global pandas display option; kept because callers may rely on wide
    # columns in later dataframe displays. The samples below are printed as
    # plain strings and are never truncated by pandas.
    pd.set_option('display.max_colwidth', None)

    # Series.sample raises when asked for more rows than exist, so cap n.
    n_samples = min(n, len(df))

    # Sample and print the full text of each selected row
    print("Sample text data:\n\n")
    sample = df[text_col].sample(n_samples)
    for i, description in enumerate(sample, 1):
        print(f"Text sample {i}:\n\n\n{description}\n\n\n")

In [10]:
# Function for categorical bar graph

def bar_graph(df: pd.DataFrame, col: str) -> None:

    """
    Draw a bar graph of the value counts of a categorical column.

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe holding the column.
    col : str
        Name of the categorical column to plot.

    Raises
    ------
    ValueError
        If `col` is not a column of `df`.
    """

    if col not in df.columns:
        raise ValueError(f"Column '{col}' not found in dataframe")

    # Count from the dataframe that was passed in (this previously read an
    # unrelated global named `posts_df`, which this notebook never defines).
    counts = df[col].value_counts()
    
    plt.figure(figsize=(10,6))
    counts.plot(kind="bar")
    plt.title(f"Distribution of {col.title()}")
    plt.xlabel(f"{col.title()}")
    plt.ylabel("Count")
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.show()

In [11]:
# Define function to plot histogram for numeric columns

def histogram(df: pd.DataFrame, 
             col: str,
            bins: int = 30,
             log: bool = False) -> None:
    
    """
    Draw a histogram of a numeric dataframe column (missing values ignored).

    `bins` sets the number of bins; `log=True` is forwarded to the histogram
    call and switches the y-axis label accordingly.
    Raises ValueError when `col` is not a column of `df`.
    """
    
    column_missing = col not in df.columns
    if column_missing:
        raise ValueError(f"Column '{col}' not found in dataframe")
    
    plt.figure(figsize=(8, 5))
    values = df[col].dropna()
    values.hist(bins=bins, edgecolor="black", log=log)

    pretty_name = col.title()
    plt.title(f"Histogram of {pretty_name}")
    plt.xlabel(pretty_name)
    if log:
        plt.ylabel("Log(Frequency)")
    else:
        plt.ylabel("Frequency")
    plt.tight_layout()
    plt.show()

## Load and Inspect Data

In [13]:
# Load clean data: one parquet file under PATH, read with the fastparquet engine

lulu_df = pd.read_parquet(f"{PATH}/lulu_df_with_topics_00.parquet", engine = 'fastparquet')

In [14]:
# Examine data: record/feature counts, column info, summary statistics and the first rows

examine_df('lulu dataframe', lulu_df)



Number of records in the lulu dataframe is: 56574


Number of features in the lulu dataframe is: 10

The columns in the lulu dataframe are: Index(['post_id', 'timestamp', 'title', 'text', 'score', 'num_comments',
       'clean_text', 'clean_text_bigram', 'topic_00', 'topic_prob_00'],
      dtype='object')


 Other info about lulu dataframe:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56574 entries, 0 to 56573
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype              
---  ------             --------------  -----              
 0   post_id            56574 non-null  object             
 1   timestamp          56574 non-null  datetime64[ns, UTC]
 2   title              56574 non-null  object             
 3   text               56574 non-null  object             
 4   score              56574 non-null  int64              
 5   num_comments       56574 non-null  int64              
 6   clean_text         56574 non-null  object             
 7   

None


 Basic statistical info about lulu dataframe:



Unnamed: 0,score,num_comments,topic_00,topic_prob_00
count,56574.0,56574.0,56574.0,56574.0
mean,23.851805,14.901227,2.289903,0.353285
std,88.07185,40.71089,5.006362,0.360782
min,0.0,0.0,-1.0,0.000437
25%,1.0,2.0,-1.0,0.079974
50%,3.0,6.0,-1.0,0.176474
75%,14.0,13.0,4.0,0.553981
max,11864.0,1987.0,18.0,1.0




Sample of records in the lulu dataframe:


Unnamed: 0,post_id,timestamp,title,text,score,num_comments,clean_text,clean_text_bigram,topic_00,topic_prob_00
0,eii06s,2020-01-01 12:46:35+00:00,Major problem falling down leggings?,"Hello, over the last year I have been ordering...",0,6,major problem falling legging hello last year ...,major problem falling legging_hello last_year ...,17,1.0
1,eijtca,2020-01-01 16:00:56+00:00,Tops for yoga,I have a couple swiftly tech racerbacks for ho...,3,4,top yoga couple swiftly tech racerbacks hot yo...,top yoga couple swiftly_tech racerbacks hot_yo...,-1,0.156652
2,eikiew,2020-01-01 16:59:27+00:00,ABC Pants - Sizing,"Hey all,\n\nI recently received ABC pants (siz...",1,6,abc pant sizing hey recently received abc pant...,abc_pant sizing_hey recently_received abc_pant...,-1,0.073371
3,eil4bb,2020-01-01 17:46:20+00:00,Certain Aligns colours with thicker fabric?,Hi lemonheads :D\n\nI was wondering if anyone ...,3,11,certain aligns colour thicker fabric lemonhead...,certain aligns_colour thicker_fabric lemonhead...,-1,0.051949
4,ein4er,2020-01-01 20:16:05+00:00,Mens Commission Pant Slim,Got a giftcard for lulu because my family know...,3,5,men commission pant slim giftcard family know ...,men_commission pant_slim giftcard family know_...,-1,0.035935


In [15]:
# Copy original dataframe (DataFrame.copy() is deep by default, so the copy is independent of lulu_df)

og_lulu_df = lulu_df.copy()

## Cleaning to Visualize

In [124]:
# Drop unnecessary columns for visualization.
# Of the ten loaded columns only 'timestamp' and 'topic_00' are kept.

irr_columns = ['post_id', 'title', 'text', 'score', 'num_comments',
       'clean_text', 'clean_text_bigram', 'topic_prob_00']

vis_df = lulu_df.drop(columns = irr_columns)

In [125]:
# Rename topic column ('topic_00' -> 'topic'); rename() returns a new dataframe

vis_df = vis_df.rename(columns={'topic_00': 'topic'})

In [81]:
# Examine vis_df (pass a real name so the printed messages read correctly)

examine_df('visualization dataframe', vis_df)



Number of records in the  is: 22370


Number of features in the  is: 2

The columns in the  are: Index(['timestamp', 'topic'], dtype='object')


 Other info about :

<class 'pandas.core.frame.DataFrame'>
Index: 22370 entries, 0 to 56573
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype              
---  ------     --------------  -----              
 0   timestamp  22370 non-null  datetime64[ns, UTC]
 1   topic      22370 non-null  int64              
dtypes: datetime64[ns, UTC](1), int64(1)
memory usage: 524.3 KB


None


 Basic statistical info about :



Unnamed: 0,topic
count,22370.0
mean,5.776039
std,5.232944
min,0.0
25%,2.0
50%,4.0
75%,9.0
max,18.0




Sample of records in the :


Unnamed: 0,timestamp,topic
0,2020-01-01 12:46:35+00:00,17
11,2020-01-02 18:39:05+00:00,17
12,2020-01-02 19:24:16+00:00,10
13,2020-01-02 23:45:09+00:00,0
15,2020-01-03 04:31:33+00:00,6


In [134]:
# Create topic labels
# Maps each topic id found in the 'topic' column (0-18) to a human-readable
# label. Topic -1 has no entry; rows carrying it are filtered out before the
# labels are applied.

topic_labels = {
    0: "Shorts/Leggings",
    1: "Orders/Returns",
    2: "Colors",
    3: "Hoodies/Fleeces",
    4: "Accessories",
    5: "Joggers/Pants",
    6: "Sports Bras",
    7: "Sales/Events",
    8: "Laundry",
    9: "T-Shirts",
    10: "Tanks",
    11: "Discounts/Jobs",
    12: "Community",
    13: "Define Jackets",
    14: "Outerwear",
    15: "Hemming",
    16: "Yoga/Athletic Wear",
    17: "Underwear",
    18: "Skirts/Dresses"
}


In [133]:
# Define mapping of topic ids to merged categories.
# Topic ids that do not appear here keep their own label from topic_labels.

merge_map = {
    # Disabled merge, kept for reference:
    # Orders/Returns (1) + Sales/Events (7) → "Sales/Events"
    # 1: "Sales/Events",
    # 7: "Sales/Events",

    # Define Jackets (13) + Outerwear (14) → "Outerwear"
    13: "Outerwear",
    14: "Outerwear",

    # Yoga/Athletic Wear (16) + Skirts/Dresses (18) + T-Shirts (9) → "Yoga/Gym Wear"
    16: "Yoga/Gym Wear",
    18: "Yoga/Gym Wear",
    9: "Yoga/Gym Wear",

    # Tanks (10) + Sports Bras (6) → "Tanks/Sports Bras"
    10: "Tanks/Sports Bras",
    6: "Tanks/Sports Bras",
}

In [136]:
# Apply merges and drops, keeping original labels for others

# Topic ids left out of the charts: -1 (no label), 8 Laundry, 11 Discounts/Jobs,
# 12 Community, 15 Hemming, 17 Underwear
exc_topics = [-1, 8, 11, 12, 15, 17]

# Rebuild from lulu_df so this cell gives the same result however often it is
# re-run. The .copy() after the boolean filter makes vis_df its own dataframe,
# so adding a column below does not write into a slice of another frame.
vis_df = (
    lulu_df
    .drop(columns=irr_columns)
    .rename(columns={'topic_00': 'topic'})
    .loc[lambda frame: ~frame['topic'].isin(exc_topics)]
    .copy()
)

# Merged label when the topic id is in merge_map, otherwise its own label.
# topic_labels is only consulted for ids that are not merged.
vis_df["topic_merged"] = vis_df["topic"].map(
    lambda topic_id: merge_map[topic_id] if topic_id in merge_map else topic_labels[topic_id]
)

## Visualization Functions

In [1]:
# Function to plot topic counts seasonally

def plot_topic_counts(df, topic_col="topic_merged", season=None, top_n=6):
    """
    Show a bar chart of topic frequencies, most frequent topic first.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a datetime column named "timestamp" (only read when a
        season is given) and the column named by `topic_col`.
    topic_col : str, default "topic_merged"
        Column holding the topic label to count.
    season : str or None, default None
        One of "winter" (Dec-Feb), "spring" (Mar-May), "summer" (Jun-Aug),
        "fall"/"autumn" (Sep-Nov); case and surrounding whitespace are
        ignored. None plots all rows.
    top_n : int or None, default 6
        Keep only the `top_n` most frequent topics; None keeps all of them.

    Returns
    -------
    None
        The figure is shown with `fig.show()`.

    Raises
    ------
    ValueError
        If `season` is not one of the accepted names.
    """
    # --- Season lookup: months of the year + emoji used in the title ---
    season_table = {
        "winter": ([12, 1, 2], "❄️❄️❄️"),
        "spring": ([3, 4, 5], "🌸🌸🌸"),
        "summer": ([6, 7, 8], "☀️☀️☀️"),
        "fall": ([9, 10, 11], "🍂🍂🍂"),
    }
    # "autumn" is an alias of "fall" and shares its months and emoji
    season_table["autumn"] = season_table["fall"]

    if season is not None:
        s = str(season).strip().lower()
        if s not in season_table:
            raise ValueError("season must be one of {'winter','spring','summer','fall','autumn'} or None")
        months, emoji = season_table[s]
        df_plot = df[df["timestamp"].dt.month.isin(months)]
        title_suffix = f" — {s.capitalize()} {emoji}"
    else:
        df_plot = df
        title_suffix = ""

    # --- Counts (largest → smallest); value_counts() already sorts descending ---
    topic_counts = (
        df_plot[topic_col]
        .value_counts()
        .rename_axis(topic_col)
        .reset_index(name="count")
    )

    if top_n is not None:
        topic_counts = topic_counts.head(top_n)

    # --- Stable distinct colors tied to merged labels ---
    # Built from the full `df` (not the season slice) so a topic keeps the
    # same color in every seasonal chart.
    cats_all = sorted(df[topic_col].dropna().astype(str).unique())
    palette = px.colors.qualitative.Dark24
    color_map = {cat: palette[i % len(palette)] for i, cat in enumerate(cats_all)}

    fig = px.bar(
        topic_counts,
        x=topic_col,
        y="count",
        color=topic_col,
        color_discrete_map=color_map,
        title=f"Topic Frequency{title_suffix}",
        labels={topic_col: "Topic", "count": "Frequency"},
    )

    # No legend (the x-axis already names each bar) and no y tick labels:
    # the chart emphasizes relative, not absolute, frequency.
    fig.update_layout(
        showlegend=False,
        xaxis_title="Topic",
        yaxis_title=None
    )
    fig.update_yaxes(showticklabels=False)
    fig.show()

## Topic Bar Charts

In [168]:
# Plot topic counts over all rows (no season filter; top 6 topics by default)

plot_topic_counts(vis_df)

## Spring

In [169]:
# Plot topic counts for spring (March-May)

plot_topic_counts(vis_df, season = 'spring')

## Summer

In [170]:
# Plot topic counts for summer (June-August)

plot_topic_counts(vis_df, season = 'summer')

## Fall

In [171]:
# Plot topic counts for fall (September-November)

plot_topic_counts(vis_df, season = 'fall')

## Winter

In [172]:
# Plot topic counts for winter (December-February)

plot_topic_counts(vis_df, season = 'winter')

## Topic Bar Chart Animation

In [176]:
# Function to create animation

def animate_topics_over_time(df, topic_col="topic_merged", top_n=None,
                             frame_duration=600, transition_duration=300):
    """
    Show a month-by-month animated bar chart of topic frequencies.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a datetime column named "timestamp" and the column named
        by `topic_col`.
    topic_col : str, default "topic_merged"
        Column holding the topic label to count.
    top_n : int or None, default None
        Restrict the chart to the `top_n` topics with the highest overall
        count; None keeps every topic.
    frame_duration : int, default 600
        Milliseconds each frame is shown when the play button is used.
    transition_duration : int, default 300
        Milliseconds spent transitioning between frames.

    Returns
    -------
    None
        The figure is shown with `fig.show()`.

    Raises
    ------
    ValueError
        If there is nothing to count (no rows with a timestamp and a topic).
    """
    # --- Count posts per (month, topic) ---
    counts = (
        df.assign(month=df["timestamp"].dt.to_period("M").dt.to_timestamp())
          .groupby(["month", topic_col])
          .size()
    )
    if counts.empty:
        raise ValueError("no rows to animate: the dataframe has no (timestamp, topic) pairs")

    # --- Topics ordered by overall frequency (desc), optionally the top N only ---
    # The same order is used for the x axis in every frame.
    totals = counts.groupby(level=topic_col).sum().sort_values(ascending=False)
    if top_n is not None:
        totals = totals.head(top_n)
    overall_order = totals.index.tolist()

    # --- Give every month a row for every topic ---
    # Plotly Express animations expect the same categories in each frame, so
    # (month, topic) pairs with no posts become explicit zero-height bars.
    months = counts.index.get_level_values("month").unique().sort_values()
    full_index = pd.MultiIndex.from_product(
        [months, overall_order], names=["month", topic_col]
    )
    monthly = counts.reindex(full_index, fill_value=0).reset_index(name="count")

    # --- Distinct color map (non-spectral), tied to topic labels ---
    cats_all = sorted(monthly[topic_col].dropna().astype(str).unique())
    palette = px.colors.qualitative.Dark24  # distinct colors
    color_map = {cat: palette[i % len(palette)] for i, cat in enumerate(cats_all)}

    # --- Fix y-range across frames so bars don't "jump" visually ---
    y_max = max(int(monthly["count"].max()), 1) * 1.05

    # --- Build animation ---
    # Use formatted month strings for cleaner slider labels
    monthly["month_label"] = monthly["month"].dt.strftime("%Y-%m")

    fig = px.bar(
        monthly,
        x=topic_col,
        y="count",
        color=topic_col,
        animation_frame="month_label",
        category_orders={topic_col: overall_order},
        color_discrete_map=color_map,
        title="Topic Frequency — Monthly Animation ❄️🌸☀️🍂",
        labels={topic_col: "Topic", "count": "Frequency"},
        range_y=[0, y_max],
    )

    # Tidy aesthetics: no legend, hide y-axis tick labels (relative emphasis)
    fig.update_layout(
        showlegend=False,
        xaxis_title="Topic",
        yaxis_title=None,
        margin=dict(l=20, r=20, t=60, b=20)
    )
    fig.update_yaxes(showticklabels=False)

    # Smoother animation settings. The play button may be absent (for example
    # when the data covers a single month), so only tune it when it exists.
    if fig.layout.updatemenus:
        fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = frame_duration  # ms per frame
        fig.layout.updatemenus[0].buttons[0].args[1]["transition"]["duration"] = transition_duration

    fig.show()

In [177]:
# Animate all topics month-by-month (the default top_n=None keeps every topic)

animate_topics_over_time(vis_df)