# Text Scraping Lululemon's Subreddit

## Preliminaries

In [28]:
# Imports

from dotenv import load_dotenv
import os
import praw
import matplotlib as plt
import re, pandas as pd
import numpy as np
import time
from datetime import datetime, timezone

In [4]:
# Load the variables defined in the .env file into the process environment,
# so the os.getenv(...) lookups in later cells can find the Reddit credentials

load_dotenv()

True

In [7]:
# Create the Reddit client. Every credential is looked up in the environment
# (filled from the .env file), so no secret is written in the notebook itself.
# A variable that is not set is passed through as None.

reddit = praw.Reddit(
    client_id=os.environ.get("REDDIT_CLIENT_ID"),
    client_secret=os.environ.get("REDDIT_CLIENT_SECRET"),
    username=os.environ.get("REDDIT_USERNAME"),
    password=os.environ.get("REDDIT_PASSWORD"),
    user_agent=os.environ.get("REDDIT_USER_AGENT"),
)

In [11]:
# Bind the target subreddit (r/lululemon); later cells read posts through `sub`

sub = reddit.subreddit("lululemon")

## Helper Functions

In [14]:
# Function to clean text

def clean_text(s: str | None) -> str | None:
    
    '''
    Clean string by substituting spaces for problematic characters
    '''
    
    if s is None:
        return None
    s = re.sub(r"\s+", " ", s).strip()
    return s

In [16]:
# Function to convert a Unix epoch timestamp into a pandas Timestamp

def dt_from_utc(ts: float) -> pd.Timestamp:

    '''
    Interpret ts as seconds since the Unix epoch and return the matching
    timezone-aware pd.Timestamp. The result stays in UTC; no conversion to
    local time is applied.
    '''

    as_timestamp = pd.to_datetime(ts, unit="s", utc=True)
    return as_timestamp

## Test Reddit Access

In [8]:
# Confirm the credentials work by asking Reddit which account the client is
# authenticated as, then print that account

me = reddit.user.me()
print("Authenticated as:", me)

Authenticated as: lulu_data_collector


In [24]:
# Smoke test: collect the 30 most recent posts into a DataFrame

rows = []
for post in sub.new(limit=30):   # 30 most-recent
    rows.append({
        "post_id": post.id,
        "timestamp": dt_from_utc(post.created_utc),
        # str(post.author) is stored; a falsy author is stored as None
        "author": str(post.author) if post.author else None,
        "title": clean_text(post.title),
        # an empty or missing selftext is stored as None in this test cell
        "text": clean_text(post.selftext) if getattr(post, "selftext", None) else None,
        "score": post.score,
        "num_comments": post.num_comments,
        "link_flair_text": getattr(post, "link_flair_text", None),
        "permalink": f"https://www.reddit.com{post.permalink}",
        "stickied": post.stickied,
        "locked": post.locked,
        "upvote_ratio": getattr(post, "upvote_ratio", None),
    })

# Build the frame once from the list of records, then show its shape and a preview
posts_df = pd.DataFrame(rows)
print(posts_df.shape)
posts_df.head(3)

(30, 12)


Unnamed: 0,post_id,timestamp,author,title,text,score,num_comments,link_flair_text,permalink,stickied,locked,upvote_ratio
0,1n72lwa,2025-09-03 02:09:03+00:00,CooperDoo422,Mystic 🔮☪️💜,I ordered the Mystic Aligns from WMTM last wee...,33,4,Fit Pics,https://www.reddit.com/r/lululemon/comments/1n...,False,False,0.95
1,1n70wil,2025-09-03 00:49:54+00:00,painthrowaway852,Autumn Rust combos,"Java, Black, Ivory, and Espresso - love the wa...",100,13,Fit Pics,https://www.reddit.com/r/lululemon/comments/1n...,False,False,0.96
2,1n70bqt,2025-09-03 00:23:19+00:00,PleaseNoCilantro,Purple or Green,Stuck with a tough choice between purple pacin...,2,8,Styling Advice,https://www.reddit.com/r/lululemon/comments/1n...,False,False,0.6


## Collect Post Data

In [32]:
# Define function to fetch

def _post_to_row(post):
    """Flatten one submission into a dict holding the columns kept for analysis."""
    return {
        "post_id": post.id,
        "timestamp": dt_from_utc(post.created_utc),
        "author": str(post.author) if post.author else None,
        "title": clean_text(post.title),
        "text": clean_text(getattr(post, "selftext", None)),
        "score": post.score,
        "num_comments": post.num_comments,
        "link_flair_text": getattr(post, "link_flair_text", None),
        "permalink": f"https://www.reddit.com{post.permalink}",
        "stickied": post.stickied,
        "locked": post.locked,
        "upvote_ratio": getattr(post, "upvote_ratio", None),
    }


def fetch_posts(subreddit_name=sub, limit=500):

    """
    Fetch the most recent posts from a subreddit and return them as a DataFrame.

    Parameters
    ----------
    subreddit_name : str or subreddit object, default ``sub``
        Either a subreddit name (looked up through the notebook's ``reddit``
        client) or an object exposing ``.new(limit=...)`` such as ``sub``.
    limit : int or None, default 500
        Maximum number of posts to request. ``None`` asks for as many posts as
        the listing will return.

    Returns
    -------
    pd.DataFrame
        One row per post with the columns post_id, timestamp, author, title,
        text, score, num_comments, link_flair_text, permalink, stickied,
        locked and upvote_ratio.

    Notes
    -----
    Progress and total runtime are printed as a side effect. Reddit listings
    are generally capped at roughly 1000 items, so fewer than ``limit`` rows
    may come back and the progress percentage may stop short of 100%.
    """

    start = time.perf_counter()

    # Iterate the subreddit that was actually requested rather than the global
    # `sub`, so that the argument is honoured.
    if isinstance(subreddit_name, str):
        target = reddit.subreddit(subreddit_name)
    else:
        target = subreddit_name

    # progress step: every 10% of a known limit, otherwise every 100 posts
    checkpoint = max(1, limit // 10) if limit is not None else 100

    rows = []
    for i, post in enumerate(target.new(limit=limit), start=1):
        rows.append(_post_to_row(post))

        if i % checkpoint == 0:
            if limit is None:
                # no total is known, so report the running count only
                print(f"... {i} posts scraped")
            else:
                pct = int(i / limit * 100)
                print(f"... {pct}% ({i}/{limit}) posts scraped")

    print(f"Done! Collected {len(rows)} posts from r/{subreddit_name}")

    runtime = (time.perf_counter() - start) / 60
    print(f"Total runtime was {runtime:.2f} minutes.")

    return pd.DataFrame(rows)

In [35]:
# Fetch posts
# fetch_posts already prints its own progress and total runtime, so it is not
# timed a second time here. Reddit listings are typically capped near 1000
# items, so expect fewer rows than the 2000 requested.
posts_df = fetch_posts(limit=2000)

... 10% (200/2000) posts scraped
... 20% (400/2000) posts scraped
... 30% (600/2000) posts scraped
... 40% (800/2000) posts scraped
Done! Collected 994 posts from r/lululemon
Total runtime was 0.30 minutes.
Total runtime was 0.30 minutes


In [37]:
# Spot check: print the cleaned body text stored in the first row
post = posts_df.loc[0, "text"]
print(post)

I ordered the Mystic Aligns from WMTM last week (was shocked to see they were $59!) They arrived Sunday and today it was cool enough to wear them! I paired them with my matching waist length Align Cami under my work shirt, but I also tried them with a couple other items for the sake of some photos too. 🤍HCULG Oversized Full-Zip Scuba Hoodie (xs/s) 🍇Chilled Grape Swiftly Tech Short-Sleeve Shirt Waist Length (6) ☪️Mystic Align Waist-Length Cami Tank Top A/B Cup (8) 🔮Mystic Align HR Pant 25" (8) 🩶Heathered Silver Drop Daily Stride Quarter Socks (M)


In [38]:
# Inspect column dtypes and non-null counts of the scraped posts
posts_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 994 entries, 0 to 993
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   post_id          994 non-null    object             
 1   timestamp        994 non-null    datetime64[ns, UTC]
 2   author           989 non-null    object             
 3   title            994 non-null    object             
 4   text             994 non-null    object             
 5   score            994 non-null    int64              
 6   num_comments     994 non-null    int64              
 7   link_flair_text  994 non-null    object             
 8   permalink        994 non-null    object             
 9   stickied         994 non-null    bool               
 10  locked           994 non-null    bool               
 11  upvote_ratio     994 non-null    float64            
dtypes: bool(2), datetime64[ns, UTC](1), float64(1), int64(2), object(6)
memory usa