# Text Scraping Lululemon's Subreddit

## Preliminaries

In [41]:
# Imports

from dotenv import load_dotenv
import os
import praw
from praw.models import MoreComments
import matplotlib as plt
import re, pandas as pd
import numpy as np
import time
from datetime import datetime, timezone

In [67]:
# Set user's data path

PATH = f"C:/Users/emshe/Desktop/BRAINSTATION/LULULEMON/DATA"

In [4]:
# Reads .env file into current directory

load_dotenv()

True

In [7]:
# Extract reddit credentials from .env file

reddit = praw.Reddit(client_id=os.getenv("REDDIT_CLIENT_ID"),
    client_secret=os.getenv("REDDIT_CLIENT_SECRET"),
    username=os.getenv("REDDIT_USERNAME"),
    password=os.getenv("REDDIT_PASSWORD"),
    user_agent=os.getenv("REDDIT_USER_AGENT"),
)

In [11]:
# Fix subreddit as lululemon

sub = reddit.subreddit("lululemon")

## Helper functions 

In [14]:
# Function to clean text

def clean_text(s: str | None) -> str | None:
    
    '''
    Clean string by substituting spaces for problematic characters
    '''
    
    if s is None:
        return None
    s = re.sub(r"\s+", " ", s).strip()
    return s

In [16]:
# Function to get datetime from UTC timestamp

def dt_from_utc(ts: float) -> pd.Timestamp:

    '''
    Returns pd.datetime object (still in UTC)
    '''
    
    return pd.to_datetime(ts, unit="s", utc=True)

In [46]:
# Function to examine dataframes

def examine_df(name,df,
               include_stats = True,
               include_sample = True):
    
    """
    Check basic info about a dataframe df
    """
    
    print(f"\n\nNumber of records in the {name} is: {len(df)}\n")
    print(f"\nNumber of features in the {name} is: {len(df.columns)}\n")
    print(f"The columns in the {name} are: {df.columns}\n")
    print(f"\n Other info about {name}:\n")
    display(df.info())
    if include_stats == True:
        print(f'\n Basic statistical info about {name}:\n')
        display(df.describe())
    if include_sample == True:
        print(f"\n\nSample of records in the {name}:")
        display(df.head(5))

## Test Reddit Access

In [8]:
# Test fetching my own username

me = reddit.user.me()
print("Authenticated as:", me)

Authenticated as: lulu_data_collector


In [24]:
# Try collecting 30 most recent posts

rows = []
for i, post in enumerate(sub.new(limit=30)):   # 30 most-recent
    rows.append({
        "post_id": post.id,
        "timestamp": dt_from_utc(post.created_utc),
        "author": str(post.author) if post.author else None,
        "title": clean_text(post.title),
        "text": clean_text(post.selftext) if getattr(post, "selftext", None) else None,
        "score": post.score,
        "num_comments": post.num_comments,
        "link_flair_text": getattr(post, "link_flair_text", None),
        "permalink": f"https://www.reddit.com{post.permalink}",
        "stickied": post.stickied,
        "locked": post.locked,
        "upvote_ratio": getattr(post, "upvote_ratio", None),
    })

posts_df = pd.DataFrame(rows)
print(posts_df.shape)
posts_df.head(3)

(30, 12)


Unnamed: 0,post_id,timestamp,author,title,text,score,num_comments,link_flair_text,permalink,stickied,locked,upvote_ratio
0,1n72lwa,2025-09-03 02:09:03+00:00,CooperDoo422,Mystic 🔮☪️💜,I ordered the Mystic Aligns from WMTM last wee...,33,4,Fit Pics,https://www.reddit.com/r/lululemon/comments/1n...,False,False,0.95
1,1n70wil,2025-09-03 00:49:54+00:00,painthrowaway852,Autumn Rust combos,"Java, Black, Ivory, and Espresso - love the wa...",100,13,Fit Pics,https://www.reddit.com/r/lululemon/comments/1n...,False,False,0.96
2,1n70bqt,2025-09-03 00:23:19+00:00,PleaseNoCilantro,Purple or Green,Stuck with a tough choice between purple pacin...,2,8,Styling Advice,https://www.reddit.com/r/lululemon/comments/1n...,False,False,0.6


## Collect Post Data

In [42]:
# Function to fetch posts from subreddit

def fetch_posts(subreddit_name=sub, limit:int =500) -> pd.DataFrame:

    """
    Define function to fetch most recent posts from subreddit
    """

    start = time.time()
    
    rows = []
    
    # figure out checkpoint step size (10% increments)
    checkpoint = max(1, limit // 10)

    for i, post in enumerate(sub.new(limit=limit), start=1):
        rows.append({
            "post_id": post.id,
            "timestamp": dt_from_utc(post.created_utc),
            "author": str(post.author) if post.author else None,
            "title": clean_text(post.title),
            "text": clean_text(getattr(post, "selftext", None)),
            "score": post.score,
            "num_comments": post.num_comments,
            "link_flair_text": getattr(post, "link_flair_text", None),
            "permalink": f"https://www.reddit.com{post.permalink}",
            "stickied": post.stickied,
            "locked": post.locked,
            "upvote_ratio": getattr(post, "upvote_ratio", None),
        })

        # print progress every 10%
        if i % checkpoint == 0:
            pct = int(i / limit * 100)
            print(f"... {pct}% ({i}/{limit}) posts scraped")

    print(f"Done! Collected {len(rows)} posts from r/{subreddit_name}")

    end = time.time()
    runtime = (end - start)/60
    print(f"Total runtime was {runtime:.2f} minutes.")
    
    return pd.DataFrame(rows)

In [62]:
# Function to fetch all comments for a given post

def fetch_comments_single_post(post_id: str, max_comments: int | None = None) -> pd.DataFrame:
    
    """
    Fetch all comments for a single post.
    """
    
    submission = reddit.submission(id=post_id)

    submission.comments.replace_more(limit=None)  # expand all MoreComments

    rows = []
    count = 0
    for c in submission.comments.list():
        if isinstance(c, MoreComments):
            continue
        rows.append({
            "post_id": post_id,
            "comment_id": c.id,
            "timestamp": dt_from_utc(c.created_utc),
            "author": str(c.author) if c.author else None,
            "body": clean_text(c.body),
            "score": c.score,
            "is_submitter": getattr(c, "is_submitter", None),
            "parent_id": c.parent_id,   # t1_... (comment) or t3_... (post)
            "permalink": f"https://www.reddit.com{c.permalink}",
            "depth": c.depth,
        })
        count += 1
        if max_comments and count >= max_comments:
            break

    return pd.DataFrame(rows)

In [64]:
# Function to extract single dataframe of all comments

def fetch_comments_from_posts( post_ids: list,
                 max_comments_per_post: int | None = None ) -> pd.DataFrame:
    
    """
    Fetch comments for multiple Reddit posts, provided a list of post ids.
    """
    
    all_dfs = []
    total = len(post_ids)

    for idx, pid in enumerate(post_ids, start=1):
        df = fetch_comments_single_post(pid, max_comments=max_comments_per_post)
        all_dfs.append(df)
        print(f"[{idx}/{total}] Collected {len(df)} comments from post {pid}\n")

    # Concatenate all into a single dataframe
    combined = pd.concat(all_dfs, ignore_index=True) if all_dfs else pd.DataFrame()
    
    print(f"\nFinished collecting {combined.shape[0]} total comments across {total} posts.")
    return combined

In [44]:
# Fetch posts

posts_df = fetch_posts(limit = 2000)

... 10% (200/2000) posts scraped
... 20% (400/2000) posts scraped
... 30% (600/2000) posts scraped
... 40% (800/2000) posts scraped
Done! Collected 993 posts from r/lululemon
Total runtime was 0.30 minutes.


In [39]:
# Print example post

post = posts_df['text'][0]
print(post)

I ordered the Mystic Aligns from WMTM last week (was shocked to see they were $59!) They arrived Sunday and today it was cool enough to wear them! I paired them with my matching waist length Align Cami under my work shirt, but I also tried them with a couple other items for the sake of some photos too. 🤍HCULG Oversized Full-Zip Scuba Hoodie (xs/s) 🍇Chilled Grape Swiftly Tech Short-Sleeve Shirt Waist Length (6) ☪️Mystic Align Waist-Length Cami Tank Top A/B Cup (8) 🔮Mystic Align HR Pant 25" (8) 🩶Heathered Silver Drop Daily Stride Quarter Socks (M)


In [47]:
# Examine posts dataframe

examine_df('posts dataframe', posts_df)



Number of records in the posts dataframe is: 993


Number of features in the posts dataframe is: 12

The columns in the posts dataframe are: Index(['post_id', 'timestamp', 'author', 'title', 'text', 'score',
       'num_comments', 'link_flair_text', 'permalink', 'stickied', 'locked',
       'upvote_ratio'],
      dtype='object')


 Other info about posts dataframe:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 993 entries, 0 to 992
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   post_id          993 non-null    object             
 1   timestamp        993 non-null    datetime64[ns, UTC]
 2   author           988 non-null    object             
 3   title            993 non-null    object             
 4   text             993 non-null    object             
 5   score            993 non-null    int64              
 6   num_comments     993 non-null    int64              
 7

None


 Basic statistical info about posts dataframe:



Unnamed: 0,score,num_comments,upvote_ratio
count,993.0,993.0,993.0
mean,70.659617,15.683787,0.854542
std,89.459381,22.484823,0.159645
min,0.0,0.0,0.14
25%,8.0,4.0,0.79
50%,41.0,9.0,0.92
75%,100.0,18.0,0.97
max,1005.0,243.0,1.0




Sample of records in the posts dataframe:


Unnamed: 0,post_id,timestamp,author,title,text,score,num_comments,link_flair_text,permalink,stickied,locked,upvote_ratio
0,1n72lwa,2025-09-03 02:09:03+00:00,CooperDoo422,Mystic 🔮☪️💜,I ordered the Mystic Aligns from WMTM last wee...,50,5,Fit Pics,https://www.reddit.com/r/lululemon/comments/1n...,False,False,0.96
1,1n70wil,2025-09-03 00:49:54+00:00,painthrowaway852,Autumn Rust combos,"Java, Black, Ivory, and Espresso - love the wa...",140,15,Fit Pics,https://www.reddit.com/r/lululemon/comments/1n...,False,False,0.97
2,1n70bqt,2025-09-03 00:23:19+00:00,PleaseNoCilantro,Purple or Green,Stuck with a tough choice between purple pacin...,4,8,Styling Advice,https://www.reddit.com/r/lululemon/comments/1n...,False,False,0.67
3,1n701dx,2025-09-03 00:10:19+00:00,lameasfuq,Wunder Train Contour Fit... New Fabric?Less co...,I caved for the rich Sequoia color and bought ...,2,2,Product Question/Recommendation,https://www.reddit.com/r/lululemon/comments/1n...,False,False,0.67
4,1n6zqss,2025-09-02 23:56:56+00:00,Scrappy52,Chicago/Rosement outlet,Has anyone been to the chicago lulu outlet rec...,1,0,Discussion,https://www.reddit.com/r/lululemon/comments/1n...,False,False,0.67


In [52]:
# Extract dataframe of comments for particular post

sample_comments_df = fetch_comments_for_post('1n70wil')

examine_df('sample comment dataframe', sample_comments_df)



Number of records in the sample comment dataframe is: 17


Number of features in the sample comment dataframe is: 10

The columns in the sample comment dataframe are: Index(['post_id', 'comment_id', 'timestamp', 'author', 'body', 'score',
       'is_submitter', 'parent_id', 'permalink', 'depth'],
      dtype='object')


 Other info about sample comment dataframe:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17 entries, 0 to 16
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype              
---  ------        --------------  -----              
 0   post_id       17 non-null     object             
 1   comment_id    17 non-null     object             
 2   timestamp     17 non-null     datetime64[ns, UTC]
 3   author        17 non-null     object             
 4   body          17 non-null     object             
 5   score         17 non-null     int64              
 6   is_submitter  17 non-null     bool               
 7   parent_id     17 non-null   

None


 Basic statistical info about sample comment dataframe:



Unnamed: 0,score,depth
count,17.0,17.0
mean,2.941176,0.411765
std,1.983387,0.5073
min,1.0,0.0
25%,1.0,0.0
50%,2.0,0.0
75%,4.0,1.0
max,8.0,1.0




Sample of records in the sample comment dataframe:


Unnamed: 0,post_id,comment_id,timestamp,author,body,score,is_submitter,parent_id,permalink,depth
0,1n70wil,nc42dzh,2025-09-03 00:49:54+00:00,AutoModerator,Hello! This is a comment to let you know that ...,1,False,t3_1n70wil,https://www.reddit.com/r/lululemon/comments/1n...,0
1,1n70wil,nc45beg,2025-09-03 01:06:58+00:00,Jimmy_Philly_B-more,"Oh wow, absolutely love how this pairs with Ja...",8,False,t3_1n70wil,https://www.reddit.com/r/lululemon/comments/1n...,0
2,1n70wil,nc43rfz,2025-09-03 00:57:56+00:00,painthrowaway852,other colors I'm curious to pair with: Sequoia...,5,True,t3_1n70wil,https://www.reddit.com/r/lululemon/comments/1n...,0
3,1n70wil,nc4nj9l,2025-09-03 02:56:45+00:00,4merly-chicken,It looks like a colour that will pop with true...,5,False,t3_1n70wil,https://www.reddit.com/r/lululemon/comments/1n...,0
4,1n70wil,nc48ifr,2025-09-03 01:25:43+00:00,SpideyWhiplash,Love your pairings!💯 Always keep me excited ab...,4,False,t3_1n70wil,https://www.reddit.com/r/lululemon/comments/1n...,0


In [65]:
# Get all comments from all posts

all_post_ids = list(set(posts_df['post_id']))

comments_df = fetch_comments_from_posts(all_post_ids)

[1/993] Collected 53 comments from post 1n4i5ke

[2/993] Collected 15 comments from post 1mw4fhh

[3/993] Collected 6 comments from post 1mfmhqa

[4/993] Collected 4 comments from post 1ma96jj

[5/993] Collected 22 comments from post 1mut4ff

[6/993] Collected 12 comments from post 1mmo081

[7/993] Collected 6 comments from post 1n0zoya

[8/993] Collected 10 comments from post 1mk91sf

[9/993] Collected 25 comments from post 1mj2c5r

[10/993] Collected 24 comments from post 1mdbfpl

[11/993] Collected 7 comments from post 1mpfwrx

[12/993] Collected 82 comments from post 1mjjvwr

[13/993] Collected 3 comments from post 1maorf9

[14/993] Collected 4 comments from post 1n39f9b

[15/993] Collected 10 comments from post 1miav6u

[16/993] Collected 20 comments from post 1ms5e9e

[17/993] Collected 46 comments from post 1moeqvy

[18/993] Collected 21 comments from post 1mogfkp

[19/993] Collected 6 comments from post 1myby39

[20/993] Collected 13 comments from post 1mo4xa4

[21/993] Collect

In [66]:
# Examine comments dataframe

examine_df('comments dataframe', comments_df)



Number of records in the comments dataframe is: 15545


Number of features in the comments dataframe is: 10

The columns in the comments dataframe are: Index(['post_id', 'comment_id', 'timestamp', 'author', 'body', 'score',
       'is_submitter', 'parent_id', 'permalink', 'depth'],
      dtype='object')


 Other info about comments dataframe:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15545 entries, 0 to 15544
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype              
---  ------        --------------  -----              
 0   post_id       15545 non-null  object             
 1   comment_id    15545 non-null  object             
 2   timestamp     15545 non-null  datetime64[ns, UTC]
 3   author        15362 non-null  object             
 4   body          15545 non-null  object             
 5   score         15545 non-null  int64              
 6   is_submitter  15545 non-null  bool               
 7   parent_id     15545 non-null  object       

None


 Basic statistical info about comments dataframe:



Unnamed: 0,score,depth
count,15545.0,15545.0
mean,4.361016,0.794918
std,13.56825,1.118902
min,-56.0,0.0
25%,1.0,0.0
50%,2.0,0.0
75%,3.0,1.0
max,904.0,9.0




Sample of records in the comments dataframe:


Unnamed: 0,post_id,comment_id,timestamp,author,body,score,is_submitter,parent_id,permalink,depth
0,1n4i5ke,nbl94po,2025-08-31 01:27:30+00:00,AutoModerator,Hello! This is a comment to let you know that ...,1,False,t3_1n4i5ke,https://www.reddit.com/r/lululemon/comments/1n...,0
1,1n4i5ke,nbl9wdc,2025-08-31 01:32:16+00:00,Humble-Bus3726,What kind of dog tho!,63,False,t3_1n4i5ke,https://www.reddit.com/r/lululemon/comments/1n...,0
2,1n4i5ke,nblb2cw,2025-08-31 01:39:23+00:00,thevffice,omg the korok!!! lovelovelove 🥹🥹🥹 where did yo...,47,False,t3_1n4i5ke,https://www.reddit.com/r/lululemon/comments/1n...,0
3,1n4i5ke,nblbqh1,2025-08-31 01:43:32+00:00,Careful_Koala7995,I had a young girl with her friends tell me sh...,47,False,t3_1n4i5ke,https://www.reddit.com/r/lululemon/comments/1n...,0
4,1n4i5ke,nblbpiw,2025-08-31 01:43:22+00:00,Pd_unicorn,Love your outfit too!,10,False,t3_1n4i5ke,https://www.reddit.com/r/lululemon/comments/1n...,0


In [68]:
# Save posts dataframe as a csv

posts_df.to_csv(f"{PATH}/posts.csv", index=False)

In [70]:
# Save comments dataframe as a csv

comments_df.to_csv(f"{PATH}/comments.csv", index=False)