# Text Scraping Lululemon's Subreddit

## Preliminaries

In [42]:
# Imports

from dotenv import load_dotenv
import os
import praw
from praw.models import MoreComments
import matplotlib as plt
import re, pandas as pd
import numpy as np
import time
from datetime import datetime, timezone, timedelta

In [43]:
# Set user's data path

PATH = f"C:/Users/emshe/Desktop/BRAINSTATION/LULULEMON/DATA"

In [44]:
# Reads .env file into current directory

load_dotenv()

True

In [45]:
# Extract reddit credentials from .env file

reddit = praw.Reddit(client_id=os.getenv("REDDIT_CLIENT_ID"),
    client_secret=os.getenv("REDDIT_CLIENT_SECRET"),
    username=os.getenv("REDDIT_USERNAME"),
    password=os.getenv("REDDIT_PASSWORD"),
    user_agent=os.getenv("REDDIT_USER_AGENT"),
)

In [46]:
# Fix subreddit as lululemon

sub = reddit.subreddit("lululemon")

## Helper functions 

In [47]:
# Function to clean text

def clean_text(s: str | None) -> str | None:
    
    '''
    Clean string by substituting spaces for problematic characters
    '''
    
    if s is None:
        return None
    s = re.sub(r"\s+", " ", s).strip()
    return s

In [48]:
# Function to get datetime from UTC timestamp

def dt_from_utc(ts: float) -> pd.Timestamp:

    '''
    Returns pd.datetime object (still in UTC)
    '''
    
    return pd.to_datetime(ts, unit="s", utc=True)

In [49]:
# Function to examine dataframes

def examine_df(name,df,
               include_stats = True,
               include_sample = True):
    
    """
    Check basic info about a dataframe df
    """
    
    print(f"\n\nNumber of records in the {name} is: {len(df)}\n")
    print(f"\nNumber of features in the {name} is: {len(df.columns)}\n")
    print(f"The columns in the {name} are: {df.columns}\n")
    print(f"\n Other info about {name}:\n")
    display(df.info())
    if include_stats == True:
        print(f'\n Basic statistical info about {name}:\n')
        display(df.describe())
    if include_sample == True:
        print(f"\n\nSample of records in the {name}:")
        display(df.head(5))

## Test Reddit Access

In [8]:
# Test fetching my own username

me = reddit.user.me()
print("Authenticated as:", me)

Authenticated as: lulu_data_collector


In [24]:
# Try collecting 30 most recent posts

rows = []
for i, post in enumerate(sub.new(limit=30)):   # 30 most-recent
    rows.append({
        "post_id": post.id,
        "timestamp": dt_from_utc(post.created_utc),
        "author": str(post.author) if post.author else None,
        "title": clean_text(post.title),
        "text": clean_text(post.selftext) if getattr(post, "selftext", None) else None,
        "score": post.score,
        "num_comments": post.num_comments,
        "link_flair_text": getattr(post, "link_flair_text", None),
        "permalink": f"https://www.reddit.com{post.permalink}",
        "stickied": post.stickied,
        "locked": post.locked,
        "upvote_ratio": getattr(post, "upvote_ratio", None),
    })

posts_df = pd.DataFrame(rows)
print(posts_df.shape)
posts_df.head(3)

(30, 12)


Unnamed: 0,post_id,timestamp,author,title,text,score,num_comments,link_flair_text,permalink,stickied,locked,upvote_ratio
0,1n72lwa,2025-09-03 02:09:03+00:00,CooperDoo422,Mystic 🔮☪️💜,I ordered the Mystic Aligns from WMTM last wee...,33,4,Fit Pics,https://www.reddit.com/r/lululemon/comments/1n...,False,False,0.95
1,1n70wil,2025-09-03 00:49:54+00:00,painthrowaway852,Autumn Rust combos,"Java, Black, Ivory, and Espresso - love the wa...",100,13,Fit Pics,https://www.reddit.com/r/lululemon/comments/1n...,False,False,0.96
2,1n70bqt,2025-09-03 00:23:19+00:00,PleaseNoCilantro,Purple or Green,Stuck with a tough choice between purple pacin...,2,8,Styling Advice,https://www.reddit.com/r/lululemon/comments/1n...,False,False,0.6


## Collect Post Data

### Functions

In [50]:
# Function to fetch posts from subreddit

def fetch_posts(subreddit_name = 'lululemon', 
                     limit:int = 500) -> pd.DataFrame:

    """
    Define function to fetch most recent posts from subreddit
    """

    start = time.time()
    
    rows = []
    
    # figure out checkpoint step size (10% increments)
    checkpoint = max(1, limit // 10)

    for i, post in enumerate(sub.new(limit=limit), start=1):
        rows.append({
            "post_id": post.id,
            "timestamp": dt_from_utc(post.created_utc),
            "author": str(post.author) if post.author else None,
            "title": clean_text(post.title),
            "text": clean_text(getattr(post, "selftext", None)),
            "score": post.score,
            "num_comments": post.num_comments,
            "link_flair_text": getattr(post, "link_flair_text", None),
            "permalink": f"https://www.reddit.com{post.permalink}",
            "stickied": post.stickied,
            "locked": post.locked,
            "upvote_ratio": getattr(post, "upvote_ratio", None),
        })

        # print progress every 10%
        if i % checkpoint == 0:
            pct = int(i / limit * 100)
            print(f"... {pct}% ({i}/{limit}) posts scraped")

    print(f"Done! Collected {len(rows)} posts from r/{subreddit_name}")

    end = time.time()
    runtime = (end - start)/60
    print(f"Total runtime was {runtime:.2f} minutes.")
    
    return pd.DataFrame(rows)

In [86]:
# Function to extract older posts from subreddit

def fetch_from_time_range(subreddit_name: str = "lululemon", 
                           limit: int = 1000,
                      time_filter: str = "all",  # "all", "year", "month", "week", "day"
            start_dt: datetime | None = None,
              end_dt: datetime | None = None,
                        verbose: bool = True) -> pd.DataFrame:
    
    '''
    Extract 1000 posts from given timeframe
    '''
    
    sub = reddit.subreddit(subreddit_name)
    rows = []

    for i, post in enumerate(sub.top(time_filter=time_filter, limit=limit), start=1):
        ts = dt_from_utc(post.created_utc)
        rows.append({
            "post_id": post.id,
            "timestamp": ts,
            "author": str(post.author) if post.author else None,
            "title": clean_text(post.title),
            "text": clean_text(getattr(post, "selftext", None)),
            "score": post.score,
            "num_comments": post.num_comments,
            "permalink": f"https://www.reddit.com{post.permalink}",
            "link_flair_text": getattr(post, "link_flair_text", None),
        })
        if verbose and i % 100 == 0:
            print(f"... {i} posts scraped")

    df = pd.DataFrame(rows)

    # Manual filter by datetime range
    if start_dt:
        df = df[df["timestamp"] >= start_dt]
    if end_dt:
        df = df[df["timestamp"] <= end_dt]

    df = df.sort_values("timestamp", ascending=False).reset_index(drop=True)
    if verbose:
        if not df.empty:
            min_ts = df["timestamp"].min().strftime("%Y-%m-%d")
            max_ts = df["timestamp"].max().strftime("%Y-%m-%d")
            print(f"Finished collecting {len(df)} posts from r/{subreddit_name} "
                  f"covering {min_ts} to {max_ts}")
        else:
            print(f"No posts found for the given filters.")

    return df

In [92]:
# Function to set up month spans

def month_spans(start_year: int = 2020, 
               start_month: int = 1) -> list[tuple[datetime, datetime]]:
    
    """
    Yield (start_dt, end_dt) UTC for each calendar month from start_month to current month.
    """
    
    spans = []
    now = datetime.now(timezone.utc)
    y, m = start_year, start_month
    while (y < now.year) or (y == now.year and m <= now.month):
        start_dt = datetime(y, m, 1, tzinfo=timezone.utc)
        
        # next month
        if m == 12:
            ny, nm = y + 1, 1
        else:
            ny, nm = y, m + 1
        next_month_start = datetime(ny, nm, 1, tzinfo=timezone.utc)
        end_dt = min(next_month_start - timedelta(seconds=1), now)  # don’t go beyond "now"
        spans.append((start_dt, end_dt))
        y, m = ny, nm
    return spans

In [96]:
# Function to scrape month by month

def fetch_top_by_months(
    subreddit_name: str = "lululemon",
    start_year: int = 2020,
    start_month: int = 1,
    per_month_limit: int = 1000,
    sleep_seconds: float = 1.0,
    verbose: bool = True
) -> pd.DataFrame:
    
    """
    Fetch top posts month by month and merge into a single dataframe.
    """
    
    spans = month_spans(start_year, start_month)
    all_month_dfs: list[pd.DataFrame] = []

    for i, (start_dt, end_dt) in enumerate(spans, start=1):
        df_month = fetch_from_time_range(
            subreddit_name=subreddit_name,
            limit=per_month_limit,
            time_filter="all",
            start_dt=start_dt,
            end_dt=end_dt,
            verbose=False
        )
        if verbose:
            print(f"[{i}/{len(spans)}] {start_dt.strftime('%Y-%m')} → {len(df_month)} posts")
        all_month_dfs.append(df_month)
        time.sleep(sleep_seconds)

    if all_month_dfs:
        combined = (
            pd.concat(all_month_dfs, ignore_index=True)
              .drop_duplicates(subset="post_id")
              .sort_values("timestamp", ascending=False)
              .reset_index(drop=True)
        )
    else:
        combined = pd.DataFrame()

    if verbose and not combined.empty:
        min_ts = combined["timestamp"].min().strftime("%Y-%m-%d")
        max_ts = combined["timestamp"].max().strftime("%Y-%m-%d")
        print(f"\nCombined: {combined.shape[0]} unique posts covering {min_ts} → {max_ts}")

    return combined

In [52]:
# Function to fetch all comments for a given post

def fetch_comments_single_post(post_id: str, max_comments: int | None = None) -> pd.DataFrame:
    
    """
    Fetch all comments for a single post.
    """
    
    submission = reddit.submission(id=post_id)

    submission.comments.replace_more(limit=None)  # expand all MoreComments

    rows = []
    count = 0
    for c in submission.comments.list():
        if isinstance(c, MoreComments):
            continue
        rows.append({
            "post_id": post_id,
            "comment_id": c.id,
            "timestamp": dt_from_utc(c.created_utc),
            "author": str(c.author) if c.author else None,
            "body": clean_text(c.body),
            "score": c.score,
            "is_submitter": getattr(c, "is_submitter", None),
            "parent_id": c.parent_id,   # t1_... (comment) or t3_... (post)
            "permalink": f"https://www.reddit.com{c.permalink}",
            "depth": c.depth,
        })
        count += 1
        if max_comments and count >= max_comments:
            break

    return pd.DataFrame(rows)

In [53]:
# Function to extract single dataframe of all comments

def fetch_comments_from_posts( post_ids: list,
                 max_comments_per_post: int | None = None ) -> pd.DataFrame:
    
    """
    Fetch comments for multiple Reddit posts, provided a list of post ids.
    """
    
    all_dfs = []
    total = len(post_ids)

    for idx, pid in enumerate(post_ids, start=1):
        df = fetch_comments_single_post(pid, max_comments=max_comments_per_post)
        all_dfs.append(df)
        print(f"[{idx}/{total}] Collected {len(df)} comments from post {pid}\n")

    # Concatenate all into a single dataframe
    combined = pd.concat(all_dfs, ignore_index=True) if all_dfs else pd.DataFrame()
    
    print(f"\nFinished collecting {combined.shape[0]} total comments across {total} posts.")
    return combined

### Collect new posts and comments

In [17]:
# Fetch posts

new_posts_df = fetch_posts(limit = 2000)

... 10% (200/2000) posts scraped
... 20% (400/2000) posts scraped
... 30% (600/2000) posts scraped
... 40% (800/2000) posts scraped
Done! Collected 995 posts from r/lululemon
Total runtime was 0.32 minutes.


In [18]:
# Print example post

post = new_posts_df['text'][0]
print(post)

I was shopping for a new skirt as a treat-yo-self and I saw the new Side-Pleat High-Rise Tennis Skirt. The red one priced at $98 but as it was a TYS purchase I decided to get it along with 3 other items. I got a notice that my item was shipped but awaiting carrier pick-up but that lasted so many days which is unusual for lulu. I contacted Lululemon to inquire and they said it was lost. They would refund me but seeing that the price has increased by then dollars for the skirt, I asked if they would just replace the order. The CSR said they need 100 items in order for a replacement to happen but it's available online. After some arm twisting, she put the order threw but told me that there was a good chance my order would get cancelled. I shop a lot at Lululemon and lately I feel like the customer service has been going way down. We pay a lot of money for the clothes and it just left a bad taste that I had to so much coaxing to get a reorder on a lost shipment because they use a bad carri

In [19]:
# Examine posts dataframe

examine_df('new posts dataframe', new_posts_df)



Number of records in the new posts dataframe is: 995


Number of features in the new posts dataframe is: 12

The columns in the new posts dataframe are: Index(['post_id', 'timestamp', 'author', 'title', 'text', 'score',
       'num_comments', 'link_flair_text', 'permalink', 'stickied', 'locked',
       'upvote_ratio'],
      dtype='object')


 Other info about new posts dataframe:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 995 entries, 0 to 994
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   post_id          995 non-null    object             
 1   timestamp        995 non-null    datetime64[ns, UTC]
 2   author           990 non-null    object             
 3   title            995 non-null    object             
 4   text             995 non-null    object             
 5   score            995 non-null    int64              
 6   num_comments     995 non-null    int64 

None


 Basic statistical info about new posts dataframe:



Unnamed: 0,score,num_comments,upvote_ratio
count,995.0,995.0,995.0
mean,70.708543,15.688442,0.853698
std,89.748091,22.499797,0.160594
min,0.0,0.0,0.13
25%,7.0,4.0,0.79
50%,40.0,9.0,0.92
75%,100.0,18.0,0.97
max,1008.0,243.0,1.0




Sample of records in the new posts dataframe:


Unnamed: 0,post_id,timestamp,author,title,text,score,num_comments,link_flair_text,permalink,stickied,locked,upvote_ratio
0,1n7q2s8,2025-09-03 20:31:13+00:00,berry_pink,Price change in a few days and bad CSR,I was shopping for a new skirt as a treat-yo-s...,2,0,Discussion,https://www.reddit.com/r/lululemon/comments/1n...,False,False,1.0
1,1n7po2x,2025-09-03 20:16:03+00:00,runandplay2,In Store Try On (Rockwood / New define Track t...,Sorry for the double post today but thought ot...,14,1,Fit Pics,https://www.reddit.com/r/lululemon/comments/1n...,False,False,1.0
2,1n7nh4p,2025-09-03 18:53:26+00:00,Puzzleheaded-Fan5586,Which bag for travel?,Looking for a safe crossbody bag for a Europea...,2,2,Product Question/Recommendation,https://www.reddit.com/r/lululemon/comments/1n...,False,False,0.75
3,1n7n9mb,2025-09-03 18:45:38+00:00,Ameeeekay,OOTD 🤍🖤,Fast and Free High-Rise Classic-Fit Split Shor...,29,1,Fit Pics,https://www.reddit.com/r/lululemon/comments/1n...,False,False,0.89
4,1n7n4e1,2025-09-03 18:40:03+00:00,Apprehensive_Place51,insulated os collared jacket sizing,could anyone give me sizing advice on this? i'...,1,4,Sizing Advice,https://www.reddit.com/r/lululemon/comments/1n...,False,False,0.66


### Collect Older Posts (Attempt)

In [88]:
# Test collecting posts from 2021

posts_df_2021 = fetch_from_time_range(
     start_dt = datetime(2021,1,1, tzinfo=timezone.utc),
       end_dt = datetime(2021,12,31,23,59, tzinfo=timezone.utc),
      verbose = True
)

examine_df('older posts dataframe', posts_df)

... 100 posts scraped
... 200 posts scraped
... 300 posts scraped
... 400 posts scraped
... 500 posts scraped
... 600 posts scraped
... 700 posts scraped
... 800 posts scraped
... 900 posts scraped
Finished collecting 212 posts from r/lululemon covering 2021-01-01 to 2021-12-26


Number of records in the older posts dataframe is: 999


Number of features in the older posts dataframe is: 9

The columns in the older posts dataframe are: Index(['post_id', 'timestamp', 'author', 'title', 'text', 'score',
       'num_comments', 'permalink', 'link_flair_text'],
      dtype='object')


 Other info about older posts dataframe:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   post_id          999 non-null    object             
 1   timestamp        999 non-null    datetime64[ns, UTC]
 2   author           919 non-null  

None


 Basic statistical info about older posts dataframe:



Unnamed: 0,score,num_comments
count,999.0,999.0
mean,796.542543,62.826827
std,1627.504503,60.98726
min,451.0,2.0
25%,510.0,31.0
50%,602.0,46.0
75%,782.5,74.0
max,49038.0,1002.0




Sample of records in the older posts dataframe:


Unnamed: 0,post_id,timestamp,author,title,text,score,num_comments,permalink,link_flair_text
0,g6dpgp,2020-04-23 01:23:37+00:00,MichelleT88,So these came today. I'm tranfeminine and have...,,484,82,https://www.reddit.com/r/lululemon/comments/g6...,Fit Pics
1,g7k04x,2020-04-25 00:27:06+00:00,crystalk_13,couldn’t relate more 😂,,461,19,https://www.reddit.com/r/lululemon/comments/g7...,Misc
2,giw8ut,2020-05-13 10:11:41+00:00,einaeb1,On Wednesdays we wear pink (and purple?),,482,37,https://www.reddit.com/r/lululemon/comments/gi...,Collection
3,gk10a9,2020-05-15 02:50:13+00:00,isappleworthit,25% Off Lululemon Link,**Expires May 21 (11:59 PST)** **[Here is am t...,948,1002,https://www.reddit.com/r/lululemon/comments/gk...,Discussion
4,go77dy,2020-05-21 22:51:17+00:00,minidontsurf,Daydream collection! Still trying to snag the ...,,476,43,https://www.reddit.com/r/lululemon/comments/go...,Collection


In [98]:
# Collect month by month from 2020 till the present

posts_df = fetch_top_by_months(start_year=2022, start_month=1)

examine_df('all posts dataframe', posts_df)

[1/45] 2022-01 → 8 posts
[2/45] 2022-02 → 11 posts
[3/45] 2022-03 → 9 posts
[4/45] 2022-04 → 11 posts
[5/45] 2022-05 → 11 posts
[6/45] 2022-06 → 12 posts
[7/45] 2022-07 → 11 posts
[8/45] 2022-08 → 6 posts
[9/45] 2022-09 → 13 posts
[10/45] 2022-10 → 17 posts
[11/45] 2022-11 → 10 posts
[12/45] 2022-12 → 12 posts
[13/45] 2023-01 → 14 posts
[14/45] 2023-02 → 6 posts
[15/45] 2023-03 → 9 posts
[16/45] 2023-04 → 14 posts
[17/45] 2023-05 → 9 posts
[18/45] 2023-06 → 10 posts
[19/45] 2023-07 → 5 posts
[20/45] 2023-08 → 1 posts
[21/45] 2023-09 → 5 posts
[22/45] 2023-10 → 7 posts
[23/45] 2023-11 → 11 posts
[24/45] 2023-12 → 9 posts
[25/45] 2024-01 → 13 posts
[26/45] 2024-02 → 7 posts
[27/45] 2024-03 → 11 posts
[28/45] 2024-04 → 10 posts
[29/45] 2024-05 → 10 posts
[30/45] 2024-06 → 8 posts
[31/45] 2024-07 → 14 posts
[32/45] 2024-08 → 22 posts
[33/45] 2024-09 → 40 posts
[34/45] 2024-10 → 65 posts
[35/45] 2024-11 → 73 posts
[36/45] 2024-12 → 11 posts
[37/45] 2025-01 → 13 posts
[38/45] 2025-02 → 15 po

None


 Basic statistical info about all posts dataframe:



Unnamed: 0,score,num_comments
count,603.0,603.0
mean,866.354892,69.308458
std,2072.285649,62.169373
min,450.0,2.0
25%,513.5,31.0
50%,615.0,47.0
75%,840.5,84.0
max,49047.0,438.0




Sample of records in the all posts dataframe:


Unnamed: 0,post_id,timestamp,author,title,text,score,num_comments,permalink,link_flair_text
0,1n4i5ke,2025-08-31 01:27:26+00:00,burpling,A girl in a group of teenagers (scary) told me...,I was wearing this admittedly funny mix with a...,728,53,https://www.reddit.com/r/lululemon/comments/1n...,Fit Pics
1,1mxsau6,2025-08-23 04:46:22+00:00,taztazz895,First post. Kinda nervous 😆,Fell in love with goodnight plum 💟,528,38,https://www.reddit.com/r/lululemon/comments/1m...,Fit Pics
2,1mlbcrx,2025-08-09 00:10:54+00:00,tinanguyenn,i can’t be the only one 🤪,finally did laundry after 2.5 weeks of workout...,622,113,https://www.reddit.com/r/lululemon/comments/1m...,Collection
3,1mjjvwr,2025-08-06 23:13:34+00:00,Alex_daisy13,What in Gilead is this thing???,"""Blessed be the fruit"" I guess.",1005,83,https://www.reddit.com/r/lululemon/comments/1m...,Discussion
4,1mcogbm,2025-07-29 21:41:25+00:00,xoghostbaby,fog green matching set :),i’m wearing swiftly tech cropped short sleeve ...,621,8,https://www.reddit.com/r/lululemon/comments/1m...,Fit Pics


In [99]:
# Merge with the new posts previously extracted

posts_df = (
    pd.concat([new_posts_df, posts_df], ignore_index=True)
      .drop_duplicates(subset="post_id")
      .sort_values("timestamp", ascending=False)
      .reset_index(drop=True)
)

examine_df('all posts', posts_df)



Number of records in the all posts is: 1593


Number of features in the all posts is: 12

The columns in the all posts are: Index(['post_id', 'timestamp', 'author', 'title', 'text', 'score',
       'num_comments', 'link_flair_text', 'permalink', 'stickied', 'locked',
       'upvote_ratio'],
      dtype='object')


 Other info about all posts:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1593 entries, 0 to 1592
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   post_id          1593 non-null   object             
 1   timestamp        1593 non-null   datetime64[ns, UTC]
 2   author           1530 non-null   object             
 3   title            1593 non-null   object             
 4   text             1593 non-null   object             
 5   score            1593 non-null   int64              
 6   num_comments     1593 non-null   int64              
 7   link_flair_text  1

None


 Basic statistical info about all posts:



Unnamed: 0,score,num_comments,upvote_ratio
count,1593.0,1593.0,995.0
mean,369.907721,35.849341,0.853698
std,1333.332917,49.496192,0.160594
min,0.0,0.0,0.13
25%,25.0,7.0,0.79
50%,120.0,18.0,0.92
75%,544.0,44.0,0.97
max,49047.0,438.0,1.0




Sample of records in the all posts:


Unnamed: 0,post_id,timestamp,author,title,text,score,num_comments,link_flair_text,permalink,stickied,locked,upvote_ratio
0,1n7q2s8,2025-09-03 20:31:13+00:00,berry_pink,Price change in a few days and bad CSR,I was shopping for a new skirt as a treat-yo-s...,2,0,Discussion,https://www.reddit.com/r/lululemon/comments/1n...,False,False,1.0
1,1n7po2x,2025-09-03 20:16:03+00:00,runandplay2,In Store Try On (Rockwood / New define Track t...,Sorry for the double post today but thought ot...,14,1,Fit Pics,https://www.reddit.com/r/lululemon/comments/1n...,False,False,1.0
2,1n7nh4p,2025-09-03 18:53:26+00:00,Puzzleheaded-Fan5586,Which bag for travel?,Looking for a safe crossbody bag for a Europea...,2,2,Product Question/Recommendation,https://www.reddit.com/r/lululemon/comments/1n...,False,False,0.75
3,1n7n9mb,2025-09-03 18:45:38+00:00,Ameeeekay,OOTD 🤍🖤,Fast and Free High-Rise Classic-Fit Split Shor...,29,1,Fit Pics,https://www.reddit.com/r/lululemon/comments/1n...,False,False,0.89
4,1n7n4e1,2025-09-03 18:40:03+00:00,Apprehensive_Place51,insulated os collared jacket sizing,could anyone give me sizing advice on this? i'...,1,4,Sizing Advice,https://www.reddit.com/r/lululemon/comments/1n...,False,False,0.66


### Extract comments

In [52]:
# Extract dataframe of comments for particular post

sample_comments_df = fetch_comments_for_post('1n70wil')

examine_df('sample comment dataframe', sample_comments_df)



Number of records in the sample comment dataframe is: 17


Number of features in the sample comment dataframe is: 10

The columns in the sample comment dataframe are: Index(['post_id', 'comment_id', 'timestamp', 'author', 'body', 'score',
       'is_submitter', 'parent_id', 'permalink', 'depth'],
      dtype='object')


 Other info about sample comment dataframe:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17 entries, 0 to 16
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype              
---  ------        --------------  -----              
 0   post_id       17 non-null     object             
 1   comment_id    17 non-null     object             
 2   timestamp     17 non-null     datetime64[ns, UTC]
 3   author        17 non-null     object             
 4   body          17 non-null     object             
 5   score         17 non-null     int64              
 6   is_submitter  17 non-null     bool               
 7   parent_id     17 non-null   

None


 Basic statistical info about sample comment dataframe:



Unnamed: 0,score,depth
count,17.0,17.0
mean,2.941176,0.411765
std,1.983387,0.5073
min,1.0,0.0
25%,1.0,0.0
50%,2.0,0.0
75%,4.0,1.0
max,8.0,1.0




Sample of records in the sample comment dataframe:


Unnamed: 0,post_id,comment_id,timestamp,author,body,score,is_submitter,parent_id,permalink,depth
0,1n70wil,nc42dzh,2025-09-03 00:49:54+00:00,AutoModerator,Hello! This is a comment to let you know that ...,1,False,t3_1n70wil,https://www.reddit.com/r/lululemon/comments/1n...,0
1,1n70wil,nc45beg,2025-09-03 01:06:58+00:00,Jimmy_Philly_B-more,"Oh wow, absolutely love how this pairs with Ja...",8,False,t3_1n70wil,https://www.reddit.com/r/lululemon/comments/1n...,0
2,1n70wil,nc43rfz,2025-09-03 00:57:56+00:00,painthrowaway852,other colors I'm curious to pair with: Sequoia...,5,True,t3_1n70wil,https://www.reddit.com/r/lululemon/comments/1n...,0
3,1n70wil,nc4nj9l,2025-09-03 02:56:45+00:00,4merly-chicken,It looks like a colour that will pop with true...,5,False,t3_1n70wil,https://www.reddit.com/r/lululemon/comments/1n...,0
4,1n70wil,nc48ifr,2025-09-03 01:25:43+00:00,SpideyWhiplash,Love your pairings!💯 Always keep me excited ab...,4,False,t3_1n70wil,https://www.reddit.com/r/lululemon/comments/1n...,0


In [100]:
# Get all comments from all posts

all_post_ids = list(set(posts_df['post_id']))

comments_df = fetch_comments_from_posts(all_post_ids)

[1/1593] Collected 7 comments from post 1mvej12

[2/1593] Collected 3 comments from post 1mom9ja

[3/1593] Collected 16 comments from post 1n5v2mk

[4/1593] Collected 6 comments from post 1ml6jh1

[5/1593] Collected 13 comments from post wji4ng

[6/1593] Collected 5 comments from post 1gx00sn

[7/1593] Collected 2 comments from post 1mufl2e

[8/1593] Collected 2 comments from post 1mhttl3

[9/1593] Collected 16 comments from post 1g0tr6r

[10/1593] Collected 11 comments from post 1ghwx4t

[11/1593] Collected 137 comments from post 1f0ej9i

[12/1593] Collected 6 comments from post 1n4cutv

[13/1593] Collected 273 comments from post 10ahte7

[14/1593] Collected 41 comments from post 10hbk75

[15/1593] Collected 87 comments from post 1n5xs8x

[16/1593] Collected 85 comments from post 1mvw5dh

[17/1593] Collected 29 comments from post 12g5ozl

[18/1593] Collected 6 comments from post 1msmgde

[19/1593] Collected 44 comments from post 1ljbmpt

[20/1593] Collected 90 comments from post uz9rb

KeyboardInterrupt: 

In [66]:
# Examine comments dataframe

examine_df('comments dataframe', comments_df)



Number of records in the comments dataframe is: 15545


Number of features in the comments dataframe is: 10

The columns in the comments dataframe are: Index(['post_id', 'comment_id', 'timestamp', 'author', 'body', 'score',
       'is_submitter', 'parent_id', 'permalink', 'depth'],
      dtype='object')


 Other info about comments dataframe:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15545 entries, 0 to 15544
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype              
---  ------        --------------  -----              
 0   post_id       15545 non-null  object             
 1   comment_id    15545 non-null  object             
 2   timestamp     15545 non-null  datetime64[ns, UTC]
 3   author        15362 non-null  object             
 4   body          15545 non-null  object             
 5   score         15545 non-null  int64              
 6   is_submitter  15545 non-null  bool               
 7   parent_id     15545 non-null  object       

None


 Basic statistical info about comments dataframe:



Unnamed: 0,score,depth
count,15545.0,15545.0
mean,4.361016,0.794918
std,13.56825,1.118902
min,-56.0,0.0
25%,1.0,0.0
50%,2.0,0.0
75%,3.0,1.0
max,904.0,9.0




Sample of records in the comments dataframe:


Unnamed: 0,post_id,comment_id,timestamp,author,body,score,is_submitter,parent_id,permalink,depth
0,1n4i5ke,nbl94po,2025-08-31 01:27:30+00:00,AutoModerator,Hello! This is a comment to let you know that ...,1,False,t3_1n4i5ke,https://www.reddit.com/r/lululemon/comments/1n...,0
1,1n4i5ke,nbl9wdc,2025-08-31 01:32:16+00:00,Humble-Bus3726,What kind of dog tho!,63,False,t3_1n4i5ke,https://www.reddit.com/r/lululemon/comments/1n...,0
2,1n4i5ke,nblb2cw,2025-08-31 01:39:23+00:00,thevffice,omg the korok!!! lovelovelove 🥹🥹🥹 where did yo...,47,False,t3_1n4i5ke,https://www.reddit.com/r/lululemon/comments/1n...,0
3,1n4i5ke,nblbqh1,2025-08-31 01:43:32+00:00,Careful_Koala7995,I had a young girl with her friends tell me sh...,47,False,t3_1n4i5ke,https://www.reddit.com/r/lululemon/comments/1n...,0
4,1n4i5ke,nblbpiw,2025-08-31 01:43:22+00:00,Pd_unicorn,Love your outfit too!,10,False,t3_1n4i5ke,https://www.reddit.com/r/lululemon/comments/1n...,0


In [101]:
# Save posts dataframe as a csv

posts_df.to_csv(f"{PATH}/posts.csv", index=False)

In [70]:
# Save comments dataframe as a csv

comments_df.to_csv(f"{PATH}/comments.csv", index=False)