In [1]:
import os
import re
import time
import math
import pandas as pd
import praw
from typing import List, Tuple, Dict

In [3]:
# -----------------------------
# AUTH (use environment vars)
# -----------------------------
def reddit_client():
    """Build a PRAW ``Reddit`` client from environment variables.

    Secrets must never live in the notebook itself. Any credential that was
    ever saved in a notebook cell should be regenerated in Reddit's app
    settings before this notebook is shared or committed.

    Environment variables read:
        REDDIT_CLIENT_ID      (required) OAuth client id of the script app.
        REDDIT_CLIENT_SECRET  (required) OAuth client secret of the script app.
        REDDIT_USERNAME       (optional) account name; defaults to "the_ronnows".
        REDDIT_PASSWORD       (optional) account password. When unset, ``None``
                              is passed through -- PRAW is then expected to run
                              in read-only mode (confirm against the PRAW docs).

    Returns:
        praw.Reddit: the configured client.

    Raises:
        RuntimeError: if a required environment variable is missing or empty.
    """
    required = ("REDDIT_CLIENT_ID", "REDDIT_CLIENT_SECRET")
    missing = [name for name in required if not os.environ.get(name)]
    if missing:
        # Fail before touching the network so the cause is obvious.
        raise RuntimeError(
            "Missing Reddit credentials; set environment variable(s): "
            + ", ".join(missing)
        )

    return praw.Reddit(
        client_id=os.environ["REDDIT_CLIENT_ID"],
        client_secret=os.environ["REDDIT_CLIENT_SECRET"],
        user_agent="LETF Scraper for MSc Thesis by u/the_ronnows",  # A descriptive user_agent is required
        username=os.environ.get("REDDIT_USERNAME", "the_ronnows"),
        password=os.environ.get("REDDIT_PASSWORD"),
    )

reddit = reddit_client()

# Without a password the client is not logged in as any user; the previous
# unconditional message then printed the misleading "Authenticated as: u/None".
# Check the read-only flag first so the status line reflects what happened.
current_user = None if reddit.read_only else reddit.user.me()
if current_user is None:
    print("Connected in read-only mode (no authenticated user).")
else:
    print(f"Authenticated as: u/{current_user}")

Authenticated as: u/None


In [4]:
def _bounded(*terms: str) -> List[str]:
    """Wrap each regex fragment in word-boundary anchors, keeping the given order."""
    return [rf"\b{term}\b" for term in terms]


# Inclusion patterns per comparison cohort (thesis design). Each entry is a
# regex fragment: tickers first, then free-text phrases.
# NOTE(review): compile_any() applies re.IGNORECASE, so short tickers such as
# SMH, SSO, VT or ISA also match their lowercase forms ("smh") -- confirm this
# is acceptable for the cohort definitions.
CATEGORY_KEYWORDS: Dict[str, List[str]] = {
    "LETF": _bounded(
        "UPRO", "TQQQ", "SSO", "QLD",
        "TMF", "DBPG", "3EUL", "3QQQ",
        "leveraged etf", "letf", "volatility decay",
        "leverage for the long run", "HFEA",
        "2x SMA", "3x SMA", "9sig", "200d SMA",
        "leverage rotation", "daily leveraged",
    ),
    "Tech": _bounded(
        "QQQ", "XLK", "VGT",
        "tech etf", "technology etf", "nasdaq 100",
    ),
    "Semiconductors": _bounded(
        "SOXX", "SMH", "XSD",
        "semiconductor etf", "chip etf",
    ),
    "Commodities": _bounded(
        "GLD", "IAU", "DBC",
        "commodity etf", "broad commodity etf", "gold etf",
    ),
    "Broad": _bounded(
        "SPY", "VOO", "VT", "ACWI",
        "world etf", "world market etf", "market etf",
    ),
}

# Signals of long-horizon discussion; the scrape call can switch this
# requirement off.
LONG_TERM_ANCHORS = _bounded(
    "long[- ]term", "buy and hold", "DCA", "dollar[- ]cost",
    "retirement", "core holding", "for decades", "ISA", "401k",
)

# Applied to every cohort: filters out day-trading noise and options chatter.
GLOBAL_EXCLUSIONS = _bounded(
    "day[- ]trade", "intraday", "0DTE", "weekly options",
    "options", "YOLO", "swing trade", "price target",
    "scalp", "short squeeze", "lottery",
)

# Applied to the non-LETF cohorts so explicit leveraged-ETF talk does not
# leak into the comparison groups.
LETF_EXCLUSIONS = _bounded(
    "TQQQ", "UPRO", "SSO", "QLD", "SOXL", "TECL",
    "SPXL", "3x", "leveraged etf", "LETF", "volatility decay",
)

# Extra exclusions used only for the Commodities cohort.
COMMODITY_EXCLUSIONS = _bounded(
    "UGAZ", "DGAZ", "BOIL", "KOLD",
    "UCO", "SCO", "bitcoin", "crypto",
)

In [5]:
def compile_any(patterns: List[str]) -> re.Pattern:
    """Merge regex alternatives into a single compiled pattern.

    Args:
        patterns: regex source strings; they are joined with ``|``.

    Returns:
        A case-insensitive pattern matching any of the alternatives. For an
        empty (or otherwise falsy) ``patterns`` a pattern that can never match
        is returned instead.
    """
    if patterns:
        alternation = "|".join(patterns)
        return re.compile(alternation, re.IGNORECASE)
    # "a^" demands an 'a' immediately followed by start-of-string: impossible.
    return re.compile(r"a^")

def matches_any(text: str, patterns_re: re.Pattern) -> bool:
    """Report whether ``patterns_re`` matches anywhere in ``text``.

    A falsy ``text`` (``None`` or the empty string) is searched as ``""``.
    """
    haystack = text if text else ""
    return patterns_re.search(haystack) is not None

def build_filters_for_category(category: str) -> Tuple[re.Pattern, re.Pattern, re.Pattern]:
    """Compile the include / anchor / exclude filters for one asset category.

    Args:
        category: a key of ``CATEGORY_KEYWORDS``.

    Returns:
        ``(include, anchors, exclude)`` compiled patterns, where

        * ``include`` matches the category's keywords,
        * ``anchors`` matches the long-term anchor phrases,
        * ``exclude`` matches ``GLOBAL_EXCLUSIONS``, plus
          ``COMMODITY_EXCLUSIONS`` for "Commodities", plus ``LETF_EXCLUSIONS``
          for every category other than "LETF".

    Raises:
        KeyError: if ``category`` is not a key of ``CATEGORY_KEYWORDS``.
    """
    if category not in CATEGORY_KEYWORDS:
        raise KeyError(
            f"Unknown asset category {category!r}; expected one of {sorted(CATEGORY_KEYWORDS)}"
        )

    # Build the exclusion list once; copy so the module-level list is not mutated.
    excl_patterns = list(GLOBAL_EXCLUSIONS)
    if category == "Commodities":
        excl_patterns += COMMODITY_EXCLUSIONS
    if category != "LETF":
        excl_patterns += LETF_EXCLUSIONS

    inc = compile_any(CATEGORY_KEYWORDS[category])
    anchors = compile_any(LONG_TERM_ANCHORS)
    excl = compile_any(excl_patterns)
    return inc, anchors, excl

In [6]:
def _build_search_query(asset_category: str) -> str:
    """Turn the category's single-word keyword patterns into a short OR query."""
    # Remove the "\b" regex anchors as whole tokens. (str.strip(r"\b") would
    # strip the *characters* "\" and "b" and could eat letters of a keyword.)
    query_terms = [
        p.replace("\\b", "")
        for p in CATEGORY_KEYWORDS[asset_category]
        if " " not in p
    ]
    # Only the first eight terms are used to keep the query short.
    return " OR ".join(sorted(set(query_terms[:8]))) or asset_category


def _safe_search(subreddit, subreddit_name, query, time_filter, limit):
    """Yield search results; report (rather than raise) a failing listing."""
    # The search listing is consumed lazily, so request errors surface while
    # iterating. Guarding the iteration itself is what actually catches them.
    try:
        yield from subreddit.search(query, sort="new", time_filter=time_filter, limit=limit)
    except Exception as e:
        print(f"Search failed for r/{subreddit_name}: {e}")


def scrape_category(
    reddit: praw.Reddit,
    asset_category: str,
    subreddits: List[str],
    submission_search_limit: int = 500,
    per_thread_comment_cap: int = 150,
    min_comment_score: int = 0,
    time_filter: str = "all",        # 'all' | 'year' | 'month' | 'week'
    require_long_term_anchor: bool = True,
    output_dir: str = "../data/raw/reddit_data",
) -> pd.DataFrame:
    """Collect comments about one asset category from several subreddits.

    For each subreddit a keyword search is run; every comment of each hit is
    kept when all of the following hold:

    * the comment or its post (title + selftext) matches the category keywords,
    * if ``require_long_term_anchor``: the comment or its post matches a
      long-term anchor phrase,
    * neither the comment nor its post matches an exclusion pattern,
    * the comment score is not below ``min_comment_score``.

    Args:
        reddit: PRAW client used for all requests.
        asset_category: key of ``CATEGORY_KEYWORDS``.
        subreddits: subreddit names (without the ``r/`` prefix).
        submission_search_limit: ``limit`` passed to each subreddit search.
        per_thread_comment_cap: maximum number of kept comments per submission.
        min_comment_score: comments scoring lower than this are dropped.
        time_filter: ``time_filter`` passed to each subreddit search.
        require_long_term_anchor: enable the long-term anchor requirement.
        output_dir: directory (created if needed) for the per-category CSV.

    Returns:
        DataFrame with one row per kept comment (empty if nothing matched).
        When non-empty it is also written to
        ``<output_dir>/<category>_<subreddits joined by '-'>.csv`` (lower-cased).
    """
    os.makedirs(output_dir, exist_ok=True)

    include_re, anchors_re, exclude_re = build_filters_for_category(asset_category)
    query = _build_search_query(asset_category)  # identical for every subreddit

    rows = []
    seen_submissions = set()

    for sr in subreddits:
        print(f"\n=== Scraping r/{sr} for category '{asset_category}' ===")
        sub = reddit.subreddit(sr)

        for submission in _safe_search(sub, sr, query, time_filter, submission_search_limit):
            if submission.id in seen_submissions:
                continue
            seen_submissions.add(submission.id)

            post_text = f"{submission.title}\n{submission.selftext or ''}"

            # An excluded post disqualifies every one of its comments, so skip
            # the thread before paying for the full comment-tree expansion.
            if matches_any(post_text, exclude_re):
                continue

            # Post-level verdicts are the same for all comments of the thread.
            post_included = matches_any(post_text, include_re)
            post_anchored = matches_any(post_text, anchors_re)

            try:
                submission.comments.replace_more(limit=None)
            except Exception as e:
                # removed/rate-limit issue: skip this thread, but say so
                print(f"Skipping thread {submission.id} in r/{sr}: {e}")
                continue

            kept = 0
            for c in submission.comments.list():
                if kept >= per_thread_comment_cap:
                    break

                text = getattr(c, "body", None)
                if text in ("[deleted]", "[removed]", None):
                    continue

                # Inclusion: category mention in comment OR post
                if not (post_included or matches_any(text, include_re)):
                    continue

                # Optional anchor requirement to bias toward long-horizon discourse
                if require_long_term_anchor and not (post_anchored or matches_any(text, anchors_re)):
                    continue

                # Exclusions (the post itself was already checked above)
                if matches_any(text, exclude_re):
                    continue

                score = getattr(c, "score", None)
                if score is not None and score < min_comment_score:
                    continue

                rows.append({
                    "asset_category": asset_category,
                    "subreddit": sr,
                    "submission_id": submission.id,
                    "submission_title": submission.title,
                    "comment_id": c.id,
                    "body": text,
                    "created_utc": getattr(c, "created_utc", None),
                    "score": score,
                    "permalink": f"https://www.reddit.com{getattr(c, 'permalink', '')}",
                })
                kept += 1

            time.sleep(0.1)  # rate-friendliness

    df = pd.DataFrame(rows)
    if not df.empty:
        df = df.drop_duplicates(subset=["subreddit", "submission_id", "comment_id"])
        outpath = os.path.join(
            output_dir, f"{asset_category.lower()}_{'-'.join([s.lower() for s in subreddits])}.csv"
        )
        df.to_csv(outpath, index=False)
        print(f"\nSaved {len(df):,} comments to {outpath}")
    else:
        print("\nNo comments matched filters.")
    return df

In [8]:
# Configure cohorts and run
# r/LETFs is not part of this list; append "LETFs" to add in-group discussion.
SUBREDDITS = ["Investing", "ETFs", "Bogleheads"]
CATEGORIES = ["LETF", "Tech", "Semiconductors", "Commodities", "Broad"]

# Collection knobs
OUTPUT_DIR = "../data/raw/reddit_data"  # single source of truth for all CSV output
SUBMISSION_SEARCH_LIMIT = 400
PER_THREAD_COMMENT_CAP = 100
MIN_COMMENT_SCORE = 0
TIME_FILTER = "all"                   # widest window; narrow with 'year' | 'month' | 'week'
REQUIRE_LONG_TERM_ANCHOR = True       # set False to broaden collection

# Column layout of the combined frame when no category returned any rows.
RESULT_COLUMNS = [
    "asset_category", "subreddit", "submission_id", "submission_title",
    "comment_id", "body", "created_utc", "score", "permalink",
]

dfs = []
for cat in CATEGORIES:
    df_cat = scrape_category(
        reddit=reddit,
        asset_category=cat,
        subreddits=SUBREDDITS,
        submission_search_limit=SUBMISSION_SEARCH_LIMIT,
        per_thread_comment_cap=PER_THREAD_COMMENT_CAP,
        min_comment_score=MIN_COMMENT_SCORE,
        time_filter=TIME_FILTER,
        require_long_term_anchor=REQUIRE_LONG_TERM_ANCHOR,
        output_dir=OUTPUT_DIR,
    )
    if not df_cat.empty:
        dfs.append(df_cat)

df_all = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame(columns=RESULT_COLUMNS)

print(f"\nTotal collected across categories: {len(df_all):,}")

# Master dump to support downstream NLP/regression phases; the file name
# records whether the long-term anchor filter was active.
master_name = "reddit_master_longterm_anchored.csv" if REQUIRE_LONG_TERM_ANCHOR else "reddit_master.csv"
master_out = os.path.join(OUTPUT_DIR, master_name)
df_all.to_csv(master_out, index=False)
print(f"Saved combined dataset to: {master_out}")

# Preview
df_all.head(10)


=== Scraping r/Investing for category 'LETF' ===

=== Scraping r/ETFs for category 'LETF' ===

=== Scraping r/Bogleheads for category 'LETF' ===

Saved 5,121 comments to ../data/raw/reddit_data/letf_investing-etfs-bogleheads.csv

=== Scraping r/Investing for category 'Tech' ===

=== Scraping r/ETFs for category 'Tech' ===

=== Scraping r/Bogleheads for category 'Tech' ===

Saved 2,218 comments to ../data/raw/reddit_data/tech_investing-etfs-bogleheads.csv

=== Scraping r/Investing for category 'Semiconductors' ===

=== Scraping r/ETFs for category 'Semiconductors' ===

=== Scraping r/Bogleheads for category 'Semiconductors' ===

Saved 2,952 comments to ../data/raw/reddit_data/semiconductors_investing-etfs-bogleheads.csv

=== Scraping r/Investing for category 'Commodities' ===

=== Scraping r/ETFs for category 'Commodities' ===

=== Scraping r/Bogleheads for category 'Commodities' ===

Saved 2,116 comments to ../data/raw/reddit_data/commodities_investing-etfs-bogleheads.csv

=== Scrapin

Unnamed: 0,asset_category,subreddit,submission_id,submission_title,comment_id,body,created_utc,score,permalink
0,LETF,Investing,1o4yopa,Does anyone hedge their portfolios with invers...,nj5xftk,Daily rebalanced leveraged inverse ETFs have a...,1760302000.0,4,https://www.reddit.com/r/investing/comments/1o...
1,LETF,Investing,1o4yopa,Does anyone hedge their portfolios with invers...,nj9gr5i,inverse ETFs are intended for short-term tradi...,1760359000.0,2,https://www.reddit.com/r/investing/comments/1o...
2,LETF,Investing,1o4yopa,Does anyone hedge their portfolios with invers...,nlvxnzm,You're buying an asset with negative expected ...,1761682000.0,1,https://www.reddit.com/r/investing/comments/1o...
3,LETF,Investing,1nui8ye,"10-Year Hold Plan with NBIS, DPRO, and HGRAF –...",nh1svaj,"Added nbis to my forever hold, now ballooned t...",1759256000.0,2,https://www.reddit.com/r/investing/comments/1n...
4,LETF,Investing,1nui8ye,"10-Year Hold Plan with NBIS, DPRO, and HGRAF –...",nh422vy,Your submission has been automatically removed...,1759282000.0,1,https://www.reddit.com/r/investing/comments/1n...
5,LETF,Investing,1n7uw7b,Whats wrong with a 23 year old investing into ...,ncabok0,"High expense ratios, volatility decay",1756944000.0,293,https://www.reddit.com/r/investing/comments/1n...
6,LETF,Investing,1n7uw7b,Whats wrong with a 23 year old investing into ...,ncaczbp,S&P 500 vs. S&P 500 3x over the past 140 years...,1756944000.0,98,https://www.reddit.com/r/investing/comments/1n...
7,LETF,Investing,1n7uw7b,Whats wrong with a 23 year old investing into ...,ncaby74,Probably not a good idea. Leveraged ETF's are ...,1756944000.0,119,https://www.reddit.com/r/investing/comments/1n...
8,LETF,Investing,1n7uw7b,Whats wrong with a 23 year old investing into ...,ncakl2l,2x is better for buy and hold than 3x if you h...,1756947000.0,8,https://www.reddit.com/r/investing/comments/1n...
9,LETF,Investing,1n7uw7b,Whats wrong with a 23 year old investing into ...,ncciu6n,"""Twenty-three year old men who 'invest' all th...",1756981000.0,9,https://www.reddit.com/r/investing/comments/1n...
