In [None]:
# @title 0) Setup: Install dependencies
!pip -q install praw pandas tqdm python-dotenv nltk

# Download VADER lexicon for the optional baseline sentiment step
import nltk
nltk.download('vader_lexicon')

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/189.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m184.3/189.3 kB[0m [31m7.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m189.3/189.3 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [None]:
# @title 1) Configure credentials and collection settings
# SECURITY: API credentials must not live in the notebook source, because the
# notebook (including saved form-field values) gets shared and committed.
# The client id and secret are read from the REDDIT_CLIENT_ID / REDDIT_CLIENT_SECRET
# environment variables; when a variable is unset or blank, a hidden prompt asks
# for the value instead. Any credential that was ever saved in this cell should be
# treated as leaked and regenerated in the Reddit app preferences.
import os
from getpass import getpass

REDDIT_CLIENT_ID = os.environ.get("REDDIT_CLIENT_ID", "").strip() or getpass("Reddit client id: ").strip()
REDDIT_CLIENT_SECRET = os.environ.get("REDDIT_CLIENT_SECRET", "").strip() or getpass("Reddit client secret: ").strip()
# The user agent is not a secret, so it remains an editable form field.
REDDIT_USER_AGENT = "finance-sentiment-colab by diego"  # @param {type:"string"}

# Subreddits to collect from (finance domain):
SUBREDDITS = ["stocks", "investing", "wallstreetbets", "pennystocks", "FinancialPlanning"]  # @param {type:"raw"}

# Days of history to keep (submissions older than this are skipped).
# Exposed as a form field like every other setting in this cell.
DAYS_BACK = 60  # @param {type:"number"}

# Max submissions per subreddit (upper bound)
LIMIT_PER_SUB = 500  # @param {type:"number"}

# Fetch comments for each post?
FETCH_COMMENTS = True  # @param {type:"boolean"}

# Limit of comments per post (to control volume)
MAX_COMMENTS_PER_POST = 200  # @param {type:"number"}

# Output file names
OUTPUT_SUBMISSIONS_CSV = "reddit_finance_submissions.csv"  # @param {type:"string"}
OUTPUT_COMMENTS_CSV = "reddit_finance_comments.csv"  # @param {type:"string"}

# Finance vocabulary used to tag relevant text; edit or extend as needed.
# Entries are lower-case literals, grouped by theme purely for readability.
FINANCE_KEYWORDS = [
    # rates and prices
    "inflation", "interest rate", "hike", "cut",
    # company results
    "earnings", "guidance", "dividend",
    # macro indicators
    "recession", "gdp", "cpi", "ppi",
    # central bank
    "fed", "fomc", "quantitative easing", "qt",
    # fixed income
    "treasury", "bond", "yield",
    # funds and tickers
    "etf", "spy", "nvda", "aapl", "tsla",
    # banking and credit
    "bank", "loan", "credit", "default", "liquidity",
    # risk and valuation
    "volatility", "vix", "valuation", "pe ratio", "cash flow",
]

In [None]:
# Initialize Reddit client and helpers
import re
import time
import pandas as pd
from datetime import datetime, timedelta, timezone
from tqdm import tqdm
import praw
from praw.models import MoreComments

# Validate credentials: fail fast and report exactly which settings are empty.
_empty_settings = [
    setting_name
    for setting_name, setting_value in (
        ("REDDIT_CLIENT_ID", REDDIT_CLIENT_ID),
        ("REDDIT_CLIENT_SECRET", REDDIT_CLIENT_SECRET),
        ("REDDIT_USER_AGENT", REDDIT_USER_AGENT),
    )
    if not setting_value
]
if _empty_settings:
    raise RuntimeError(
        "Missing Reddit API credentials. Please fill in REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_USER_AGENT in the previous cell."
        f" (empty: {', '.join(_empty_settings)})"
    )

# Instantiate PRAW client.
# Notebook kernels run inside an asyncio event loop, which makes PRAW log an
# "It is strongly recommended to use Async PRAW" warning on every request.
# This notebook is deliberately synchronous, so that check is switched off.
reddit = praw.Reddit(
    client_id=REDDIT_CLIENT_ID,
    client_secret=REDDIT_CLIENT_SECRET,
    user_agent=REDDIT_USER_AGENT,
    check_for_async=False,
)

# Precompile keyword regex (case-insensitive, whole-word matches only).
# Regex alternation takes the first alternative that matches, so keywords are
# ordered longest-first: otherwise a short keyword that is a prefix of a longer
# one (e.g. "fed" vs. "fed funds") would always win and hide the longer phrase.
# Blank entries are dropped, and an empty list compiles to a pattern that never
# matches instead of one that matches the empty string at every word boundary.
_keyword_alternatives = sorted(
    {kw for kw in FINANCE_KEYWORDS if kw},
    key=lambda kw: (-len(kw), kw),
)
KEYWORD_REGEX = re.compile(
    (r"\b(" + "|".join(map(re.escape, _keyword_alternatives)) + r")\b")
    if _keyword_alternatives
    else r"(?!)",
    re.IGNORECASE,
)

def utc_iso(ts: float) -> str:
    """Format a POSIX timestamp as an ISO-8601 string with a UTC offset.

    Args:
        ts: Seconds since the Unix epoch.

    Returns:
        The timezone-aware UTC datetime rendered by ``datetime.isoformat()``,
        e.g. ``"1970-01-01T00:00:00+00:00"``.
    """
    moment = datetime.fromtimestamp(ts, timezone.utc)
    return moment.isoformat()

def find_keywords(text: str) -> list:
    """Return the distinct KEYWORD_REGEX matches found in ``text``.

    Args:
        text: Text to scan; a falsy value (``None`` or ``""``) yields no hits.

    Returns:
        The matched substrings, lower-cased, de-duplicated and sorted.
    """
    if text:
        unique_hits = {match.group(0).lower() for match in KEYWORD_REGEX.finditer(text)}
        return sorted(unique_hits)
    return []

def submission_to_row(s) -> dict:
    """Flatten a submission object into one record of the shared CSV layout.

    Keywords are searched in the title and selftext joined by a newline.
    The comment-only fields (``comment_id``, ``comment_body``) are ``None``.

    Args:
        s: Submission-like object exposing ``id``, ``author``, ``created_utc``,
            ``subreddit``, ``score``, ``num_comments``, ``upvote_ratio``,
            ``title``, ``selftext``, ``permalink`` and ``url``.

    Returns:
        A dict with ``type`` set to ``"submission"``; ``keyword_hits`` holds the
        matched keywords joined by ``";"``.
    """
    title_part = s.title or ""
    body_part = s.selftext or ""
    searchable = (title_part + "\n" + body_part).strip()
    matched = find_keywords(searchable)

    # A falsy author is stored as None rather than as its string form.
    author_name = str(s.author) if s.author else None

    return dict(
        type="submission",
        submission_id=s.id,
        comment_id=None,
        author=author_name,
        created_utc=utc_iso(s.created_utc),
        subreddit=str(s.subreddit),
        score=s.score,
        num_comments=s.num_comments,
        upvote_ratio=s.upvote_ratio,
        title=s.title,
        selftext=s.selftext,
        comment_body=None,
        permalink=f"https://www.reddit.com{s.permalink}",
        url=s.url,
        keyword_hits=";".join(matched),
    )

def comment_to_row(s, c) -> dict:
    """Flatten a comment, plus context from its submission, into one record.

    Keywords are searched in the comment body only. The submission-only
    metrics (``num_comments``, ``upvote_ratio``) are ``None``; ``title``,
    ``selftext``, ``subreddit`` and ``url`` are copied from the submission.

    Args:
        s: The submission the comment belongs to.
        c: Comment-like object exposing ``id``, ``author``, ``created_utc``,
            ``score``, ``body`` and ``permalink``.

    Returns:
        A dict with ``type`` set to ``"comment"``; ``keyword_hits`` holds the
        matched keywords joined by ``";"``.
    """
    comment_text = c.body or ""
    matched = find_keywords(comment_text)

    # A falsy author is stored as None rather than as its string form.
    author_name = str(c.author) if c.author else None

    return dict(
        type="comment",
        submission_id=s.id,
        comment_id=c.id,
        author=author_name,
        created_utc=utc_iso(c.created_utc),
        subreddit=str(s.subreddit),
        score=c.score,
        num_comments=None,
        upvote_ratio=None,
        title=s.title,
        selftext=s.selftext,
        comment_body=comment_text,
        permalink=f"https://www.reddit.com{c.permalink}",
        url=s.url,
        keyword_hits=";".join(matched),
    )

In [None]:
# Collect submissions and comments
SLEEP_BETWEEN_REQUESTS = 0.7  # be kind to rate limits
cutoff = datetime.now(timezone.utc) - timedelta(days=DAYS_BACK)

# Ask Reddit for the narrowest "top" window that still covers DAYS_BACK.
# A fixed "year" window wastes most of LIMIT_PER_SUB on posts that the cutoff
# check below discards when DAYS_BACK is small, and misses older posts when
# DAYS_BACK exceeds a year.
if DAYS_BACK <= 1:
    TOP_TIME_FILTER = "day"
elif DAYS_BACK <= 7:
    TOP_TIME_FILTER = "week"
elif DAYS_BACK <= 30:
    TOP_TIME_FILTER = "month"
elif DAYS_BACK <= 365:
    TOP_TIME_FILTER = "year"
else:
    TOP_TIME_FILTER = "all"

# Column layout produced by submission_to_row / comment_to_row. It is only used
# to give an empty result a header row, so the CSVs stay readable by pd.read_csv.
ROW_COLUMNS = [
    "type", "submission_id", "comment_id", "author", "created_utc", "subreddit",
    "score", "num_comments", "upvote_ratio", "title", "selftext", "comment_body",
    "permalink", "url", "keyword_hits",
]

sub_rows = []
com_rows = []

for sub in SUBREDDITS:
    subreddit = reddit.subreddit(sub)
    submissions = subreddit.top(time_filter=TOP_TIME_FILTER, limit=LIMIT_PER_SUB)

    # The listing is fetched lazily while iterating, so a failure there (for
    # example an inaccessible subreddit) is raised by the `for` statement itself.
    # Catch it per subreddit so rows collected so far are still written below.
    try:
        for s in tqdm(submissions, desc=f"Collecting r/{sub}"):
            try:
                created = datetime.fromtimestamp(s.created_utc, tz=timezone.utc)
                if created < cutoff:
                    continue

                sub_rows.append(submission_to_row(s))
                time.sleep(SLEEP_BETWEEN_REQUESTS)

                if FETCH_COMMENTS:
                    # limit=0 drops every "load more comments" placeholder
                    # without issuing extra requests.
                    s.comments.replace_more(limit=0)
                    kept = 0
                    for c in s.comments.list():
                        # Check the cap before appending so a cap of 0 keeps nothing.
                        if kept >= MAX_COMMENTS_PER_POST:
                            break
                        if isinstance(c, MoreComments):
                            continue
                        com_rows.append(comment_to_row(s, c))
                        kept += 1

                    time.sleep(SLEEP_BETWEEN_REQUESTS)

            except Exception as e:
                # Best effort: one bad post must not abort the whole collection.
                print(f"[warn] Skipping item due to error: {e}")
                time.sleep(2)
    except Exception as e:
        print(f"[warn] Stopping r/{sub} early due to listing error: {e}")
        time.sleep(2)

sub_df = pd.DataFrame(sub_rows) if sub_rows else pd.DataFrame(columns=ROW_COLUMNS)
com_df = pd.DataFrame(com_rows) if com_rows else pd.DataFrame(columns=ROW_COLUMNS)

# Deduplicate just in case
sub_df = sub_df.drop_duplicates(subset=["submission_id"])
com_df = com_df.drop_duplicates(subset=["comment_id"])

sub_df.to_csv(OUTPUT_SUBMISSIONS_CSV, index=False)
com_df.to_csv(OUTPUT_COMMENTS_CSV, index=False)

print(f"Saved {len(sub_df)} submissions -> {OUTPUT_SUBMISSIONS_CSV}")
print(f"Saved {len(com_df)} comments -> {OUTPUT_COMMENTS_CSV}")

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/l

Saved 154 submissions -> reddit_finance_submissions.csv
Saved 22286 comments -> reddit_finance_comments.csv


In [None]:
# Sanity checks: reload both CSVs from disk, preview them, and report the
# share of rows that matched at least one finance keyword.
import pandas as pd

sub_df = pd.read_csv(OUTPUT_SUBMISSIONS_CSV)
print("Submissions head:")
display(sub_df.iloc[:3])

com_df = pd.read_csv(OUTPUT_COMMENTS_CSV)
print("Comments head:")
display(com_df.iloc[:3])

print("Keyword hit rates:")
for label, frame in (
    ("Submissions with hits:", sub_df),
    ("Comments with hits:", com_df),
):
    # An empty keyword_hits cell is read back from CSV as NaN, so normalise
    # it to "" before testing for "has at least one hit".
    has_hit = frame["keyword_hits"].fillna("").ne("")
    print(label, has_hit.mean())

Submissions head:


Unnamed: 0,type,submission_id,comment_id,author,created_utc,subreddit,score,num_comments,upvote_ratio,title,selftext,comment_body,permalink,url,keyword_hits
0,submission,1nm9pun,,callsonreddit,2025-09-20T21:11:53+00:00,stocks,12606,978,0.93,Disney+ and Hulu face mass cancellations after...,No paywall: [https://www.usatoday.com/story/en...,,https://www.reddit.com/r/stocks/comments/1nm9p...,https://www.reddit.com/r/stocks/comments/1nm9p...,earnings
1,submission,1o3d10v,,Gamingwishard,2025-10-10T21:06:01+00:00,stocks,11043,1797,0.94,BREAKING: Trump places 100% tariff on China st...,The S&P 500 falls 70+ points in seconds after ...,,https://www.reddit.com/r/stocks/comments/1o3d1...,https://www.reddit.com/r/stocks/comments/1o3d1...,
2,submission,1nxbr66,,WickedSensitiveCrew,2025-10-03T21:28:56+00:00,stocks,7267,426,0.98,The U.S. dollar fell about 11% against other c...,https://www.morganstanley.com/insights/article...,,https://www.reddit.com/r/stocks/comments/1nxbr...,https://www.reddit.com/r/stocks/comments/1nxbr...,inflation


Comments head:


Unnamed: 0,type,submission_id,comment_id,author,created_utc,subreddit,score,num_comments,upvote_ratio,title,selftext,comment_body,permalink,url,keyword_hits
0,comment,1nm9pun,nfbhq7d,Silver_Crypto_Duh,2025-09-20T21:55:38+00:00,stocks,911,,,Disney+ and Hulu face mass cancellations after...,No paywall: [https://www.usatoday.com/story/en...,"Damn, Disney really knows how to piss everyone...",https://www.reddit.com/r/stocks/comments/1nm9p...,https://www.reddit.com/r/stocks/comments/1nm9p...,
1,comment,1nm9pun,nfbazuo,booooimaghost,2025-09-20T21:17:59+00:00,stocks,1300,,,Disney+ and Hulu face mass cancellations after...,No paywall: [https://www.usatoday.com/story/en...,Lol maybe this is what will bring the right an...,https://www.reddit.com/r/stocks/comments/1nm9p...,https://www.reddit.com/r/stocks/comments/1nm9p...,
2,comment,1nm9pun,nfbcopm,Puzzled-Rip641,2025-09-20T21:27:24+00:00,stocks,562,,,Disney+ and Hulu face mass cancellations after...,No paywall: [https://www.usatoday.com/story/en...,I canceled. Fuck Disney and fuck the FCC,https://www.reddit.com/r/stocks/comments/1nm9p...,https://www.reddit.com/r/stocks/comments/1nm9p...,


Keyword hit rates:
Submissions with hits: 0.35714285714285715
Comments with hits: 0.07933231625235573


In [None]:
# Lightweight baseline sentiment (VADER)
from nltk.sentiment import SentimentIntensityAnalyzer

# Module-level VADER analyzer instance; vader_sentiment() below reads it to score text.
sia = SentimentIntensityAnalyzer()

def vader_sentiment(text: str) -> float:
    """Return the VADER ``compound`` score for ``text``.

    Args:
        text: Text to score. Any non-string value (e.g. ``None`` or NaN) is
            treated as having no sentiment.

    Returns:
        The ``"compound"`` entry of ``sia.polarity_scores(text)``, or ``0.0``
        when ``text`` is not a string.
    """
    if isinstance(text, str):
        scores = sia.polarity_scores(text)
        return scores["compound"]
    return 0.0

# Submissions: score the title and selftext together (missing parts count as "").
sub_df = pd.read_csv(OUTPUT_SUBMISSIONS_CSV)
title_text = sub_df["title"].fillna("")
body_text = sub_df["selftext"].fillna("")
sub_df["text_for_sentiment"] = (title_text + "\n" + body_text).str.strip()
sub_df["vader_compound"] = sub_df["text_for_sentiment"].apply(vader_sentiment)

# Comments: score the comment body on its own.
com_df = pd.read_csv(OUTPUT_COMMENTS_CSV)
comment_text = com_df["comment_body"].fillna("")
com_df["vader_compound"] = comment_text.apply(vader_sentiment)

# Write the scored frames next to the raw CSVs, then list what was written.
sentiment_outputs = (
    (sub_df, "reddit_finance_submissions_with_sentiment.csv"),
    (com_df, "reddit_finance_comments_with_sentiment.csv"),
)
for frame, path in sentiment_outputs:
    frame.to_csv(path, index=False)

print("Saved:")
for _, path in sentiment_outputs:
    print(" - " + path)

# Distribution summary of the compound score for each frame.
for heading, frame in (
    ("\nSubmission sentiment summary:", sub_df),
    ("\nComment sentiment summary:", com_df),
):
    print(heading)
    display(frame["vader_compound"].describe())

Saved:
 - reddit_finance_submissions_with_sentiment.csv
 - reddit_finance_comments_with_sentiment.csv

Submission sentiment summary:


Unnamed: 0,vader_compound
count,154.0
mean,0.385249
std,0.582492
min,-0.9804
25%,0.0
50%,0.59585
75%,0.931325
max,0.9992



Comment sentiment summary:


Unnamed: 0,vader_compound
count,22286.0
mean,0.098361
std,0.429294
min,-0.9976
25%,0.0
50%,0.0
75%,0.4215
max,0.9981
