# Reddit Finance Posts (PRAW)

Minimal pipeline: search finance-related posts in selected subreddits, clean text, normalize fields, and write a single JSON file (array).

Set the environment variables `REDDIT_CLIENT_ID` and `REDDIT_CLIENT_SECRET` (required) and, optionally, `REDDIT_USER_AGENT` (a default user agent is used when it is unset). You can also put them in a `.env` file at the repo root.

In [None]:
# Parameters
# Subreddits that are searched, one query per subreddit.
SUBREDDITS = [
    'wallstreetbets',
    'stocks',
    'investing',
    'finance',
]
# Search terms; they are OR-ed together into a single query string.
KEYWORDS = [
    'stock',
    'stocks',
    'market',
    'earnings',
    'fed',
    'inflation',
    'rate hike',
    'nvda',
    'tsla',
    'aapl',
]
# Search window: one of 'hour', 'day', 'week', 'month', 'year', 'all'.
TIME_FILTER = 'week'
# Upper bound on search results requested from each subreddit.
LIMIT_PER_SUBREDDIT = 400
# Root output folder, relative to the notebooks/ directory.
OUTPUT_DIR = '../data/processed/reddit'
# Run identifier; when None, the UTC date (YYYY-MM-DD) is used instead.
RUN_ID = None


In [5]:
# !pip -q install praw python-dotenv
import os, json, re, pathlib, datetime as dt
from typing import Dict, Any, List, Optional

try:
    from dotenv import load_dotenv
    load_dotenv()
except Exception:
    pass

import praw

def today_str() -> str:
    """Return the current UTC date formatted as ``YYYY-MM-DD``.

    A timezone-aware ``datetime.now(timezone.utc)`` is used instead of
    ``datetime.utcnow()``, which is deprecated since Python 3.12 and
    returns a naive datetime. The formatted date is the same.
    """
    return dt.datetime.now(dt.timezone.utc).strftime('%Y-%m-%d')

# Resolve the run identifier (falls back to today's UTC date) and make sure
# the per-run output directory exists before anything is written to it.
RUN_ID = RUN_ID or today_str()
RUN_DIR = str(pathlib.Path(OUTPUT_DIR) / RUN_ID)
pathlib.Path(RUN_DIR).mkdir(parents=True, exist_ok=True)

# Credentials are read from the process environment; only the user agent
# has a built-in fallback value.
CLIENT_ID = os.getenv('REDDIT_CLIENT_ID')
CLIENT_SECRET = os.getenv('REDDIT_CLIENT_SECRET')
USER_AGENT = os.getenv('REDDIT_USER_AGENT', 'investor-sentiment-dashboard/0.1 by script')

# Validate with an explicit raise rather than `assert`: assertions are
# stripped when Python runs with -O, which would let missing credentials
# slip through to the first API call.
if not (CLIENT_ID and CLIENT_SECRET and USER_AGENT):
    raise RuntimeError('Missing Reddit creds. Set REDDIT_CLIENT_ID/SECRET/USER_AGENT.')

# No username/password is supplied, so the client only performs reads.
# NOTE(review): check_for_async=False is presumably here to silence PRAW's
# async-environment warning inside Jupyter - confirm against the PRAW docs.
reddit = praw.Reddit(
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
    user_agent=USER_AGENT,
    check_for_async=False,
)
print('Reddit client ready (read-only):', reddit.read_only)


Reddit client ready (read-only): True


In [6]:
# Matches "http://" or "https://" followed by a run of letters, digits,
# common URL punctuation and %XX escapes.
_URL_RE = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')


def clean_text(txt: str) -> str:
    """Strip URLs from *txt* and squeeze whitespace.

    Every match of ``_URL_RE`` is deleted, each run of whitespace is
    replaced by a single space, and leading/trailing whitespace is
    trimmed. Falsy input (``None`` or ``''``) yields ``''``.
    """
    if txt:
        without_urls = _URL_RE.sub('', txt)
        return re.sub(r'\s+', ' ', without_urls).strip()
    return ''


In [7]:
def normalize_post(submission) -> Dict[str, Any]:
    """Flatten a PRAW submission into a plain dict of selected fields.

    The title and selftext go through ``clean_text``. A missing author
    becomes ``'[deleted]'``. ``upvote_ratio`` is ``None`` when the
    submission has no such attribute. The permalink is expanded to an
    absolute ``https://www.reddit.com`` URL.
    """
    post: Dict[str, Any] = {}

    # Identity and text content.
    post['id'] = submission.id
    post['title'] = clean_text(submission.title)
    post['selftext'] = clean_text(submission.selftext)

    # A falsy author (e.g. None) is recorded as the literal '[deleted]'.
    author = submission.author
    post['author'] = str(author) if author else '[deleted]'
    post['subreddit'] = str(submission.subreddit)

    # Timestamp is truncated to whole seconds.
    post['created_utc'] = int(submission.created_utc)

    # Engagement metrics.
    post['score'] = submission.score
    post['num_comments'] = submission.num_comments
    post['upvote_ratio'] = getattr(submission, 'upvote_ratio', None)

    # Links: the submission's own URL and its Reddit permalink.
    post['url'] = submission.url
    post['permalink'] = f"https://www.reddit.com{submission.permalink}"
    return post


In [8]:
def build_query(keywords: List[str]) -> str:
    """Combine *keywords* into one search string joined by ``' OR '``.

    Keywords that contain a space are wrapped in double quotes so they
    are searched as a phrase; all others are used as-is. An empty list
    yields an empty string.
    """
    parts: List[str] = []
    for keyword in keywords:
        if ' ' in keyword:
            parts.append(f'"{keyword}"')
        else:
            parts.append(keyword)
    return ' OR '.join(parts)

def fetch_posts_for_subreddit(name: str, limit: int, time_filter: str, keywords: List[str]) -> List[Dict[str, Any]]:
    """Search one subreddit and return its matching posts as normalized dicts.

    Args:
        name: Subreddit name passed to ``reddit.subreddit``.
        limit: Passed through as the search ``limit``.
        time_filter: Passed through as the search ``time_filter``.
        keywords: Terms combined into the query by ``build_query``.

    Returns:
        One ``normalize_post`` dict per distinct submission id, in the
        order the search (``sort='new'``) yields them.
    """
    subreddit = reddit.subreddit(name)
    query = build_query(keywords)
    results = subreddit.search(query, sort='new', time_filter=time_filter, limit=limit)

    seen_ids = set()
    posts: List[Dict[str, Any]] = []
    for submission in results:
        post_id = submission.id
        # Keep only the first occurrence of each submission id.
        if post_id not in seen_ids:
            seen_ids.add(post_id)
            posts.append(normalize_post(submission))
    return posts


In [9]:
# Pull matching posts from every configured subreddit into one flat list.
all_rows: List[Dict[str, Any]] = []
for sub in SUBREDDITS:
    print(f'Fetching r/{sub} ...')
    rows = fetch_posts_for_subreddit(
        name=sub,
        limit=LIMIT_PER_SUBREDDIT,
        time_filter=TIME_FILTER,
        keywords=KEYWORDS,
    )
    print(f'  Found {len(rows)} posts')
    all_rows += rows

# Deduplicate across subreddits by post id; when an id repeats, the later
# row replaces the earlier one.
by_id = {}
for row in all_rows:
    by_id[row['id']] = row

# Newest first; a missing or falsy timestamp sorts as 0.
deduped = sorted(
    by_id.values(),
    key=lambda post: post.get('created_utc') or 0,
    reverse=True,
)
print('Total unique posts:', len(deduped))

# Write all posts as a single JSON array, keeping non-ASCII text unescaped.
out_path = os.path.join(RUN_DIR, f'reddit_finance_{RUN_ID}.json')
with open(out_path, 'w', encoding='utf-8') as f:
    json.dump(deduped, f, ensure_ascii=False)
print('Wrote:', out_path)


Fetching r/wallstreetbets ...
  Found 115 posts
Fetching r/stocks ...
  Found 140 posts
Fetching r/investing ...
  Found 128 posts
Fetching r/finance ...
  Found 1 posts
Total unique posts: 384
Wrote: data\processed\reddit\2025-10-22\reddit_finance_2025-10-22.json
