In [1]:
# --- config & imports ---
import os
import time
import math
import pathlib
from datetime import datetime, timezone

import pandas as pd
from dotenv import load_dotenv
from tqdm import tqdm
import praw
from praw.models import MoreComments

# Load .env (expects REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_USER_AGENT)
load_dotenv()
CLIENT_ID = os.getenv("REDDIT_CLIENT_ID")
CLIENT_SECRET = os.getenv("REDDIT_CLIENT_SECRET")
USER_AGENT = os.getenv("REDDIT_USER_AGENT", "crypto-sentiment-scraper/0.1")

# Sanity checks
assert CLIENT_ID and CLIENT_SECRET and USER_AGENT, "Missing Reddit env vars. Set REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_USER_AGENT in .env"

# Create API client (read-only)
reddit = praw.Reddit(
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
    user_agent=USER_AGENT,
)

# --- parameters you can tweak ---
SUBS = ["CryptoCurrency", "Bitcoin"]
MODE = "top"               # "hot" | "new" | "top"
TIME_FILTER = "day"        # for .top(): "hour" | "day" | "week" | "month" | "year" | "all"
POSTS_PER_SUB = 300        # how many submissions per subreddit
FETCH_COMMENTS = True      # set False if you only want posts

# Output folder with date stamp
out_dir = pathlib.Path("reddit_data") / datetime.now(timezone.utc).strftime("%Y%m%d")
out_dir.mkdir(parents=True, exist_ok=True)

def utc_to_iso(ts_utc: float) -> str:
    return datetime.fromtimestamp(ts_utc, tz=timezone.utc).isoformat()

def get_listing(sr, mode, limit, time_filter):
    if mode == "hot":
        return sr.hot(limit=limit)
    elif mode == "new":
        return sr.new(limit=limit)
    elif mode == "top":
        return sr.top(time_filter=time_filter, limit=limit)
    else:
        raise ValueError("MODE must be one of: hot, new, top")

# ---- scrape posts ----
post_rows = []
for sub in SUBS:
    sr = reddit.subreddit(sub)
    print(f"\nFetching {MODE} posts from r/{sub} (limit={POSTS_PER_SUB}, time_filter={TIME_FILTER if MODE=='top' else '—'})")

    for s in tqdm(get_listing(sr, MODE, POSTS_PER_SUB, TIME_FILTER), total=POSTS_PER_SUB):
        post_rows.append({
            "subreddit": sub,
            "post_id": s.id,
            "created_utc": s.created_utc,
            "created_iso": utc_to_iso(s.created_utc),
            "author": getattr(s.author, "name", None),
            "title": s.title,
            "selftext": s.selftext if hasattr(s, "selftext") else None,
            "score": s.score,
            "upvote_ratio": getattr(s, "upvote_ratio", None),
            "num_comments": s.num_comments,
            "permalink": f"https://reddit.com{s.permalink}",
            "url": s.url,                      # external link if any
            "over_18": s.over_18,
            "stickied": s.stickied,
            "locked": s.locked,
            "spoiler": getattr(s, "spoiler", None),
            "flair": getattr(s, "link_flair_text", None),
        })
        # optional tiny pause to be polite (PRAW handles rate limit; this is extra-safe)
        time.sleep(0.05)

posts_df = pd.DataFrame(post_rows)
# sort newest first
if not posts_df.empty:
    posts_df = posts_df.sort_values("created_utc", ascending=False).reset_index(drop=True)

# Save posts
posts_csv = out_dir / f"reddit_posts_{MODE}_{TIME_FILTER if MODE=='top' else 'na'}.csv"
posts_parquet = out_dir / f"reddit_posts_{MODE}_{TIME_FILTER if MODE=='top' else 'na'}.parquet"
posts_df.to_csv(posts_csv, index=False)
posts_df.to_parquet(posts_parquet, index=False)
print(f"\nSaved {len(posts_df)} posts → {posts_csv}")

# ---- optionally scrape comments for the collected posts ----
if FETCH_COMMENTS and not posts_df.empty:
    comment_rows = []
    print("\nFetching comments…")
    # modest cap to avoid huge runs; adjust as needed
    MAX_COMMENTS_PER_POST = 200

    for _, row in tqdm(posts_df.iterrows(), total=len(posts_df)):
        submission = reddit.submission(id=row["post_id"])
        # replace MoreComments to get full tree (can be slow); use a small limit for speed
        submission.comments.replace_more(limit=0)
        count = 0
        for c in submission.comments.list():
            if isinstance(c, MoreComments):
                continue
            comment_rows.append({
                "subreddit": row["subreddit"],
                "post_id": row["post_id"],
                "comment_id": c.id,
                "created_utc": c.created_utc,
                "created_iso": utc_to_iso(c.created_utc),
                "author": getattr(c.author, "name", None),
                "body": c.body,
                "score": c.score,
                "permalink": f"https://reddit.com{c.permalink}",
                "is_submitter": getattr(c, "is_submitter", None),
            })
            count += 1
            if count >= MAX_COMMENTS_PER_POST:
                break
        # micro-sleep between posts
        time.sleep(0.1)

    comments_df = pd.DataFrame(comment_rows)
    if not comments_df.empty:
        comments_df = comments_df.sort_values("created_utc", ascending=False).reset_index(drop=True)
        comments_csv = out_dir / f"reddit_comments_{MODE}_{TIME_FILTER if MODE=='top' else 'na'}.csv"
        comments_parquet = out_dir / f"reddit_comments_{MODE}_{TIME_FILTER if MODE=='top' else 'na'}.parquet"
        comments_df.to_csv(comments_csv, index=False)
        comments_df.to_parquet(comments_parquet, index=False)
        print(f"Saved {len(comments_df)} comments → {comments_csv}")
    else:
        print("No comments collected (empty set).")


Fetching top posts from r/CryptoCurrency (limit=300, time_filter=day)


 20%|█▉        | 59/300 [00:05<00:23, 10.13it/s]



Fetching top posts from r/Bitcoin (limit=300, time_filter=day)


 17%|█▋        | 51/300 [00:03<00:16, 15.46it/s]



Fetching top posts from r/ethereum (limit=300, time_filter=day)


  1%|          | 2/300 [00:00<00:47,  6.28it/s]



Fetching top posts from r/ethtrader (limit=300, time_filter=day)


  5%|▍         | 14/300 [00:01<00:24, 11.67it/s]



Fetching top posts from r/cryptomarkets (limit=300, time_filter=day)


  9%|▉         | 27/300 [00:01<00:19, 14.08it/s]



Saved 153 posts → reddit_data/20250818/reddit_posts_top_day.csv

Fetching comments…


100%|██████████| 153/153 [01:06<00:00,  2.28it/s]

Saved 3056 comments → reddit_data/20250818/reddit_comments_top_day.csv



