In [5]:
import calendar
import os
import time
from datetime import datetime, timezone

import pandas as pd
import requests

# Posts-search endpoint queried by fetch_posts_batch().
BASE_URL = "https://arctic-shift.photon-reddit.com/api/posts/search"
# Page size requested per call. collect_posts_timeframe() also compares every
# response against this value to recognise the last (short) page.
BATCH_SIZE = 100  # API max typically 100

def to_epoch(dt):
    """Convert a datetime or a 'YYYY-MM-DD' string to whole epoch seconds.

    Strings are parsed with the fixed format '%Y-%m-%d' (midnight of that day).
    A value without timezone information is interpreted as UTC; an aware
    datetime keeps its own offset. Fractions of a second are truncated.
    """
    moment = datetime.strptime(dt, "%Y-%m-%d") if isinstance(dt, str) else dt
    # Naive values would otherwise be read in the machine's local timezone.
    aware = moment if moment.tzinfo is not None else moment.replace(tzinfo=timezone.utc)
    return int(aware.timestamp())

def fetch_posts_batch(subreddit, after_epoch, before_epoch, limit=BATCH_SIZE, sort="desc", timeout=30):
    """Request a single page of posts from the search endpoint at BASE_URL.

    Parameters
    ----------
    subreddit : str
        Subreddit name, sent as the "subreddit" query parameter.
    after_epoch, before_epoch : int
        Lower / upper time bound in epoch seconds, sent as "after" / "before".
    limit : int
        Maximum number of posts to request.
    sort : str
        Sort order passed through to the API ("desc" = newest first).
    timeout : float
        Timeout in seconds handed to requests.get().

    Returns
    -------
    list
        The value stored under "data" in the JSON response. An empty list is
        returned when the payload is not a JSON object or when "data" is
        missing, null, or not a list, so callers can always iterate the result.

    Raises
    ------
    requests.HTTPError
        Raised by raise_for_status() for an unsuccessful HTTP status.
    """
    params = {
        "subreddit": subreddit,
        "after": after_epoch,    # epoch seconds
        "before": before_epoch,  # epoch seconds
        "limit": limit,
        "sort": sort,            # "desc" = newest first
    }
    r = requests.get(BASE_URL, params=params, timeout=timeout)
    r.raise_for_status()
    js = r.json()
    if not isinstance(js, dict):
        return []
    # `js.get("data", [])` would hand back None for an explicit `"data": null`.
    data = js.get("data")
    return data if isinstance(data, list) else []

def collect_posts_timeframe(subreddit, start_dt, end_dt, sleep_sec=0.8, verbose=True):
    """
    Collect the posts of `subreddit` created between start_dt and end_dt.

    Pages backward through the window: each request asks for up to BATCH_SIZE
    posts in descending order, then the upper bound is moved to one second
    before the oldest timestamp just received. Paging stops when a page is
    empty, contains only already-seen ids, is shorter than BATCH_SIZE, carries
    no usable timestamp at all, or when the upper bound reaches the lower one.

    Parameters
    ----------
    subreddit : str
        Subreddit name passed to fetch_posts_batch().
    start_dt, end_dt : datetime or 'YYYY-MM-DD' str
        Window bounds, converted with to_epoch().
        NOTE(review): whether the API treats each bound as inclusive or
        exclusive is not visible here -- confirm against the API docs.
    sleep_sec : float
        Pause between consecutive requests.
    verbose : bool
        Print progress for every batch when True.

    Returns
    -------
    list of dict
        Raw post objects, de-duplicated by "id", in the order received.
        Posts without an "id" are dropped.

    Any exception raised by fetch_posts_batch() propagates; posts collected up
    to that point are not returned.
    """
    after_epoch  = to_epoch(start_dt)
    before_epoch = to_epoch(end_dt)

    all_posts, seen = [], set()
    batch = 1
    while True:
        if verbose:
            print(f"Batch {batch}: after={after_epoch}  before={before_epoch}")

        posts = fetch_posts_batch(subreddit, after_epoch, before_epoch, BATCH_SIZE, "desc")
        if not posts:
            if verbose: print("No more posts returned.")
            break

        # de-dupe and append
        new = []
        for p in posts:
            pid = p.get("id")
            if pid and pid not in seen:
                seen.add(pid)
                new.append(p)
        if not new:
            if verbose: print("All posts in batch were duplicates.")
            break

        all_posts.extend(new)
        if verbose:
            print(f"Added {len(new)} (total {len(all_posts)})")

        # If batch smaller than limit, we've hit the end of this window
        if len(posts) < BATCH_SIZE:
            if verbose: print("Incomplete batch -> reached end of window.")
            break

        # Page backward from the oldest post that actually has a timestamp.
        # A single post without "created_utc" must not end the whole collection,
        # so such posts are ignored here; only a page with no usable timestamp
        # at all stops the paging.
        stamps = [ts for ts in (p.get("created_utc") for p in posts) if ts and ts > 0]
        if not stamps:
            if verbose: print("Oldest timestamp missing -> stopping.")
            break

        # NOTE(review): posts sharing the oldest second that did not fit into
        # this page are not requested again and can be missed.
        before_epoch = min(stamps) - 1  # step just before oldest to avoid duplicate
        if before_epoch <= after_epoch:
            if verbose: print("Reached start boundary.")
            break

        batch += 1
        time.sleep(sleep_sec)  # polite rate limit

    if verbose:
        print(f"Total unique posts collected: {len(all_posts)}")
    return all_posts

def process_posts_to_dataframe(raw_posts, subreddit, start_dt, end_dt, require_selftext=False):
    """Flatten raw post dicts into a DataFrame with a fixed set of columns.

    Parameters
    ----------
    raw_posts : iterable of dict
        Post objects as returned by the API.
    subreddit : str
        Written to the "subreddit" column and used to build a fallback
        permalink when a post has none.
    start_dt, end_dt
        Accepted for call compatibility; not used by this function.
    require_selftext : bool
        When True, posts whose "selftext" is missing or whitespace-only are
        skipped.

    Returns
    -------
    pandas.DataFrame
        One row per kept post. Posts without "id" or "created_utc" are dropped.
        The columns are always present, even when no post is kept, so an empty
        month still produces a frame (and CSV header) with the same schema.
    """
    columns = [
        "post_id", "title", "text", "author", "score", "upvote_ratio",
        "num_comments", "created_utc", "created_datetime", "subreddit", "permalink",
    ]
    rows = []
    for p in raw_posts:
        pid = p.get("id"); ts = p.get("created_utc")
        if not pid or not ts:
            continue
        if require_selftext and not (p.get("selftext") or "").strip():
            continue

        # Relative permalinks are turned into absolute reddit.com URLs.
        permalink = p.get("permalink") or f"/r/{subreddit}/comments/{pid}"
        if permalink.startswith("/"):
            permalink = f"https://www.reddit.com{permalink}"

        # UTC wall-clock time, stored without tzinfo.
        dt = datetime.fromtimestamp(ts, tz=timezone.utc).replace(tzinfo=None)
        rows.append({
            "post_id": pid,
            "title": p.get("title", ""),
            "text": p.get("selftext", ""),
            "author": p.get("author", "[deleted]"),
            "score": p.get("score", 0),
            "upvote_ratio": p.get("upvote_ratio", 0.0),
            "num_comments": p.get("num_comments", 0),
            "created_utc": ts,
            "created_datetime": dt,
            "subreddit": subreddit,
            "permalink": permalink
        })
    # Explicit columns: pd.DataFrame([]) alone would have no columns at all.
    return pd.DataFrame(rows, columns=columns)

# ---------- Your monthly wrapper (uses the general timeframe collector) ----------
def collect_month(subreddit, month, year, **kwargs):
    """Collect one calendar month of posts and return them as a DataFrame.

    The window runs from the first instant of `month` to the first instant of
    the following month (both UTC). Extra keyword arguments are forwarded to
    collect_posts_timeframe(). Only posts with non-empty selftext are kept, and
    "month" / "year" columns are added to the result.
    """
    start_dt = datetime(year, month, 1, tzinfo=timezone.utc)
    # December rolls over into January of the next year.
    end_dt = datetime(year + month // 12, month % 12 + 1, 1, tzinfo=timezone.utc)

    raw = collect_posts_timeframe(subreddit, start_dt, end_dt, **kwargs)
    frame = process_posts_to_dataframe(raw, subreddit, start_dt, end_dt, require_selftext=True)

    # Tag every row with the period it was collected for.
    for column, value in (("month", month), ("year", year)):
        frame[column] = value
    return frame

# r/careeradvice was chosen because its posts are typically answer-driven, which lets us
# observe how such posts affect user engagement (upvote score and comment count).
info_subreddits = ["careeradvice"]
subreddit_name = "careeradvice"

# Create the output folder up front: to_csv() does not create missing directories,
# and failing on the first save would throw away a full month of downloading.
os.makedirs(f"data/{subreddit_name}", exist_ok=True)

for sr in info_subreddits:
    for year in [2024, 2025]:
        # The 2023 rule only takes effect if 2023 is added to the year list above.
        start_m = 8 if year == 2023 else 1
        # 2025 is collected through August only.
        end_m   = 8 if year == 2025 else 12
        for m in range(start_m, end_m+1):
            df = collect_month(sr, m, year, sleep_sec=0.8, verbose=True)
            out = f"data/{subreddit_name}/{sr}_{year}_{m:02d}.csv"
            df.to_csv(out, index=False)
            print(f"Saved {len(df)} posts -> {out}")


Batch 1: after=1704067200  before=1706745600
Added 100 (total 100)
Batch 2: after=1704067200  before=1706676627
Added 100 (total 200)
Batch 3: after=1704067200  before=1706630271
Added 100 (total 300)
Batch 4: after=1704067200  before=1706542876
Added 100 (total 400)
Batch 5: after=1704067200  before=1706415017
Added 100 (total 500)
Batch 6: after=1704067200  before=1706290381
Added 100 (total 600)
Batch 7: after=1704067200  before=1706206765
Added 100 (total 700)
Batch 8: after=1704067200  before=1706113393
Added 100 (total 800)
Batch 9: after=1704067200  before=1706029898
Added 100 (total 900)
Batch 10: after=1704067200  before=1705953045
Added 100 (total 1000)
Batch 11: after=1704067200  before=1705854171
Added 100 (total 1100)
Batch 12: after=1704067200  before=1705711990
Added 100 (total 1200)
Batch 13: after=1704067200  before=1705615290
Added 100 (total 1300)
Batch 14: after=1704067200  before=1705530910
Added 100 (total 1400)
Batch 15: after=1704067200  before=1705445802
Added 

In [14]:
import pandas as pd
import os

path = "./data/careeradvice/"
combined_name = "combined_careeradvice_raw.csv"

# The combined file is written into the same folder it is built from, so it
# must be excluded here; otherwise re-running this cell reads the previous
# output back in and duplicates every row. Sorting gives a reproducible row
# order, since os.listdir() returns names in arbitrary order.
csv_files = sorted(
    f for f in os.listdir(path)
    if f.endswith(".csv") and f != combined_name
)

df_list = [pd.read_csv(os.path.join(path, f)) for f in csv_files]
combined_df = pd.concat(df_list, ignore_index=True)

combined_df.to_csv(os.path.join(path, combined_name), index=False)