## Import Statements

In [7]:
import pandas as pd
import os
import requests
from datetime import datetime, timedelta
import time

In [12]:
subreddit_communities = ["r/Science", "r/changemyview", "r/AmItheAsshole"]
API_URL = "https://arctic-shift.photon-reddit.com/api/posts/search"
data_dir = "./raw_data"
os.makedirs(data_dir, exist_ok=True)

In [16]:
def fetch_posts(subreddit, after, before, limit=100):
    params = {
        "subreddit": subreddit.replace("r/", ""),
        "after": after, "before": before,
        "limit": limit, "sort": "desc"
    }
    try:
        r = requests.get(API_URL, params=params, timeout=30)
        r.raise_for_status()
        return r.json().get("data", [])
    except Exception as e:
        print(f"[fetch err] {subreddit} {after}→{before}: {e}")
        return []

def minmax_norm(series):
    smin, smax = series.min(), series.max()
    if pd.isna(smin) or pd.isna(smax) or smax == smin:
        return pd.Series([0.5] * len(series), index=series.index)  # flat month → neutral 0.5
    return (series - smin) / (smax - smin)

def collect_month(subreddit, year, month):
    # month bounds
    start = datetime(year, month, 1)
    end   = datetime(year + (month == 12), (month % 12) + 1, 1)
    after, before = start.strftime("%Y-%m-%d"), end.strftime("%Y-%m-%d")

    seen, rows, cursor = set(), [], before
    while True:
        batch = fetch_posts(subreddit, after, cursor, limit=100)
        if not batch:
            break
        for p in batch:
            pid = p.get("id")
            if pid and pid not in seen:
                seen.add(pid)
                rows.append(p)
        oldest_ts = min(p.get("created_utc", 0) for p in batch)
        cursor = datetime.fromtimestamp(oldest_ts).strftime("%Y-%m-%d")
        if cursor <= after:
            break
        time.sleep(1)  # be nice to the API

    if not rows:
        return pd.DataFrame()

    df = pd.DataFrame(rows)
    df["year"], df["month"] = year, month

    # 50/50 popularity = normalized score (50%) + normalized comments (50%)
    if {"score", "num_comments"}.issubset(df.columns):
        df["score_norm"] = minmax_norm(df["score"].astype("float"))
        df["comments_norm"] = minmax_norm(df["num_comments"].astype("float"))
        df["popularity"] = 0.5 * df["score_norm"] + 0.5 * df["comments_norm"]
        df = df.sort_values("popularity", ascending=False).head(50).reset_index(drop=True)
    else:
        # fallback: if fields missing, keep top 50 by score
        df = df.sort_values("score", ascending=False).head(50).reset_index(drop=True)

    return df

def last_n_months(n=24):
    # step back month-by-month from the first of the current month
    anchor = datetime.utcnow().replace(day=1)
    y, m = anchor.year, anchor.month
    for _ in range(n):
        yield y, m
        m -= 1
        if m == 0:
            m, y = 12, y - 1

for sub in subreddit_communities:
    sub_name = sub.replace("r/", "")
    out_dir = os.path.join(data_dir, sub_name)
    os.makedirs(out_dir, exist_ok=True)

    for y, m in last_n_months(24):
        print(f"{sub}  {y}-{m:02d}")
        dfm = collect_month(sub, y, m)
        if dfm.empty:
            print("  (no data)")
            continue

        out_path = os.path.join(out_dir, f"{y}-{m:02d}_top50.csv")
        dfm[[
            "id","title","selftext","author","score","num_comments","upvote_ratio",
            "popularity","created_utc","permalink","year","month"
        ]].to_csv(out_path, index=False)
        print(f"  saved {len(dfm)} → {out_path}")

  anchor = datetime.utcnow().replace(day=1)


r/Science  2025-10
  saved 50 → ./raw_data/Science/2025-10_top50.csv
r/Science  2025-09
  saved 50 → ./raw_data/Science/2025-09_top50.csv
r/Science  2025-08
  saved 50 → ./raw_data/Science/2025-08_top50.csv
r/Science  2025-07
  saved 50 → ./raw_data/Science/2025-07_top50.csv
r/Science  2025-06
  saved 50 → ./raw_data/Science/2025-06_top50.csv
r/Science  2025-05
  saved 50 → ./raw_data/Science/2025-05_top50.csv
r/Science  2025-04
  saved 50 → ./raw_data/Science/2025-04_top50.csv
r/Science  2025-03
  saved 50 → ./raw_data/Science/2025-03_top50.csv
r/Science  2025-02
  saved 50 → ./raw_data/Science/2025-02_top50.csv
r/Science  2025-01
  saved 50 → ./raw_data/Science/2025-01_top50.csv
r/Science  2024-12
  saved 50 → ./raw_data/Science/2024-12_top50.csv
r/Science  2024-11
  saved 50 → ./raw_data/Science/2024-11_top50.csv
r/Science  2024-10
  saved 50 → ./raw_data/Science/2024-10_top50.csv
r/Science  2024-09
  saved 50 → ./raw_data/Science/2024-09_top50.csv
r/Science  2024-08
  saved 50 → ./

  anchor = datetime.utcnow().replace(day=1)


  saved 50 → ./raw_data/changemyview/2025-10_top50.csv
r/changemyview  2025-09
  saved 50 → ./raw_data/changemyview/2025-09_top50.csv
r/changemyview  2025-08
  saved 50 → ./raw_data/changemyview/2025-08_top50.csv
r/changemyview  2025-07
  saved 50 → ./raw_data/changemyview/2025-07_top50.csv
r/changemyview  2025-06
  saved 50 → ./raw_data/changemyview/2025-06_top50.csv
r/changemyview  2025-05
  saved 50 → ./raw_data/changemyview/2025-05_top50.csv
r/changemyview  2025-04
  saved 50 → ./raw_data/changemyview/2025-04_top50.csv
r/changemyview  2025-03
  saved 50 → ./raw_data/changemyview/2025-03_top50.csv
r/changemyview  2025-02
  saved 50 → ./raw_data/changemyview/2025-02_top50.csv
r/changemyview  2025-01
  saved 50 → ./raw_data/changemyview/2025-01_top50.csv
r/changemyview  2024-12
  saved 50 → ./raw_data/changemyview/2024-12_top50.csv
r/changemyview  2024-11
  saved 50 → ./raw_data/changemyview/2024-11_top50.csv
r/changemyview  2024-10
  saved 50 → ./raw_data/changemyview/2024-10_top50.c

  anchor = datetime.utcnow().replace(day=1)


  saved 50 → ./raw_data/AmItheAsshole/2025-10_top50.csv
r/AmItheAsshole  2025-09
  saved 50 → ./raw_data/AmItheAsshole/2025-09_top50.csv
r/AmItheAsshole  2025-08
  saved 50 → ./raw_data/AmItheAsshole/2025-08_top50.csv
r/AmItheAsshole  2025-07
  saved 50 → ./raw_data/AmItheAsshole/2025-07_top50.csv
r/AmItheAsshole  2025-06
  saved 50 → ./raw_data/AmItheAsshole/2025-06_top50.csv
r/AmItheAsshole  2025-05
  saved 50 → ./raw_data/AmItheAsshole/2025-05_top50.csv
r/AmItheAsshole  2025-04
  saved 50 → ./raw_data/AmItheAsshole/2025-04_top50.csv
r/AmItheAsshole  2025-03
  saved 50 → ./raw_data/AmItheAsshole/2025-03_top50.csv
r/AmItheAsshole  2025-02
  saved 50 → ./raw_data/AmItheAsshole/2025-02_top50.csv
r/AmItheAsshole  2025-01
  saved 50 → ./raw_data/AmItheAsshole/2025-01_top50.csv
r/AmItheAsshole  2024-12
  saved 50 → ./raw_data/AmItheAsshole/2024-12_top50.csv
r/AmItheAsshole  2024-11
  saved 50 → ./raw_data/AmItheAsshole/2024-11_top50.csv
r/AmItheAsshole  2024-10
  saved 50 → ./raw_data/AmIt