In [None]:
# import the necessary libraries
import os
import json
import time
from datetime import datetime, timezone

import praw
from praw.models import MoreComments
from prawcore.exceptions import NotFound, Forbidden, ServerError, ResponseException, RequestException 


In [None]:
# Reddit API credentials.
# Prefer environment variables so real secrets never get saved inside the
# notebook (or committed along with it):
#   REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_USER_AGENT
# The placeholder strings below are only fallbacks; replace them (or set the
# environment variables) before running.
reddit = praw.Reddit(
    client_id=os.environ.get("REDDIT_CLIENT_ID", "YOUR_CLIENT_ID"),
    client_secret=os.environ.get("REDDIT_CLIENT_SECRET", "YOUR_CLIENT_SECRET"),
    user_agent=os.environ.get(
        "REDDIT_USER_AGENT",
        "script:ed-scraper:0.1 (by u/YOUR_USERNAME)",
    ),
)

# Check if the Reddit instance is read-only
print("read_only:", reddit.read_only)


In [None]:

# Timestamp helper: epoch seconds -> ISO-8601 text
def iso(ts):
    """Return the ISO-8601 string (UTC, with offset) for a POSIX timestamp `ts`."""
    moment = datetime.fromtimestamp(ts, timezone.utc)
    return moment.isoformat()


In [None]:
# Function to estimate upvotes and downvotes from score and upvote ratio
def approx_votes(score, ratio):
    """
    Estimate ups and downs from score and upvote_ratio.

    Derivation: with score = ups - downs and ratio = ups / (ups + downs),
    the total number of votes is n = score / (2 * ratio - 1), hence
    ups = ratio * n and downs = (1 - ratio) * n (both rounded to integers).

    WARNING: This is approximate; Reddit fuzzes/withholds data.

    Parameters:
        score: net score of the item (number), or None if unavailable.
        ratio: upvote ratio of the item (number), or None if unavailable.

    Returns (ups, downs) or (None, None) if not computable (missing input,
    ratio of exactly 0.5, non-numeric/NaN/infinite values, or an estimate
    that would be negative).
    """
    # ratio == 0.5 makes the denominator (2 * ratio - 1) zero, so the total
    # cannot be recovered; missing inputs are equally uncomputable.
    if score is None or ratio is None or ratio == 0.5:
        return None, None

    try:
        n = score / (2 * ratio - 1)
        ups = int(round(ratio * n))
        downs = int(round((1 - ratio) * n))
    except (TypeError, ValueError, OverflowError, ZeroDivisionError):
        # TypeError: non-numeric input; ValueError/OverflowError: NaN or
        # infinite intermediate values cannot be rounded to an int.
        return None, None

    # A negative count means score and ratio contradict each other
    # (e.g. positive score with ratio below 0.5).
    if ups < 0 or downs < 0:
        return None, None
    return ups, downs


In [None]:
# Author helper: always yields a string, even when the author is missing
def safe_author(a):
    """Return str(a), or the placeholder "[deleted]" when `a` is None."""
    if a is None:
        return "[deleted]"
    return str(a)


In [None]:
# Function to fetch a submission with its comments, expanding as needed
def fetch_submission_with_comments(subm, expand_more=8, max_comments=300):
    """
    Build a JSON-serializable record for one submission together with a
    flattened, capped list of its comments.

    Parameters:
        subm: submission object exposing the attributes read below
            (id, title, is_self, selftext, url, permalink, subreddit,
            author, created_utc, num_comments, score, comments and,
            optionally, upvote_ratio).
        expand_more: how many MoreComments nodes to expand
            (None=all, can be huge)
        max_comments: cap the total number of flattened comments we store
            (None = no cap)

    Returns:
        dict with the submission fields, the estimated up/down votes and a
        "comments" list of per-comment dicts.

    Comment loading is best-effort: failures are printed and result in
    fewer (possibly zero) stored comments rather than an exception.
    """
    data = {
        "id": subm.id,
        "title": subm.title,
        "is_self": subm.is_self,
        "selftext": subm.selftext if subm.is_self else "",
        "url": subm.url,
        "permalink": f"https://www.reddit.com{subm.permalink}",
        "subreddit": str(subm.subreddit.display_name),
        "author": safe_author(subm.author),
        "created_utc": subm.created_utc,
        "created_iso": iso(subm.created_utc),
        "num_comments": subm.num_comments,
        "score": subm.score,
        "upvote_ratio": getattr(subm, "upvote_ratio", None),
        "approx_upvotes": None,
        "approx_downvotes": None,
        "comments": []
    }

    # Estimate upvotes and downvotes from the values already captured above
    ups, downs = approx_votes(data["score"], data["upvote_ratio"])
    data["approx_upvotes"] = ups
    data["approx_downvotes"] = downs

    # Expand some comment trees, then flatten
    flat = []
    try:
        forest = subm.comments
        try:
            forest.replace_more(limit=expand_more)
        except Exception as e:
            # Expansion is best-effort: fall through and keep whatever
            # comments were already loaded instead of discarding them.
            print(f"  ! Could not expand comments for {subm.id}: {e}")
        flat = forest.list()
    except Exception as e:
        # The comment forest itself could not be loaded/flattened
        print(f"  ! Could not load comments for {subm.id}: {e}")

    # Drop unexpanded placeholders BEFORE applying the cap, so the cap counts
    # real comments only (otherwise placeholders eat into the budget).
    real_comments = [c for c in flat if not isinstance(c, MoreComments)]

    # Cap total number of comments to store
    if max_comments is not None:
        real_comments = real_comments[:max_comments]

    for c in real_comments:
        try:
            data["comments"].append({
                "id": c.id,
                "author": safe_author(c.author),
                "body": c.body,
                "created_utc": c.created_utc,
                "created_iso": iso(c.created_utc),
                "score": c.score,
                "parent_id": c.parent_id,
                "link_id": c.link_id
            })
        except Exception as e:
            # Some comments may fail to serialize (e.g., deleted/removed)
            print(f"    ! Skipping a comment in {subm.id}: {e}")

    return data


In [None]:
# Gather the distinct authors of the newest posts in the seed subreddit
seed_subreddit = "EDanonymemes"  # Change this to your target subreddit
authors = set()
for submission in reddit.subreddit(seed_subreddit).new(limit=40):
    author_name = safe_author(submission.author)
    # Skip placeholder / bot accounts
    if author_name in ("[deleted]", "None", "AutoModerator"):
        continue
    authors.add(author_name)


In [None]:
# Turn the author set into a sorted list so runs are reproducible
authors = sorted(authors)
print(f"Collected {len(authors)} authors from r/{seed_subreddit}")

# Persist the author list as pretty-printed UTF-8 JSON
authors_json = json.dumps(authors, ensure_ascii=False, indent=2)
with open("authors.json", "w", encoding="utf-8") as f:
    f.write(authors_json)


In [None]:
# Result containers filled by the crawl loop:
#   user_subreddits: username -> list of subreddit names
#   user_posts:      username -> {"submissions": [post records]}
user_subreddits = dict()
user_posts = dict()

In [None]:
# Crawl settings: tune these to keep the load on Reddit's servers reasonable
MAX_SUBMISSIONS_PER_USER = 50
EXPAND_MORE_COMMENTS = 8     # None = all (can explode), 0 = only already-loaded
MAX_COMMENTS_PER_POST = 300  # cap
PAUSE_BETWEEN_USERS = 1.0    # seconds, to be nice to the API

total_authors = len(authors)
for position, username in enumerate(authors, start=1):
    print(f"\n[{position}/{total_authors}] Fetching for u/{username} ...")
    seen_subreddits = []
    collected_posts = []
    try:
        # Lazily iterate over this user's most recent submissions
        recent_submissions = reddit.redditor(username).submissions.new(
            limit=MAX_SUBMISSIONS_PER_USER
        )
        for subm in recent_submissions:
            # Record each subreddit once, in first-seen order
            subreddit_name = str(subm.subreddit.display_name)
            if subreddit_name not in seen_subreddits:
                seen_subreddits.append(subreddit_name)

            # Build the post + comments record; a failing post is skipped
            try:
                collected_posts.append(
                    fetch_submission_with_comments(
                        subm,
                        expand_more=EXPAND_MORE_COMMENTS,
                        max_comments=MAX_COMMENTS_PER_POST,
                    )
                )
            except (Forbidden, NotFound) as e:
                print(f"  ! Skipping a post (private/forbidden/not found): {e}")
            except (ServerError, ResponseException, RequestException) as e:
                print(f"  ! Transient error on a post: {e}")
                time.sleep(2)

        # Only reached when the whole listing was consumed without error
        user_subreddits[username] = seen_subreddits
        user_posts[username] = {"submissions": collected_posts}

    except (Forbidden, NotFound) as e:
        print(f"Skipping u/{username}: {e}")
    except (ServerError, ResponseException, RequestException) as e:
        print(f"Transient error for u/{username}: {e}")
        time.sleep(2)
    finally:
        # Always pause between users, whether or not the fetch succeeded
        time.sleep(PAUSE_BETWEEN_USERS)



In [None]:
# Write both result mappings to disk as pretty-printed UTF-8 JSON
outputs = (
    ("user_subreddits.json", user_subreddits),
    ("user_posts_with_comments.json", user_posts),
)
for out_path, payload in outputs:
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)

saved_user_count = len(user_subreddits)
print(
    f"\nSaved subreddit mapping for {saved_user_count} users "
    f"and rich post+comment data to user_posts_with_comments.json"
)