In [11]:
# import the necessary libraries
import os
import json
import time
from datetime import datetime, timezone
import csv

import praw
from praw.models import MoreComments
from prawcore.exceptions import NotFound, Forbidden, ServerError, ResponseException, RequestException 


In [None]:
# Reddit API credentials.
# Values are read from environment variables so real secrets never have to be
# typed into (and saved with) the notebook. Set REDDIT_CLIENT_ID,
# REDDIT_CLIENT_SECRET and REDDIT_USER_AGENT before starting the kernel; the
# placeholder strings below are only used when a variable is not set.
reddit = praw.Reddit(
    client_id=os.environ.get("REDDIT_CLIENT_ID", "YOUR_CLIENT_ID"),
    client_secret=os.environ.get("REDDIT_CLIENT_SECRET", "YOUR_CLIENT_SECRET"),
    user_agent=os.environ.get(
        "REDDIT_USER_AGENT", "script:ed-scraper:0.1 (by u/YOUR_USERNAME)"
    ),
)

# Check if the Reddit instance is read-only
print("read_only:", reddit.read_only)


read_only: True


In [13]:

# Render a Unix timestamp as an ISO-8601 string in UTC
def iso(ts):
    """Return the ISO-8601 representation (UTC, with offset) of epoch seconds `ts`."""
    moment = datetime.fromtimestamp(ts, timezone.utc)
    return moment.isoformat()


In [14]:
# Function to estimate upvotes and downvotes from score and upvote ratio
def approx_votes(score, ratio):
    """
    Estimate ups and downs from score and upvote_ratio.
    WARNING: This is approximate; Reddit fuzzes/withholds data.

    Solves  ups - downs = score  and  ups / (ups + downs) = ratio.

    Parameters:
        score: net score of the post (ups minus downs).
        ratio: fraction of votes that are upvotes, or None if unavailable.

    Returns (ups, downs) or (None, None) if not computable (missing or
    non-numeric input, ratio of exactly 0.5, or a negative estimate).
    The returned pair always satisfies ups - downs == score.
    """
    # At ratio == 0.5 the total vote count is undetermined (division by zero).
    if ratio is None or ratio == 0.5:
        return None, None
    try:
        total = score / (2 * ratio - 1)
        ups = int(round(ratio * total))
        # Derive downs from ups instead of rounding it independently:
        # rounding both halves separately can break ups - downs == score
        # (e.g. score=1, ratio=0.75 would otherwise give 2 ups and 0 downs).
        downs = int(round(ups - score))
    except (TypeError, ValueError, ArithmeticError):
        # Non-numeric input, NaN/inf results, or a zero denominator.
        return None, None
    if ups < 0 or downs < 0:
        # score and ratio contradict each other (e.g. positive score, ratio < 0.5)
        return None, None
    return ups, downs


In [15]:
# Turn an author object into a display string; a missing author becomes "[deleted]"
def safe_author(a):
    """Return str(a), or the placeholder "[deleted]" when `a` is None."""
    if a is None:
        return "[deleted]"
    return str(a)


In [16]:
# Function to fetch a submission with its comments, expanding as needed
def fetch_submission_with_comments(subm, expand_more=8, max_comments=300):
    """
    Build a plain-dict record for one submission plus a flat list of its comments.

    subm: the submission object to serialize (its attributes are read directly).
    expand_more: how many MoreComments nodes to expand (None=all, can be huge)
    max_comments: cap the total number of flattened comments we store
                  (None = no cap). Unexpanded MoreComments placeholders do
                  not count toward the cap.

    Returns a dict with the post fields, the approximate up/down vote counts,
    and a "comments" list of per-comment dicts.

    Errors raised while reading the submission's own fields propagate to the
    caller; errors while loading or expanding comments are printed and the
    record is returned with whatever comments could be obtained.
    """
    data = {
        "id": subm.id,
        "title": subm.title,
        "is_self": subm.is_self,
        "selftext": subm.selftext if subm.is_self else "",
        "url": subm.url,
        "permalink": f"https://www.reddit.com{subm.permalink}",
        "subreddit": str(subm.subreddit.display_name),
        "author": safe_author(subm.author),
        "created_utc": subm.created_utc,
        "created_iso": iso(subm.created_utc),
        "num_comments": subm.num_comments,
        "score": subm.score,
        "upvote_ratio": getattr(subm, "upvote_ratio", None),
        "approx_upvotes": None,
        "approx_downvotes": None,
        "comments": []
    }

    # Estimate upvotes and downvotes from the values already read above
    ups, downs = approx_votes(data["score"], data["upvote_ratio"])
    data["approx_upvotes"] = ups
    data["approx_downvotes"] = downs

    # Expand some comment trees
    try:
        forest = subm.comments
        try:
            forest.replace_more(limit=expand_more)
        except Exception as e:
            # If expansion fails, keep the comments that are already loaded
            # instead of discarding the whole tree.
            print(f"  ! Could not expand comments for {subm.id}: {e}")
        flat = forest.list()
    except Exception as e:
        # The comment tree itself could not be loaded/flattened
        flat = []
        print(f"  ! Could not load comments for {subm.id}: {e}")

    # Drop unexpanded placeholders BEFORE applying the cap, so that they do
    # not use up slots meant for real comments.
    real_comments = [c for c in flat if not isinstance(c, MoreComments)]
    if max_comments is not None:
        real_comments = real_comments[:max_comments]

    for c in real_comments:
        try:
            data["comments"].append({
                "id": c.id,
                "author": safe_author(c.author),
                "body": c.body,
                "created_utc": c.created_utc,
                "created_iso": iso(c.created_utc),
                "score": c.score,
                "parent_id": c.parent_id,
                "link_id": c.link_id
            })
        except Exception as e:
            # Some comments may fail to serialize (e.g., deleted/removed)
            print(f"    ! Skipping a comment in {subm.id}: {e}")

    return data


In [17]:
# Collect the distinct authors of the newest posts in the seed subreddit
seed_subreddit = "EDanonymemes"  # Change this to your target subreddit
authors = set()
newest_posts = reddit.subreddit(seed_subreddit).new(limit=40)
for submission in newest_posts:
    author_name = safe_author(submission.author)
    # Skip placeholder names and the AutoModerator bot account
    if author_name in ("[deleted]", "None", "AutoModerator"):
        continue
    authors.add(author_name)


In [18]:
# Turn the author collection into a sorted list so the crawl order is stable,
# then persist it next to the notebook
authors = list(authors)
authors.sort()
print(f"Collected {len(authors)} authors from r/{seed_subreddit}")

with open("authors.json", mode="w", encoding="utf-8") as authors_file:
    json.dump(authors, authors_file, ensure_ascii=False, indent=2)


Collected 37 authors from r/EDanonymemes


In [19]:
# Accumulators filled by the crawl loop:
#   user_subreddits: username -> list of subreddit names the user posted in
#   user_posts:      username -> {"submissions": [post + comment records]}
user_subreddits = dict()
user_posts = dict()

In [20]:
# Adjust the server load for ethical crawling
MAX_SUBMISSIONS_PER_USER = 50
EXPAND_MORE_COMMENTS = 8     # None = all (can explode), 0 = only already-loaded
MAX_COMMENTS_PER_POST = 300  # cap
PAUSE_BETWEEN_USERS = 1.0    # seconds, to be nice to the API

# For every collected author: list their recent submissions, remember which
# subreddits they posted in, and store a full post + comments record per post.
for i, username in enumerate(authors, 1):
    print(f"\n[{i}/{len(authors)}] Fetching for u/{username} ...")
    sub_list = []
    posts_list = []
    listing_completed = False
    try:
        redditor = reddit.redditor(username)
        # Iterate through this user's recent submissions
        for subm in redditor.submissions.new(limit=MAX_SUBMISSIONS_PER_USER):
            sname = str(subm.subreddit.display_name)
            if sname not in sub_list:
                sub_list.append(sname)

            # Build rich post + comments record
            try:
                post_data = fetch_submission_with_comments(
                    subm,
                    expand_more=EXPAND_MORE_COMMENTS,
                    max_comments=MAX_COMMENTS_PER_POST
                )
                posts_list.append(post_data)
            except (Forbidden, NotFound) as e:
                print(f"  ! Skipping a post (private/forbidden/not found): {e}")
            except (ServerError, ResponseException, RequestException) as e:
                print(f"  ! Transient error on a post: {e}")
                time.sleep(2)

        listing_completed = True

    except (Forbidden, NotFound) as e:
        print(f"Skipping u/{username}: {e}")
    except (ServerError, ResponseException, RequestException) as e:
        print(f"Transient error for u/{username}: {e}")
        time.sleep(2)
    finally:
        # Keep whatever was gathered even if the listing broke off part-way
        # (error or manual interrupt), so earlier requests are not wasted.
        # A user whose listing failed before yielding anything is still
        # left out of the result, as before.
        if listing_completed or sub_list or posts_list:
            user_subreddits[username] = sub_list
            user_posts[username] = {"submissions": posts_list}
        time.sleep(PAUSE_BETWEEN_USERS)




[1/37] Fetching for u/-mosaicaxolotl- ...

[2/37] Fetching for u/Ashamed_Ad8162 ...

[3/37] Fetching for u/BloomingInTheVoid ...

[4/37] Fetching for u/Correct_Fig_6198 ...

[5/37] Fetching for u/Entire_Weather3209 ...

[6/37] Fetching for u/GuilefulEyes ...

[7/37] Fetching for u/Impurest_Vessel ...

[8/37] Fetching for u/KlausMikaelsonsWife ...

[9/37] Fetching for u/LaaaaMaaaa ...

[10/37] Fetching for u/New-Desk1419 ...

[11/37] Fetching for u/No_Astronomer656 ...

[12/37] Fetching for u/Quirky-Reception7087 ...

[13/37] Fetching for u/Snap-Crackle-Plop- ...

[14/37] Fetching for u/Squidd_Vicious ...

[15/37] Fetching for u/WoebegoneWoodlouse ...

[16/37] Fetching for u/_AroAce_in_space_ ...

[17/37] Fetching for u/acoolrock ...

[18/37] Fetching for u/alexisseffy ...

[19/37] Fetching for u/aqua4cry ...

[20/37] Fetching for u/birb-jesus ...

[21/37] Fetching for u/cupidhurts ...

[22/37] Fetching for u/funkydyke ...

[23/37] Fetching for u/garje ...

[24/37] Fetching for u/hunkt

In [None]:
# Persist both result dictionaries as pretty-printed UTF-8 JSON
json_exports = (
    ("user_subreddits.json", user_subreddits),
    ("user_posts_with_comments.json", user_posts),
)
for json_path, payload in json_exports:
    with open(json_path, "w", encoding="utf-8") as out_file:
        json.dump(payload, out_file, ensure_ascii=False, indent=2)

print(
    f"\nSaved subreddit mapping for {len(user_subreddits)} users "
    f"and rich post+comment data to user_posts_with_comments.json"
)

In [None]:
# Flatten the crawl results into row dictionaries for three CSV exports:
# posts only, comments only, and one combined long table.
COMBINED_CSV = "crawl_combined.csv"
POSTS_CSV = "posts_only.csv"
COMMENTS_CSV = "comments_only.csv"

# Fixed column orders. Apart from "seed_user" (the user whose profile we
# crawled), every column is the stored record key with a "post_" or
# "comment_" prefix.
post_cols = [
    "seed_user",
    "post_id", "post_title", "post_is_self", "post_selftext",
    "post_url", "post_permalink", "post_subreddit", "post_author",
    "post_created_utc", "post_created_iso",
    "post_score", "post_upvote_ratio",
    "post_approx_upvotes", "post_approx_downvotes",
    "post_num_comments",
]
comment_cols = [
    "comment_id", "comment_author", "comment_body",
    "comment_created_utc", "comment_created_iso",
    "comment_score", "comment_parent_id", "comment_link_id",
]
# record_type = 'post' or 'comment'
combined_cols = ["record_type", *post_cols, *comment_cols]

combined_rows, post_rows, comment_rows = [], [], []

# Post rows in the combined table carry empty strings in the comment columns
blank_comment_fields = dict.fromkeys(comment_cols, "")

for seed_user, pdata in user_posts.items():
    for post in pdata.get("submissions", []):
        # ---- Post row: strip the "post_" prefix to find the stored key
        post_fields = {
            "seed_user": seed_user,
            **{
                col: post.get(col[len("post_"):])
                for col in post_cols
                if col != "seed_user"
            },
        }
        post_rows.append(post_fields)
        combined_rows.append(
            {"record_type": "post", **post_fields, **blank_comment_fields}
        )

        # ---- Comment rows for this post
        for comment in post.get("comments", []):
            # Strip the "comment_" prefix to find the stored key
            comment_fields = {
                col: comment.get(col[len("comment_"):]) for col in comment_cols
            }
            # The comments-only file keeps just the comment columns
            comment_rows.append(comment_fields)

            # The combined file repeats the post context next to each comment
            # (useful in 1-file analysis)
            combined_rows.append(
                {"record_type": "comment", **post_fields, **comment_fields}
            )

# CSV writer shared by all three exports
def write_csv(path, fieldnames, rows):
    """
    Write `rows` (an iterable of dicts) to `path` as UTF-8 CSV.

    A header line with `fieldnames` is written first. Keys of a row that are
    not listed in `fieldnames` are silently dropped.
    """
    with open(path, mode="w", newline="", encoding="utf-8") as out_file:
        dict_writer = csv.DictWriter(
            out_file, fieldnames=fieldnames, extrasaction="ignore"
        )
        dict_writer.writeheader()
        for record in rows:
            dict_writer.writerow(record)

# Write the exports in order: posts_only.csv, comments_only.csv, then
# crawl_combined.csv (posts + comments in one long table)
for csv_path, csv_columns, csv_rows in (
    (POSTS_CSV, post_cols, post_rows),
    (COMMENTS_CSV, comment_cols, comment_rows),
    (COMBINED_CSV, combined_cols, combined_rows),
):
    write_csv(csv_path, csv_columns, csv_rows)

print(f"CSV exports written:\n  - {COMBINED_CSV}\n  - {POSTS_CSV}\n  - {COMMENTS_CSV}")


CSV exports written:
  - crawl_combined.csv
  - posts_only.csv
  - comments_only.csv
