<a href="https://colab.research.google.com/github/danielmlow/tutorials/blob/main/text/reddit_download.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install praw

Collecting praw
  Downloading praw-7.8.1-py3-none-any.whl.metadata (9.4 kB)
Collecting prawcore<3,>=2.4 (from praw)
  Downloading prawcore-2.4.0-py3-none-any.whl.metadata (5.0 kB)
Collecting update_checker>=0.18 (from praw)
  Downloading update_checker-0.18.0-py3-none-any.whl.metadata (2.3 kB)
Collecting websocket-client>=0.54.0 (from praw)
  Downloading websocket_client-1.8.0-py3-none-any.whl.metadata (8.0 kB)
Collecting requests<3.0,>=2.6.0 (from prawcore<3,>=2.4->praw)
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting charset-normalizer<4,>=2 (from requests<3.0,>=2.6.0->prawcore<3,>=2.4->praw)
  Using cached charset_normalizer-3.4.1-cp310-cp310-macosx_10_9_universal2.whl.metadata (35 kB)
Collecting idna<4,>=2.5 (from requests<3.0,>=2.6.0->prawcore<3,>=2.4->praw)
  Using cached idna-3.10-py3-none-any.whl.metadata (10 kB)
Collecting urllib3<3,>=1.21.1 (from requests<3.0,>=2.6.0->prawcore<3,>=2.4->praw)
  Using cached urllib3-2.3.0-py3-none-any.whl.metadata (

Search for similar subreddits here:
https://anvaka.github.io/sayit/?query=GriefSupport

In [None]:
# Upper bound on the number of submissions collected per subreddit.
sample_size = 6_000

# Topic label -> subreddit name handed to the Reddit API (no "r/" prefix).
subreddits = dict(
    self_harm='selfharm',
    suicide='SuicideWatch',
    bully='bullying',
    abuse_physical='abusesurvivors',
    abuse_sexual='sexualassault',
    relationship='relationship_advice',
    bereavement='GriefSupport',
    isolated='lonely',
    anxiety='Anxiety',
    depressed='depression',
    gender='asktransgender',
    eating='EatingDisorders',
    substance='addiction',
)




In [None]:
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Optional

import pandas as pd
import praw

import api_keys

In [None]:
class RedditSampler:
    """Collect samples of submissions from subreddits through PRAW.

    For every subreddit the sampler walks the ``new``, ``top``,
    ``controversial`` and ``hot`` listings (in that order) until
    ``sample_size`` distinct submissions have been gathered or all listings
    have been consumed.  Submissions are de-duplicated by their id; when a
    submission appears in several listings, the record from the first listing
    that returned it is kept.
    """

    # Listings that accept a ``time_filter`` argument; ``new`` and ``hot`` are
    # requested without one, so they are fetched only once per subreddit.
    _TIME_FILTERED_SORTS = ('top', 'controversial')

    # Maximum number of items requested from a single listing call.
    _LISTING_LIMIT = 1000

    # Column order of the returned frame (also applied when nothing was collected).
    _COLUMNS = [
        'subreddit', 'id', 'title', 'author', 'created_utc', 'score',
        'upvote_ratio', 'num_comments', 'url', 'is_self', 'selftext',
        'sort_method', 'time_filter',
    ]

    def __init__(self, client_id: str, client_secret: str, user_agent: str):
        """Create the underlying ``praw.Reddit`` client from API credentials."""
        self.reddit = praw.Reddit(
            client_id=client_id,
            client_secret=client_secret,
            user_agent=user_agent
        )

    @staticmethod
    def _to_record(submission, subreddit_name: str, sort: str, time_filter: Optional[str]) -> dict:
        """Flatten one submission into a plain dict (one row of the output frame)."""
        # Convert the epoch timestamp in UTC, not in the machine's local zone,
        # so the value matches the ``created_utc`` column name.  The tzinfo is
        # dropped afterwards so the column stays a naive datetime as before.
        created = datetime.fromtimestamp(
            submission.created_utc, tz=timezone.utc
        ).replace(tzinfo=None)
        return {
            'subreddit': subreddit_name,
            'id': submission.id,
            'title': submission.title,
            'author': str(submission.author),
            'created_utc': created,
            'score': submission.score,
            'upvote_ratio': submission.upvote_ratio,
            'num_comments': submission.num_comments,
            'url': submission.url,
            'is_self': submission.is_self,
            'selftext': submission.selftext if submission.is_self else None,
            'sort_method': sort,
            'time_filter': time_filter,
        }

    def get_samples(
        self,
        subreddits: List[str],
        sample_size: int = 6000,
        sleep_amount: float = 0.1,
        submission_type: str = "all",
        listing_pause: float = 1.0
    ) -> pd.DataFrame:
        """Collect up to ``sample_size`` distinct submissions per subreddit.

        Args:
            subreddits: Subreddit names to sample from.
            sample_size: Maximum number of distinct submissions kept per subreddit.
            sleep_amount: Seconds to sleep after each newly collected submission.
            submission_type: ``"self"`` keeps only self posts, ``"link"`` keeps
                only link posts; any other value (e.g. ``"all"``) keeps both.
            listing_pause: Seconds to sleep after each listing has been consumed.

        Returns:
            A DataFrame with one row per distinct submission and the columns
            listed in ``_COLUMNS``.  ``time_filter`` is ``None`` for the
            ``new`` and ``hot`` listings.  The frame is empty (but still has
            these columns) when nothing could be collected.

        Errors raised while reading a listing or a subreddit are printed and
        skipped so that one failure does not abort the whole run.
        """
        all_submissions = []
        sorts = ['new', 'top', 'controversial', 'hot']
        time_filters = ['all', 'year', 'month', 'week']

        for subreddit_name in subreddits:
            try:
                subreddit = self.reddit.subreddit(subreddit_name)
                # Keyed by submission id: comparing whole records would treat
                # the same post as new whenever its score changed or it was
                # reached through a different listing.
                sub_samples = {}

                for sort in sorts:
                    if len(sub_samples) >= sample_size:
                        break

                    # new/hot take no time filter, so request them only once.
                    if sort in self._TIME_FILTERED_SORTS:
                        applicable_filters = time_filters
                    else:
                        applicable_filters = [None]

                    for time_filter in applicable_filters:
                        if len(sub_samples) >= sample_size:
                            break

                        label = sort if time_filter is None else f"{sort}/{time_filter}"

                        try:
                            listing = getattr(subreddit, sort)
                            if time_filter is None:
                                submissions = listing(limit=self._LISTING_LIMIT)
                            else:
                                submissions = listing(
                                    time_filter=time_filter,
                                    limit=self._LISTING_LIMIT
                                )

                            for submission in submissions:
                                if len(sub_samples) >= sample_size:
                                    break

                                if submission.id in sub_samples:
                                    continue  # already collected from an earlier listing

                                if submission_type == "self" and not submission.is_self:
                                    continue
                                if submission_type == "link" and submission.is_self:
                                    continue

                                sub_samples[submission.id] = self._to_record(
                                    submission, subreddit_name, sort, time_filter
                                )
                                time.sleep(sleep_amount)

                            print(f"Collected {len(sub_samples)} samples from r/{subreddit_name} ({label})")
                            time.sleep(listing_pause)

                        except Exception as e:
                            # Best effort: keep what was gathered so far and
                            # move on to the next listing.
                            print(f"Error with {label}: {str(e)}")
                            continue

                # The size check above runs before every insert, so the dict
                # never holds more than sample_size records.
                all_submissions.extend(sub_samples.values())

            except Exception as e:
                print(f"Error collecting from r/{subreddit_name}: {str(e)}")
                continue

        return pd.DataFrame(all_submissions, columns=self._COLUMNS)

In [None]:
# Leave out the first two entries of `subreddits`; only the rest is sampled here.
subreddits_subset = [*subreddits.values()][2:]

# Example usage:
if __name__ == "__main__":
    # Credentials are read from the local `api_keys` module
    # (obtained from your Reddit API application).
    CLIENT_ID = api_keys.reddit_client_id
    CLIENT_SECRET = api_keys.reddit_secret
    USER_AGENT = f"script:data_sampler:v1.0 (by /u/{api_keys.reddit_username})"

    sampler = RedditSampler(
        client_id=CLIENT_ID,
        client_secret=CLIENT_SECRET,
        user_agent=USER_AGENT,
    )

    # Collect up to `sample_size` submissions (self and link posts alike)
    # from each selected subreddit.
    samples_df = sampler.get_samples(
        subreddits_subset,
        sample_size,
        submission_type="all",
    )

Collected 970 samples from r/bullying (new/all)
Collected 1449 samples from r/bullying (new/year)
Collected 1793 samples from r/bullying (new/month)
Collected 2085 samples from r/bullying (new/week)
Collected 3085 samples from r/bullying (top/all)
Collected 4062 samples from r/bullying (top/year)
Collected 4264 samples from r/bullying (top/month)
Collected 4295 samples from r/bullying (top/week)
Collected 5292 samples from r/bullying (controversial/all)
Collected 6000 samples from r/bullying (controversial/year)
Collected 973 samples from r/abusesurvivors (new/all)
Collected 1364 samples from r/abusesurvivors (new/year)
Collected 1652 samples from r/abusesurvivors (new/month)
Collected 1868 samples from r/abusesurvivors (new/week)
Collected 2866 samples from r/abusesurvivors (top/all)
Collected 3850 samples from r/abusesurvivors (top/year)
Collected 4000 samples from r/abusesurvivors (top/month)
Collected 4038 samples from r/abusesurvivors (top/week)
Collected 5037 samples from r/abuse

In [None]:
# Display the collected samples.
samples_df

Unnamed: 0,subreddit,id,title,author,created_utc,score,upvote_ratio,num_comments,url,is_self,selftext,sort_method,time_filter
0,bullying,pov7nf,I was bullied in high school and to this day n...,Maroshne,2021-09-15 13:51:46,17,1.00,10,https://www.reddit.com/r/bullying/comments/pov...,True,In high school I was bullied and decided to hi...,top,all
1,bullying,1dy9ut1,I was a bully in school/college,CameraSubject1653,2024-07-08 10:09:08,4,0.67,9,https://www.reddit.com/r/bullying/comments/1dy...,True,I used to bully a few people in school/college...,controversial,all
2,bullying,1gd0fv1,Anyone else hate it when people/former classma...,Turbulent_Poem6,2024-10-26 21:49:45,15,0.94,6,https://www.reddit.com/r/bullying/comments/1gd...,True,I got bullied from plenty of people from my ol...,new,
3,bullying,1ijplzm,help,EfficiencySeparate39,2025-02-07 02:31:13,2,0.75,7,https://www.reddit.com/r/bullying/comments/1ij...,True,"People call me short everyday, and I am called...",top,month
4,bullying,1hmie3z,Family Member as Your Biggest Bully,erain4062,2024-12-26 01:11:02,7,1.00,4,https://www.reddit.com/r/bullying/comments/1hm...,True,I’ve been fortunate to have kind and compassio...,new,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
54433,addiction,1iiaoto,20M I Broke My Girlfriend's 20F Trust by Relap...,ThrowRA_Tree3711,2025-02-05 09:06:41,0,0.50,2,https://www.reddit.com/r/addiction/comments/1i...,True,"Hi, this is a throwaway acount. I (20M) need s...",new,
54434,addiction,1cm32qv,What counts as feeding a porn addiction,,2024-05-07 00:11:43,0,0.50,4,https://www.reddit.com/r/addiction/comments/1c...,True,I’m trying to quite using porn but I still wan...,controversial,all
54435,addiction,1iiptyu,Starting a Sober Living House in Indy,Known_Moment5558,2025-02-05 19:42:19,1,1.00,1,https://www.reddit.com/r/addiction/comments/1i...,True,"Hello All,\n\nI want to make a house I bought ...",new,
54436,addiction,1iak5hu,Struggling today,SuperbMixt,2025-01-26 12:16:04,1,1.00,4,https://www.reddit.com/r/addiction/comments/1i...,True,About 4 weeks abstaining. Things really kicked...,top,month


In [None]:
from datetime import datetime

# Timestamp pattern for tagging the output file name, e.g. '25-02-21T10-37-52'
# (two-digit year, dashes instead of colons).  Named DATE_FORMAT rather than
# `format` so the builtin format() is not shadowed for the rest of the session.
DATE_FORMAT = '%y-%m-%dT%H-%M-%S'

# now
now = datetime.now()
date_string = now.strftime(DATE_FORMAT)
date_string

'25-02-21T10-37-52'

In [None]:
# Number of collected rows per subreddit.
samples_df['subreddit'].value_counts()

subreddit
bullying               6000
abusesurvivors         6000
sexualassault          6000
relationship_advice    6000
depression             6000
asktransgender         6000
EatingDisorders        6000
addiction              6000
Anxiety                3287
GriefSupport           3151
Name: count, dtype: int64

In [None]:
# Combine title and body into one text column, separated by a '---' line.
# NOTE(review): rows with a missing `selftext` end up with a missing
# `title_text`, because concatenating with a missing value yields a missing value.
samples_df['title_text'] = samples_df['title']+"\n---\n"+samples_df['selftext']

# Save to CSV
output_path = Path(f"data/input/reddit_10_mental_health_{date_string}_incomplete.csv")
# Create the target directory first; to_csv does not create missing folders.
output_path.parent.mkdir(parents=True, exist_ok=True)
samples_df.to_csv(output_path, index=False)
# Report the file that was actually written.
print(f"Saved {len(samples_df)} total samples to {output_path}")

Saved 54438 total samples to reddit_samples.csv
