In [2]:
# !pip install praw

Search for similar reddits here:
https://anvaka.github.io/sayit/?query=GriefSupport

In [2]:
sample_size = 6000

subreddits = {
'self_harm': 'selfharm',
'suicide': 'SuicideWatch',
'bully': 'bullying',
'abuse_physical': 'abusesurvivors',
'abuse_sexual': 'sexualassault',
'relationship': 'relationship_advice',
'bereavement': 'GriefSupport',
'isolated': 'lonely',
'anxiety': 'Anxiety',
'depressed': 'depression',
'gender': 'askLGBT', # chosen over LGBT because the latter is more memes support seeking
'eating': 'EatingDisorders',
'substance': 'addiction'
}




In [3]:
import praw
import pandas as pd
from datetime import datetime, timezone
import time
from typing import List, Optional
import api_keys

In [59]:
subreddits = {'self_harm': 'selfharm',
 'suicide': 'SuicideWatch',
 'bully': 'bullying',
 'abuse_physical': 'abusesurvivors',
 'abuse_sexual': 'sexualassault',
 'relationship': 'relationship_advice',
 'bereavement': 'GriefSupport',
 'isolated': 'lonely',
 'anxiety': 'Anxiety',
 'depressed': 'depression',
 'gender': 'AskLGBT',
 'eating': 'EatingDisorders',
 'substance': 'addiction'}

In [75]:
datetime = 1

In [80]:
import pandas as pd
import praw
import time
import random
from datetime import datetime as dt  # Import as dt to avoid confusion
from typing import List, Dict, Set, Optional

class RedditSampler:
    def __init__(self, client_id: str, client_secret: str, user_agent: str):
        self.reddit = praw.Reddit(
            client_id=client_id,
            client_secret=client_secret,
            user_agent=user_agent
        )

    def get_samples(
        self,
        subreddits: List[str],
        sample_size: int = 6000,
        sleep_amount: float = 0.1,
        submission_type: str = "all",
        existing_data: Optional[pd.DataFrame] = None
    ) -> pd.DataFrame:
        all_submissions = []
        
        # Create a dictionary to track existing post IDs per subreddit
        existing_ids = {}
        if existing_data is not None:
            for subreddit in subreddits:
                # Get all post IDs for this subreddit from existing data
                sub_data = existing_data[existing_data['subreddit'] == subreddit]
                existing_ids[subreddit] = set(sub_data['id'].unique())
                
                # Print stats about existing data
                print(f"Found {len(existing_ids[subreddit])} existing posts for r/{subreddit}")
        else:
            # Initialize empty sets if no existing data
            for subreddit in subreddits:
                existing_ids[subreddit] = set()
        
        # Sort methods prioritized to get more varied content
        sorts = ['top', 'controversial', 'hot', 'new']  # Changed order to prefer older posts
        time_filters = ['all', 'year', 'month', 'week', 'day']  # Added 'day' for more granularity
        
        # For each subreddit, track which post IDs we've already tried to use as "before" markers
        # to avoid getting stuck in the same posts
        used_before_ids = {subreddit: set() for subreddit in subreddits}
        
        for subreddit_name in subreddits:
            try:
                # Calculate how many more posts we need
                needed_posts = sample_size - len(existing_ids[subreddit_name])
                
                if needed_posts <= 0:
                    print(f"Already have enough posts for r/{subreddit_name}, skipping")
                    # Add existing posts to output
                    if existing_data is not None:
                        sub_data = existing_data[existing_data['subreddit'] == subreddit_name]
                        all_submissions.extend(sub_data.to_dict('records'))
                    continue
                
                print(f"Need to collect {needed_posts} more posts for r/{subreddit_name}")
                
                subreddit = self.reddit.subreddit(subreddit_name)
                sub_samples = set()  # Using set to avoid duplicates
                
                for sort in sorts:
                    if len(sub_samples) >= needed_posts:
                        break
                        
                    for time_filter in time_filters:
                        if len(sub_samples) >= needed_posts:
                            break
                            
                        try:
                            # Different handling based on sort method
                            if sort == 'new':
                                submissions = subreddit.new(limit=1000)
                            elif sort == 'hot':
                                submissions = subreddit.hot(limit=1000)
                            else:
                                # Make sure the method exists before trying to use it
                                method = getattr(subreddit, sort, None)
                                if method is None:
                                    print(f"Warning: Method '{sort}' not found on subreddit object")
                                    continue
                                    
                                # Use before_id parameter to get older posts if available in existing_ids
                                if existing_ids[subreddit_name] and len(existing_ids[subreddit_name]) > 0:
                                    # Get potential IDs from existing data, filtering out ones we've already tried
                                    candidate_ids = [id for id in existing_ids[subreddit_name] 
                                                   if id not in used_before_ids[subreddit_name]]
                                    
                                    # If we have no unused IDs, reset and try again
                                    if not candidate_ids:
                                        print(f"Resetting used before IDs for {subreddit_name}")
                                        used_before_ids[subreddit_name].clear()
                                        candidate_ids = list(existing_ids[subreddit_name])
                                    
                                    # Select a random ID to use as the "before" parameter
                                    random_id = random.choice(candidate_ids)
                                    used_before_ids[subreddit_name].add(random_id)
                                    
                                    try:
                                        # For new posts, we use "before" parameter
                                        if sort == 'new':
                                            submissions = subreddit.new(limit=1000, before=f"t3_{random_id}")
                                        # For other sorting methods, we try using "before"
                                        else:
                                            submissions = method(time_filter=time_filter, limit=1000, params={"before": f"t3_{random_id}"})
                                        
                                        print(f"Using before_id={random_id} to get older posts for {sort}/{time_filter}")
                                    except Exception as e:
                                        print(f"Error using before parameter: {str(e)}")
                                        # Fall back to standard method
                                        submissions = method(time_filter=time_filter, limit=1000)
                                else:
                                    submissions = method(time_filter=time_filter, limit=1000)
                            
                            for submission in submissions:
                                                # Skip if we already have this post
                                if submission.id in existing_ids[subreddit_name]:
                                    # Print debug info occasionally 
                                    if random.random() < 0.01:  # 1% chance to print
                                        print(f"Skipping already seen post {submission.id}")
                                    continue
                                    
                                if len(sub_samples) >= needed_posts:
                                    break
                                    
                                if submission_type != "all":
                                    if submission_type == "self" and not submission.is_self:
                                        continue
                                    if submission_type == "link" and submission.is_self:
                                        continue
                                
                                # Create dictionary with submission data
                                sub_dict = {
                                    'subreddit': subreddit_name,
                                    'id': submission.id,
                                    'title': submission.title,
                                    'author': str(submission.author),
                                    'created_utc': dt.fromtimestamp(submission.created_utc),
                                    'score': submission.score,
                                    'upvote_ratio': submission.upvote_ratio,
                                    'num_comments': submission.num_comments,
                                    'url': submission.url,
                                    'is_self': submission.is_self,
                                    'selftext': submission.selftext if submission.is_self else None,
                                    'sort_method': sort,
                                    'time_filter': time_filter if sort != 'new' and sort != 'hot' else None
                                }
                                
                                # Add to our collection and track the ID
                                sub_dict_tuple = tuple(sorted(sub_dict.items()))
                                sub_samples.add(sub_dict_tuple)
                                existing_ids[subreddit_name].add(submission.id)
                                
                                time.sleep(sleep_amount)
                                
                            print(f"Collected {len(sub_samples)} samples from r/{subreddit_name} ({sort}/{time_filter})")
                            time.sleep(1)
                            
                        except Exception as e:
                            print(f"Error with {sort}/{time_filter}: {str(e)}")
                            continue
                
                # Convert back to list of dicts
                sub_samples_list = [dict(items) for items in sub_samples]
                
                # Add to all submissions
                all_submissions.extend(sub_samples_list)
                
                # Also add existing submissions for this subreddit if provided
                if existing_data is not None:
                    sub_existing = existing_data[existing_data['subreddit'] == subreddit_name]
                    all_submissions.extend(sub_existing.to_dict('records'))
                
                # If we couldn't get any new samples but we need more, try a different approach
                if len(sub_samples) == 0 and needed_posts > 0:
                    print(f"Couldn't get any new posts for {subreddit_name}, trying direct PRAW approach...")
                    
                    # Try getting posts with specific PRAW parameters
                    try:
                        # This approach gets posts directly by time period (newest to oldest)
                        for days_ago in range(30, 365, 30):  # Try different time periods
                            if len(sub_samples) >= needed_posts:
                                break
                                
                            end_time = dt.now().timestamp() - (days_ago * 24 * 60 * 60)
                            start_time = end_time - (30 * 24 * 60 * 60)
                            
                            print(f"Trying time period: {days_ago-30} to {days_ago} days ago")
                            
                            # Use PRAW search to find posts in that time period
                            search_results = subreddit.search(
                                "timestamp:{}..{}".format(int(start_time), int(end_time)),
                                sort="new", 
                                limit=200
                            )
                            
                            for submission in search_results:
                                if submission.id in existing_ids[subreddit_name]:
                                    continue
                                    
                                if len(sub_samples) >= needed_posts:
                                    break
                                
                                if submission_type != "all":
                                    if submission_type == "self" and not submission.is_self:
                                        continue
                                    if submission_type == "link" and submission.is_self:
                                        continue
                                
                                # Create dictionary with submission data
                                sub_dict = {
                                    'subreddit': subreddit_name,
                                    'id': submission.id,
                                    'title': submission.title,
                                    'author': str(submission.author),
                                    'created_utc': dt.fromtimestamp(submission.created_utc),
                                    'score': submission.score,
                                    'upvote_ratio': submission.upvote_ratio,
                                    'num_comments': submission.num_comments,
                                    'url': submission.url,
                                    'is_self': submission.is_self,
                                    'selftext': submission.selftext if submission.is_self else None,
                                    'sort_method': 'search',
                                    'time_filter': f"{days_ago-30}-{days_ago}_days_ago"
                                }
                                
                                # Add to our collection and track the ID
                                sub_dict_tuple = tuple(sorted(sub_dict.items()))
                                sub_samples.add(sub_dict_tuple)
                                existing_ids[subreddit_name].add(submission.id)
                                
                                time.sleep(sleep_amount)
                            
                            print(f"Found {len(sub_samples)} total samples after search")
                            time.sleep(1)
                    
                    except Exception as e:
                        print(f"Error with search approach: {str(e)}")
                    
                    # Convert back to list of dicts and add to all submissions
                    additional_samples = [dict(items) for items in sub_samples]
                    all_submissions.extend(additional_samples)
                
            except Exception as e:
                print(f"Error collecting from r/{subreddit_name}: {str(e)}")
                continue
        
        # Return as dataframe and deduplicate by ID
        result_df = pd.DataFrame(all_submissions)
        if not result_df.empty:
            result_df = result_df.drop_duplicates(subset=['id'])
            
            # Print final stats
            for subreddit in subreddits:
                sub_count = len(result_df[result_df['subreddit'] == subreddit])
                print(f"Final count for r/{subreddit}: {sub_count} posts")
                
            return result_df
        else:
            return pd.DataFrame()


In [None]:



import api_keys

# You'll need to get these from your Reddit API application
CLIENT_ID = api_keys.reddit_client_id
CLIENT_SECRET = api_keys.reddit_secret
USER_AGENT = f"script:data_sampler:v1.0 (by /u/{api_keys.reddit_username})"

sampler = RedditSampler(CLIENT_ID, CLIENT_SECRET, USER_AGENT)

# List of subreddits
subreddits_subset = list(subreddits.values())[2:]
sample_size = 6000

# Load existing data if available
try:
    existing_df = pd.read_csv("existing_reddit_data.csv")
    print(f"Loaded existing data with {len(existing_df)} rows")
except FileNotFoundError:
    existing_df = None
    print("No existing data found, starting fresh")

# Get samples with existing data to avoid duplicates
samples_df = sampler.get_samples(
    subreddits=subreddits_subset,
    sample_size=sample_size,
    submission_type="all",
    existing_data=df3.copy()
)

In [66]:
# concat dfs

df4 = pd.concat([df3, samples_df_new], ignore_index=True)
df4.drop_duplicates(subset=['id'], inplace=True)
df4['subreddit'].value_counts()

subreddit
SuicideWatch           6164
relationship_advice    6059
depression             5881
selfharm               5110
GriefSupport           4800
lonely                 4800
Anxiety                4800
addiction              4800
sexualassault          4696
AskLGBT                4214
EatingDisorders        4013
abusesurvivors         3159
bullying               3066
Name: count, dtype: int64

In [None]:
df = df4.copy()


def clean_reddit(df):

    # df['title'] = df['title'].str.replace('nan', '', regex=False)
    
    df.loc[df['selftext'] == 'None', 'selftext'] = ''
    df.loc[df['selftext'] == 'nan', 'selftext'] = ''
    
    df['title_text'] = df['title'].astype(str)+" \n---\n "+df['selftext'].astype(str)
    df['title_text'] = df['title_text'].str.replace(' \n---\n nan', '')
    df['title_text'] = df['title_text'].str.replace(' \n---\n None', '')
    df['title_text'] = df['title_text'].str.replace('None \n---\n ', '')
    display(df[df['title_text'].str.contains(' \n---\n nan')][['title', 'selftext', 'title_text']])

    # - Make sure title_text was combined correctly (it seems if body was NaN then the combination is NaN, which is incorrect)

    display(df[df['title_text'].str.contains('nan ')][['title', 'selftext', 'title_text']])
    display(df[df['title_text'].str.contains(' nan')][['title', 'selftext', 'title_text']])

    # remove extremely short samples

    from construct_tracker.utils import word_count

    df['word_count'] = word_count.word_count(df['title_text'].to_list())

    df = df[df['word_count']>=5]
    df = df[df['title']!='[ Removed by Reddit ]']
    df = df.drop_duplicates(subset=['title_text'])
    return df

df = clean_reddit(df)


In [79]:
df['subreddit'].value_counts()

subreddit
SuicideWatch           6164
relationship_advice    6059
depression             5881
selfharm               5110
lonely                 4795
Anxiety                4792
addiction              4728
sexualassault          4692
GriefSupport           4658
AskLGBT                4204
EatingDisorders        4007
abusesurvivors         3127
bullying               2994
Name: count, dtype: int64

In [72]:
# Save the results
import datetime

current_timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
df.to_csv(f"data/input/updated_reddit_data_{current_timestamp}.csv", index=False)
# print(f"Saved {len(samples_df_new)} posts to updated_reddit_data.csv")

In [82]:
sampler = RedditSampler(CLIENT_ID, CLIENT_SECRET, USER_AGENT)

# List of subreddits
subreddits_to_merge = {
    "workplace_bullying": "bullying",
    "domesticviolence": "abusesurvivors"
}

sample_size = 1800

# Load existing data if available

# Get samples with existing data to avoid duplicates
samples_df2 = sampler.get_samples(
    subreddits=list(subreddits_to_merge.keys()),
    sample_size=sample_size,
    submission_type="all",
    existing_data=None
)


Need to collect 1800 more posts for r/workplace_bullying
Collected 1000 samples from r/workplace_bullying (top/all)
Using before_id=1dmx96m to get older posts for top/year
Skipping already seen post 1h7k5ou
Collected 1000 samples from r/workplace_bullying (top/year)
Using before_id=1ii9bov to get older posts for top/month
Collected 1000 samples from r/workplace_bullying (top/month)
Using before_id=1haean3 to get older posts for top/week
Collected 1000 samples from r/workplace_bullying (top/week)
Using before_id=1ggbh8u to get older posts for top/day
Collected 1000 samples from r/workplace_bullying (top/day)
Using before_id=1hdctsx to get older posts for controversial/all
Skipping already seen post 1dgz7s8
Collected 1067 samples from r/workplace_bullying (controversial/all)
Using before_id=1iivr9s to get older posts for controversial/year
Collected 1067 samples from r/workplace_bullying (controversial/year)
Using before_id=1h86uj1 to get older posts for controversial/month
Collected 106

In [83]:
sampler = RedditSampler(CLIENT_ID, CLIENT_SECRET, USER_AGENT)

# List of subreddits
subreddits_to_merge = {
    "asktransgender": "AskLGBT",
    "EDAnonymous": "EatingDisorders",
}

sample_size = 800

# Load existing data if available

# Get samples with existing data to avoid duplicates
samples_df3 = sampler.get_samples(
    subreddits=list(subreddits_to_merge.keys()),
    sample_size=sample_size,
    submission_type="all",
    existing_data=None
)
    


Need to collect 800 more posts for r/asktransgender
Collected 800 samples from r/asktransgender (top/all)
Need to collect 800 more posts for r/EDAnonymous
Collected 800 samples from r/EDAnonymous (top/all)
Final count for r/asktransgender: 800 posts
Final count for r/EDAnonymous: 800 posts


In [86]:
# concat DFs

df5 = pd.concat([df, samples_df2, samples_df3], ignore_index=True)
df5 = clean_reddit(df5)



Unnamed: 0,title,selftext,title_text


Unnamed: 0,title,selftext,title_text
14086,I lost my mum and the. My nan 20 days later an...,"the title is as is, I (F20) feel stupid becaus...",I lost my mum and the. My nan 20 days later an...
58775,"Mother’s Day approaching, me and my nan are mi...","So it’s my first Mother’s Day without my mum, ...","Mother’s Day approaching, me and my nan are mi..."


Unnamed: 0,title,selftext,title_text
2415,Circa summer 2016 I overheard a young nanny te...,,Circa summer 2016 I overheard a young nanny te...
3155,I was SA'd by my nanny as a child,\nNeed advice/Rant! \n\nI don't know why I'm p...,I was SA'd by my nanny as a child \n---\n \nNe...
4606,I am a nanny and need advice on an abusive fam...,"Hey everyone, I‘ve been nying for a family for...",I am a nanny and need advice on an abusive fam...
14086,I lost my mum and the. My nan 20 days later an...,"the title is as is, I (F20) feel stupid becaus...",I lost my mum and the. My nan 20 days later an...
14277,my beautiful nana passed away tonight,,my beautiful nana passed away tonight
14645,My nana passed and I don’t feel supported by t...,So my a passed a few weeks ago and it was a to...,My nana passed and I don’t feel supported by t...
16171,"My daughter was with our nanny and choked, and...",I’m so sad everyday since this has happened. I...,"My daughter was with our nanny and choked, and..."
58775,"Mother’s Day approaching, me and my nan are mi...","So it’s my first Mother’s Day without my mum, ...","Mother’s Day approaching, me and my nan are mi..."


In [90]:
sampler = RedditSampler(CLIENT_ID, CLIENT_SECRET, USER_AGENT)

# List of subreddits
subreddits_to_merge = {
    "cyberbullying": "bullying",
}

sample_size = 600

# Load existing data if available

# Get samples with existing data to avoid duplicates
samples_df3 = sampler.get_samples(
    subreddits=list(subreddits_to_merge.keys()),
    sample_size=sample_size,
    submission_type="all",
    existing_data=None
)
    


Need to collect 600 more posts for r/cyberbullying
Collected 600 samples from r/cyberbullying (top/all)
Final count for r/cyberbullying: 600 posts


In [91]:
df6 = pd.concat([df5, samples_df3], ignore_index=True)
df6 = clean_reddit(df6)

Unnamed: 0,title,selftext,title_text


Unnamed: 0,title,selftext,title_text
14086,I lost my mum and the. My nan 20 days later an...,"the title is as is, I (F20) feel stupid becaus...",I lost my mum and the. My nan 20 days later an...
58775,"Mother’s Day approaching, me and my nan are mi...","So it’s my first Mother’s Day without my mum, ...","Mother’s Day approaching, me and my nan are mi..."


Unnamed: 0,title,selftext,title_text
2415,Circa summer 2016 I overheard a young nanny te...,,Circa summer 2016 I overheard a young nanny te...
3155,I was SA'd by my nanny as a child,\nNeed advice/Rant! \n\nI don't know why I'm p...,I was SA'd by my nanny as a child \n---\n \nNe...
4606,I am a nanny and need advice on an abusive fam...,"Hey everyone, I‘ve been nying for a family for...",I am a nanny and need advice on an abusive fam...
14086,I lost my mum and the. My nan 20 days later an...,"the title is as is, I (F20) feel stupid becaus...",I lost my mum and the. My nan 20 days later an...
14277,my beautiful nana passed away tonight,,my beautiful nana passed away tonight
14645,My nana passed and I don’t feel supported by t...,So my a passed a few weeks ago and it was a to...,My nana passed and I don’t feel supported by t...
16171,"My daughter was with our nanny and choked, and...",I’m so sad everyday since this has happened. I...,"My daughter was with our nanny and choked, and..."
58775,"Mother’s Day approaching, me and my nan are mi...","So it’s my first Mother’s Day without my mum, ...","Mother’s Day approaching, me and my nan are mi..."


In [93]:
# replace subreddit names using the dictionary, where values should be replaced by their keys
# List of subreddits
subreddits_to_merge = {
    "workplace_bullying": "bullying",
    "domesticviolence": "abusesurvivors",
    'asktransgender': 'AskLGBT', 
    'EDAnonymous': 'EatingDisorders',
    "cyberbullying": "bullying",
}
df6['subreddit'] = df6['subreddit'].replace(subreddits_to_merge)

df6['subreddit'].value_counts()

subreddit
SuicideWatch           6164
relationship_advice    6059
depression             5881
selfharm               5110
AskLGBT                5004
bullying               4984
abusesurvivors         4920
EatingDisorders        4802
lonely                 4795
Anxiety                4792
addiction              4728
sexualassault          4692
GriefSupport           4658
Name: count, dtype: int64

In [104]:
df6 = clean_reddit(df6)

Unnamed: 0,title,selftext,title_text


Unnamed: 0,title,selftext,title_text
14086,I lost my mum and the. My nan 20 days later an...,"the title is as is, I (F20) feel stupid becaus...",I lost my mum and the. My nan 20 days later an...
58775,"Mother’s Day approaching, me and my nan are mi...","So it’s my first Mother’s Day without my mum, ...","Mother’s Day approaching, me and my nan are mi..."


Unnamed: 0,title,selftext,title_text
2415,Circa summer 2016 I overheard a young nanny te...,,Circa summer 2016 I overheard a young nanny te...
3155,I was SA'd by my nanny as a child,\nNeed advice/Rant! \n\nI don't know why I'm p...,I was SA'd by my nanny as a child \n---\n \nNe...
4606,I am a nanny and need advice on an abusive fam...,"Hey everyone, I‘ve been nying for a family for...",I am a nanny and need advice on an abusive fam...
14086,I lost my mum and the. My nan 20 days later an...,"the title is as is, I (F20) feel stupid becaus...",I lost my mum and the. My nan 20 days later an...
14277,my beautiful nana passed away tonight,,my beautiful nana passed away tonight
14645,My nana passed and I don’t feel supported by t...,So my a passed a few weeks ago and it was a to...,My nana passed and I don’t feel supported by t...
16171,"My daughter was with our nanny and choked, and...",I’m so sad everyday since this has happened. I...,"My daughter was with our nanny and choked, and..."
58775,"Mother’s Day approaching, me and my nan are mi...","So it’s my first Mother’s Day without my mum, ...","Mother’s Day approaching, me and my nan are mi..."


In [105]:
df6[df6['title_text'].str.contains(' \n---\n None')][['title', 'selftext', 'title_text']]

Unnamed: 0,title,selftext,title_text


In [None]:
df6

In [109]:
df6['subreddit'].value_counts()

subreddit
SuicideWatch           6164
relationship_advice    6059
depression             5881
selfharm               5110
AskLGBT                5004
bullying               4936
abusesurvivors         4907
lonely                 4795
EatingDisorders        4795
Anxiety                4788
addiction              4705
sexualassault          4691
GriefSupport           4564
Name: count, dtype: int64

In [110]:
sampler = RedditSampler(CLIENT_ID, CLIENT_SECRET, USER_AGENT)

# List of subreddits
subreddits_to_merge = {
    "grief": "bullying",
}

sample_size = 300

# Load existing data if available

# Get samples with existing data to avoid duplicates
samples_df4 = sampler.get_samples(
    subreddits=list(subreddits_to_merge.keys()),
    sample_size=sample_size,
    submission_type="all",
    existing_data=None
)
    
df6 = pd.concat([df6, samples_df4], ignore_index=True)
df6 = clean_reddit(df6)

# replace subreddit names using the dictionary, where values should be replaced by their keys
# List of subreddits
subreddits_to_merge = {
    "workplace_bullying": "bullying",
    "domesticviolence": "abusesurvivors",
    'asktransgender': 'AskLGBT', 
    'EDAnonymous': 'EatingDisorders',
    "cyberbullying": "bullying",
}
df6['subreddit'] = df6['subreddit'].replace(subreddits_to_merge)

df6['subreddit'].value_counts()

Need to collect 300 more posts for r/grief
Collected 300 samples from r/grief (top/all)
Final count for r/grief: 300 posts


Unnamed: 0,title,selftext,title_text


Unnamed: 0,title,selftext,title_text
14086,I lost my mum and the. My nan 20 days later an...,"the title is as is, I (F20) feel stupid becaus...",I lost my mum and the. My nan 20 days later an...
58686,"Mother’s Day approaching, me and my nan are mi...","So it’s my first Mother’s Day without my mum, ...","Mother’s Day approaching, me and my nan are mi..."


Unnamed: 0,title,selftext,title_text
2415,Circa summer 2016 I overheard a young nanny te...,,Circa summer 2016 I overheard a young nanny te...
3155,I was SA'd by my nanny as a child,\nNeed advice/Rant! \n\nI don't know why I'm p...,I was SA'd by my nanny as a child \n---\n \nNe...
4606,I am a nanny and need advice on an abusive fam...,"Hey everyone, I‘ve been nying for a family for...",I am a nanny and need advice on an abusive fam...
14086,I lost my mum and the. My nan 20 days later an...,"the title is as is, I (F20) feel stupid becaus...",I lost my mum and the. My nan 20 days later an...
14277,my beautiful nana passed away tonight,,my beautiful nana passed away tonight
14645,My nana passed and I don’t feel supported by t...,So my a passed a few weeks ago and it was a to...,My nana passed and I don’t feel supported by t...
16171,"My daughter was with our nanny and choked, and...",I’m so sad everyday since this has happened. I...,"My daughter was with our nanny and choked, and..."
58686,"Mother’s Day approaching, me and my nan are mi...","So it’s my first Mother’s Day without my mum, ...","Mother’s Day approaching, me and my nan are mi..."
66617,my bird died last night and i didn’t get to sa...,,my bird died last night and i didn’t get to sa...


subreddit
SuicideWatch           6164
relationship_advice    6059
depression             5881
selfharm               5110
AskLGBT                5004
bullying               4936
abusesurvivors         4907
lonely                 4795
EatingDisorders        4795
Anxiety                4788
addiction              4705
sexualassault          4691
GriefSupport           4564
grief                   277
Name: count, dtype: int64

In [None]:
df6['subtype'].value_counts()

In [111]:
subreddits_to_merge = {
    "workplace_bullying": "bullying",
    "domesticviolence": "abusesurvivors",
    'asktransgender': 'AskLGBT', 
    'EDAnonymous': 'EatingDisorders',
    "cyberbullying": "bullying",
    "grief": "GriefSupport"
}
df6['subreddit'] = df6['subreddit'].replace(subreddits_to_merge)

df6['subreddit'].value_counts()

subreddit
SuicideWatch           6164
relationship_advice    6059
depression             5881
selfharm               5110
AskLGBT                5004
bullying               4936
abusesurvivors         4907
GriefSupport           4841
lonely                 4795
EatingDisorders        4795
Anxiety                4788
addiction              4705
sexualassault          4691
Name: count, dtype: int64

In [113]:
df6.reset_index(drop=True, inplace=True)


# current_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

df6.to_csv(f'data/input/reddit_13_mental_health_4691-6164-posts_{current_timestamp}.csv', index=False)

# Remove some duplicate documents

In [10]:
import pandas as pd
current_timestamp = '20250311_123431'
results_df = pd.read_csv(f'data/input/reddit_13_mental_health_4691-6164-posts_{current_timestamp}.csv')
results_df['subreddit'].value_counts()


subreddit
SuicideWatch           6164
relationship_advice    6059
depression             5881
selfharm               5110
AskLGBT                5004
bullying               4936
abusesurvivors         4907
GriefSupport           4841
lonely                 4795
EatingDisorders        4795
Anxiety                4788
addiction              4705
sexualassault          4691
Name: count, dtype: int64

In [12]:
results_df.drop_duplicates(subset=['title_text'], inplace=True)
results_df = results_df[results_df['title']!='[ Removed by Reddit ]']
results_df['subreddit'].value_counts()

subreddit
SuicideWatch           6130
relationship_advice    6057
depression             5862
selfharm               5105
AskLGBT                4998
bullying               4923
abusesurvivors         4902
GriefSupport           4830
EatingDisorders        4789
lonely                 4787
Anxiety                4785
addiction              4701
sexualassault          4677
Name: count, dtype: int64

In [14]:
results_df.to_csv(f'data/input/reddit_13_mental_health_4677-6130-posts_{current_timestamp}.csv', index=False)

# make sample of 4600

In [15]:
# subsample to 2800 samples per subreddit

df = results_df.copy()

n = 4600

# Create an empty dataframe to store the result
result_df = pd.DataFrame()

# Get value counts of subreddits
subreddit_counts = df['subreddit'].value_counts()

# For each subreddit
for subreddit, count in subreddit_counts.items():
    # Get all rows for this subreddit
    temp_df = df[df['subreddit'] == subreddit]
    
    # If count exceeds 6000, randomly sample 6000 rows
    if count > n:
        temp_df = temp_df.sample(n, random_state=42)
    
    # Add to the result dataframe
    result_df = pd.concat([result_df, temp_df])

# Reset index for the final result
result_df = result_df.reset_index(drop=True)
result_df['subreddit'].value_counts()

subreddit
SuicideWatch           4600
relationship_advice    4600
depression             4600
selfharm               4600
AskLGBT                4600
bullying               4600
abusesurvivors         4600
GriefSupport           4600
EatingDisorders        4600
lonely                 4600
Anxiety                4600
addiction              4600
sexualassault          4600
Name: count, dtype: int64

In [16]:
result_df.to_csv(f'data/input/reddit_13_mental_health_4600-posts_{current_timestamp}.csv', index=False)

# Train-test split

In [17]:
filename = 'reddit_13_mental_health_4600-posts_20250311_123431'
df = pd.read_csv(f'./data/input/{filename}.csv')
df

Unnamed: 0,subreddit,id,title,author,created_utc,score,upvote_ratio,num_comments,url,is_self,selftext,sort_method,time_filter,title_text,word_count
0,SuicideWatch,1itjza2,I’m going to college,wackybastard,2025-02-19 18:27:11,1,1.00,0,https://www.reddit.com/r/SuicideWatch/comments...,True,I am attending college in the fall. I’m consta...,new,,I’m going to college \n---\n I am attending co...,126
1,SuicideWatch,1e65p1n,I ruined my own life at 18,Broad-Technician-536,2024-07-18 03:40:46,267,0.96,42,https://www.reddit.com/r/SuicideWatch/comments...,True,I don’t expect sympathy( I actually expect cri...,top,year,I ruined my own life at 18 \n---\n I don’t exp...,190
2,SuicideWatch,1is02j9,I'm too broken.,Personal_Library_121,2025-02-17 19:51:47,1,1.00,0,https://www.reddit.com/r/SuicideWatch/comments...,True,My brain is broken and always will be. Even wi...,new,,I'm too broken. \n---\n My brain is broken and...,84
3,SuicideWatch,1ipvc6o,My boyfriend cheated on me,Nervous-Reindeer-327,2025-02-15 01:19:49,11,0.87,4,https://www.reddit.com/r/SuicideWatch/comments...,True,5 years. No family or friends \nQuick and pain...,top,month,My boyfriend cheated on me \n---\n 5 years. No...,13
4,SuicideWatch,1it7dem,I hate that I’m back here. I thought I was bet...,WeeklyFurball,2025-02-19 09:56:50,4,0.84,1,https://www.reddit.com/r/SuicideWatch/comments...,True,It has been about 3 years since I managed to g...,new,,I hate that I’m back here. I thought I was bet...,136
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59795,sexualassault,1iwt3gu,Scared to leave the house,Silver_Half8669,2025-02-23 22:57:07,2,1.00,1,https://www.reddit.com/r/sexualassault/comment...,True,Ever since my ex and I broke up four months ag...,top,month,Scared to leave the house \n---\n Ever since m...,230
59796,sexualassault,1cevj8i,Help! My son’s 15 year old male friend is bein...,Critical_Valuable_92,2024-04-27 22:19:43,18,0.96,20,https://www.reddit.com/r/sexualassault/comment...,True,Things my sons have claimed this kid has done:...,top,year,Help! My son’s 15 year old male friend is bein...,298
59797,sexualassault,1afl6ii,My uncle touched me,,2024-01-31 11:01:45,47,0.96,12,https://www.reddit.com/r/sexualassault/comment...,True,On Sunday there was a family gathering. A part...,top,all,My uncle touched me \n---\n On Sunday there wa...,113
59798,sexualassault,1bz0m6y,Noticed something at the library,FunPrimary2252,2024-04-08 11:22:31,0,0.50,2,https://www.reddit.com/r/sexualassault/comment...,True,Me and another guy can't even fucking use the ...,controversial,all,Noticed something at the library \n---\n Me a...,78


In [20]:
filename

'reddit_13_mental_health_4600-posts_20250311_123431'

In [22]:


# For every subreddit, split into train (4000 posts) and test sets (600 posts)
train_df = pd.DataFrame()
test_df = pd.DataFrame()

for subreddit in df['subreddit'].unique():
    subreddit_df = df[df['subreddit'] == subreddit]
    # Shuffle rows
    subreddit_df = subreddit_df.sample(frac=1, random_state=42).reset_index(drop=True)
    
    # Use concat instead of deprecated append
    train_df = pd.concat([train_df, subreddit_df[:4000]], ignore_index=True)
    test_df = pd.concat([test_df, subreddit_df[4000:4600]], ignore_index=True)

# No need to reset_index again as ignore_index=True in concat already handles this
train_df.to_csv(f'./data/input/{filename}_train.csv', index=False)
test_df.to_csv(f'./data/input/{filename}_test.csv', index=False)
train_df_ids = train_df['id'].values
test_df_ids = test_df['id'].values

# Save as variable in a py script
with open(f'./data/input/{filename.replace("-", "_")}_train_test_ids.py', 'w') as f:
    f.write(f'train_ids = {train_df_ids.tolist()}')
    f.write('\n')
    f.write(f'test_ids = {test_df_ids.tolist()}')

df[df['id'].isin(test_df_ids)]['subreddit'].value_counts()
display(train_df['subreddit'].value_counts())
display(test_df['subreddit'].value_counts())

subreddit
SuicideWatch           4000
relationship_advice    4000
depression             4000
selfharm               4000
AskLGBT                4000
bullying               4000
abusesurvivors         4000
GriefSupport           4000
EatingDisorders        4000
lonely                 4000
Anxiety                4000
addiction              4000
sexualassault          4000
Name: count, dtype: int64

subreddit
SuicideWatch           600
relationship_advice    600
depression             600
selfharm               600
AskLGBT                600
bullying               600
abusesurvivors         600
GriefSupport           600
EatingDisorders        600
lonely                 600
Anxiety                600
addiction              600
sexualassault          600
Name: count, dtype: int64

In [27]:
[print(n, '\n') for n in test_df.sample(n=20)['title_text'].to_list()]

Everything feels like a chore.  
---
 Everything feels like a chore. I spent more than 20 minutes in bathroom for just brushing my teeth and washing my fucking face. 

I knew that all I had to do was just putting some stupid fucking toothpaste on the toothbrush but that alone took more than five minutes to do. I just stared at it, trying to process what I had to do step by step. 

And it happens every fucking morning. I can’t even find the energy to brush my fucking teeth how pathetic can one get more?  

I decided to tell people "I am not okay" when close friends ask me "how are you? Here is how it went... 
---
 Every single one of them left me on read. 

I am really not doing well isolated in quarantine and I get text conversations are the lowest form of communication. but it's all we got in these times. (I am in Michigan with the highest current covid cases in the US, and seeing people isn't really an option).

I even texted my friend (who I was just Bestman for in October) that I a

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]