# Download r/SuicideWatch Reddit submissions, comments, and prior and future submissions 

* Download all SW submissions
* Take a random subsample
* Find all other submissions from same users
* Take N months from the SITBI submission to the last submission
* Label whether next submissions contain a SW submission or not



In [None]:
import os
import sys
import time
import random
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import json
from pprint import pprint
import requests
# from langdetect import detect
from datetime import datetime, date, timedelta, timezone
import time
import string
from scipy import stats
# import config #this is private and contains client info for Reddit pushshift API 

In [None]:
# # Run this cell to be able to mount GDrive and attach it to the colab so that we can save json outputs
# from google.colab import drive
# drive.mount('/content/drive')
# # output dir if drive has been mounted:
# output_dir = '/content/drive/My Drive/ML4HC_Final_Project/data/input/raw_submission/'

In [None]:
input_dir = './data/input/'
output_dir = './data/output/'

In [None]:
# Subreddits to scrape:

subreddits = ['suicidewatch']

metadata_to_keep_submissions = [
    'id', #submission id                
    'author',
    'created_utc', #when submission was created
    'subreddit', #subreddit
    'title',
    'selftext', #body
    'score',
    'num_comments',
    'permalink', #need to add https://www.reddit.com
]


metadata_to_keep_comments =[
    'id', 
    'author', 
    'created_utc', 
    'subreddit', 
    'body', #selftext
    'score',
    'is_submitter',
    'link_id', 
    'parent_id',  
    ]


In [None]:
# Cell that contains helpful functions for datetime operations and parsing the json output from the url query
import datetime as dt

def gen_timestamp():
    """Return the current local time as a ``YYYY-MM-DD-HH-MM-SS`` string.

    Used as a suffix for output file names.
    """
    stamp_format = '{:%Y-%m-%d-%H-%M-%S}'
    current_time = dt.datetime.now()
    return stamp_format.format(current_time)



def date2timestamp(date):
    '''
    Convert a "YYYY/MM/DD" date string (e.g. "2011/01/12") to a Unix timestamp.

    The date is interpreted as midnight UTC, so the result does not depend on
    the time zone of the machine running the notebook and is consistent with
    timestamp2date, which also works in UTC.

    date: str in "%Y/%m/%d" format.
    returns: int, seconds since the Unix epoch.
    raises: ValueError if `date` does not match "%Y/%m/%d".
    '''
    midnight_utc = datetime.strptime(date, "%Y/%m/%d").replace(tzinfo=timezone.utc)
    return int(midnight_utc.timestamp())

def timestamp2date(timestamp):
    '''
    Format a Unix timestamp as a "YYYY-MM-DD-HH-MM-SS" string in UTC.

    timestamp: int or float, seconds since the Unix epoch.
    returns: str, the UTC date and time with every field separated by "-".
    '''
    # Timezone-aware replacement for the deprecated datetime.utcfromtimestamp;
    # the formatted output is the same.
    moment = datetime.fromtimestamp(timestamp, tz=timezone.utc)
    return moment.strftime('%Y-%m-%d-%H-%M-%S')


def next_day(date):
    """Return the day after `date`; input and output are "YYYY/MM/DD" strings."""
    day_format = "%Y/%m/%d"
    following_day = datetime.strptime(date, day_format) + timedelta(days=1)
    return following_day.strftime(day_format)

def list_of_days(start_date, end_date):
    '''
    List every day from start_date to end_date (both included) as "YYYY/MM/DD" strings.

    Example arguments:
    start_date = date(2019, 1, 19)   # start date
    end_date = date(2019, 3, 22)   # end date

    Returns an empty list when end_date is before start_date.
    '''
    span_in_days = (end_date - start_date).days
    return [
        str(start_date + timedelta(days=offset)).replace('-', '/')
        for offset in range(span_in_days + 1)
    ]



In [None]:


def make_request(url):
    '''
    GET `url` and return the decoded JSON payload.

    The User-agent header is taken from the private `config` module when it
    has been imported. Its import is commented out at the top of this
    notebook, so when `config` is not defined the request is sent with the
    default `requests` headers instead of failing with a NameError.

    Raises requests.HTTPError for 4xx/5xx responses, other
    requests.RequestException subclasses for network problems or timeouts,
    and ValueError when the body is not valid JSON. Callers retry on any of
    these.
    '''
    headers = {}
    try:
        headers['User-agent'] = config.user_agent
    except NameError:
        # `config` was never imported: fall back to the default user agent.
        pass
    # A timeout keeps one stalled connection from hanging the whole scrape.
    response = requests.get(url, headers=headers, timeout=60)
    # Surface rate-limit / server errors as exceptions so callers back off.
    response.raise_for_status()
    return response.json()

def search_iteratively(submission_or_comment = 'submission',author_username = 'USERNAME', earliest_date=1119672000):
    '''
    Download everything one author posted from Pushshift, one page at a time.

    Based on: https://github.com/bilsun/reddit-scraper

    submission_or_comment: 'submission' or 'comment'; inserted into the endpoint path.
    author_username: Reddit username to search for.
    earliest_date: Unix timestamp; only items created after it are requested.
        The default 1119672000 is 6/25/05 @ 12am, when Reddit was founded
        (https://www.unixtimestamp.com/index.php).

    Results are requested with sort = 'desc' ('asc' results in a
    "Too many requests" error) and paginated with `before=<oldest created_utc
    seen so far>`.

    Returns (df, could_not_download):
        df: DataFrame of all collected items sorted from oldest to newest by
            'created_utc' (empty DataFrame if nothing was collected).
        could_not_download: list with the page URL that still failed after all
            retries; pagination stops at that page, so `df` is then partial.

    Raises AssertionError if Pushshift reports that not all shards answered.
    '''
    sort = 'desc'
    submission_fields = 'id,score,full_link,subreddit,title,selftext,created_utc,author,num_comments'
    url = f"https://api.pushshift.io/reddit/search/{submission_or_comment}/?author={author_username}&fields={submission_fields}&after={earliest_date}&size=1000&sort={sort}&metadata=true"

    # One could add other attributes:
    # keywords = 'bias|prejudice'
    # subreddits = 'AskSocialScience,AskFeminists'
    # url = f"https://api.pushshift.io/reddit/search/submission/?q={keywords}&subreddit={subreddits}&fields={submission_fields}&after={earliest_date}&size=1000&sort=desc&metadata=true"

    # Seconds to wait before each attempt: one immediate try, then three retries.
    retry_delays = (0, 0.5, 1, 5)

    # paginating results (collect 1000 posts at a time to work around Pushshift's size limit)
    start_from = ''
    data = []
    could_not_download = []

    while True:
        page_url = url + start_from
        posts = None
        for delay in retry_delays:
            if delay:
                print(f'sleeping {delay} sec')
                time.sleep(delay)
            try:
                posts = make_request(page_url)
            except Exception:
                # Exception (not a bare except) so Ctrl-C can still stop the scrape.
                continue
            break

        if posts is None:
            # Every attempt failed. Record the exact page and stop, instead of
            # carrying on with an undefined or stale `posts`.
            print(f'-------could not download {page_url}')
            could_not_download.append(page_url)
            break

        # make sure Pushshift is gathering all Reddit data (IMPORTANT IF SCRAPING FOR RESEARCH)
        total_posts = posts['metadata']['shards']["total"]
        assert posts['metadata']['shards']["successful"] == total_posts, \
            f'Pushshift answered from only some of its {total_posts} shards'

        if len(posts["data"]) == 0:
            break  # stop collecting data once there's nothing left to collect
        data.extend(posts["data"])

        # With sort=desc the last item is the oldest one: continue from there.
        last_utc = data[-1]['created_utc']
        start_from = '&before=' + str(last_utc)

    df = pd.DataFrame(data)
    if not df.empty:
        df = df.sort_values(by='created_utc').reset_index(drop=True)  # from oldest to newest
    return df, could_not_download




In [None]:
def url_to_json(url):
    '''
    GET `url`, decode the JSON payload and return its 'data' entry as a DataFrame.

    The request is tried once immediately and then retried after 0.5, 1 and
    5 seconds if the request or the JSON decoding fails.

    Returns:
        DataFrame built from payload['data'] on success.
        If every attempt fails, a one-row DataFrame whose single column (0)
        holds the failed URL. This shape is kept for backward compatibility,
        so callers that concatenate results should expect such rows.

    Raises KeyError if a decoded payload has no 'data' entry.
    '''
    retry_delays = (0, 0.5, 1, 5)
    for delay in retry_delays:
        if delay:
            print(f'sleeping {delay} sec')
            time.sleep(delay)
        try:
            # A timeout keeps one stalled connection from hanging the whole scrape.
            payload = requests.get(url, timeout=60).json()
        except Exception:
            # Exception (not a bare except) so Ctrl-C can still stop the scrape.
            continue
        return pd.DataFrame(payload['data'])

    print('failed!')
    return pd.DataFrame([url])

    







In [None]:
def search_by_ids(ids, submission_or_comment='comment'):
    '''
    Look up submissions or comments on Pushshift by their ids.

    ids: iterable of id strings; they are joined with commas in the query.
    submission_or_comment: 'comment' (default) or 'submission'; selects both
        the endpoint and the list of metadata columns that is kept.

    Returns a DataFrame restricted to the columns listed in
    metadata_to_keep_comments (or metadata_to_keep_submissions for
    submissions) that are actually present in the response.
    '''
    joined_ids = ','.join(ids)
    url = f'https://api.pushshift.io/reddit/search/{submission_or_comment}/?ids={joined_ids}'
    result = url_to_json(url)
    if submission_or_comment == 'submission':
        columns_to_keep = metadata_to_keep_submissions
    else:
        columns_to_keep = metadata_to_keep_comments
    # isin() rather than direct indexing: one of the comments didn't have certain cols
    return result.loc[:, result.columns.isin(columns_to_keep)]

In [None]:
def search_comments_by_sub_id(submission_id = ''):
    '''
    Fetch the comments of one submission from Pushshift in a single request.

    The query asks for at most 1000 comments (limit=1000) and only for the
    comment fields listed below. Returns whatever url_to_json returns for
    that query.
    '''
    comment_fields = 'id,author,created_utc,subreddit,body,score,is_submitter,link_id,parent_id'
    # Variant without the field restriction:
    # url = f'https://api.pushshift.io/reddit/comment/search/?link_id={submission_id}&limit=1000'
    url = f'https://api.pushshift.io/reddit/comment/search/?link_id={submission_id}&fields={comment_fields}&limit=1000'
    return url_to_json(url)



In [None]:

def search_comments_iteratively(submission_id = '', earliest_date=1119672000):
    '''
    Download all comments of one submission from Pushshift, one page at a time.

    Based on: https://github.com/bilsun/reddit-scraper

    submission_id: id of the submission whose comments are requested (sent as link_id).
    earliest_date: Unix timestamp; only comments created after it are requested.
        The default 1119672000 is 6/25/05 @ 12am, when Reddit was founded
        (https://www.unixtimestamp.com/index.php).

    Results are requested with sort = 'desc' ('asc' results in a
    "Too many requests" error) and paginated with `before=<oldest created_utc
    seen so far>`.

    Returns (df, could_not_download):
        df: DataFrame of all collected comments sorted from oldest to newest
            by 'created_utc' (empty DataFrame if nothing was collected).
        could_not_download: list with the page URL that still failed after all
            retries; pagination stops at that page, so `df` is then partial.

    Raises AssertionError if Pushshift reports that not all shards answered.
    '''
    sort = 'desc'
    # Comment fields (same list as search_comments_by_sub_id). The submission
    # field list used here before has no 'body', so no comment text was requested.
    comment_fields = 'id,author,created_utc,subreddit,body,score,is_submitter,link_id,parent_id'
    # NOTE(review): the other queries in this notebook use the singular
    # '/search/comment/' or '/comment/search/' path; confirm that the plural
    # 'comments' path below is accepted by Pushshift.
    url = f"https://api.pushshift.io/reddit/search/comments/?link_id={submission_id}&fields={comment_fields}&after={earliest_date}&size=1000&sort={sort}&metadata=true"

    # Seconds to wait before each attempt: one immediate try, then three retries.
    retry_delays = (0, 0.5, 1, 5)

    # paginating results (collect 1000 comments at a time to work around Pushshift's size limit)
    start_from = ''
    data = []
    could_not_download = []

    while True:
        page_url = url + start_from
        posts = None
        for delay in retry_delays:
            if delay:
                print(f'sleeping {delay} sec')
                time.sleep(delay)
            try:
                posts = make_request(page_url)
            except Exception:
                # Exception (not a bare except) so Ctrl-C can still stop the scrape.
                continue
            break

        if posts is None:
            # Every attempt failed. Record the exact page and stop, instead of
            # carrying on with an undefined or stale `posts`.
            print(f'-------could not download {page_url}')
            could_not_download.append(page_url)
            break

        # make sure Pushshift is gathering all Reddit data (IMPORTANT IF SCRAPING FOR RESEARCH)
        total_posts = posts['metadata']['shards']["total"]
        assert posts['metadata']['shards']["successful"] == total_posts, \
            f'Pushshift answered from only some of its {total_posts} shards'

        if len(posts["data"]) == 0:
            break  # stop collecting data once there's nothing left to collect
        data.extend(posts["data"])

        # With sort=desc the last item is the oldest one: continue from there.
        last_utc = data[-1]['created_utc']
        start_from = '&before=' + str(last_utc)

    df = pd.DataFrame(data)
    if not df.empty:
        df = df.sort_values(by='created_utc').reset_index(drop=True)  # from oldest to newest
    return df, could_not_download


In [None]:
# # Method used to get submissions
# def scrape_reddit(output_dir, subreddit, date_start, date_end, size = 1000):
#     '''
#     size = {1,1000} #amount of submissions
#     '''
#     start = date2timestamp(date_start)
#     end = date2timestamp(date_end)
#     # use the pushshift api to extract out data
#     url = f'https://api.pushshift.io/reddit/search/submission/?subreddit={subreddit}&sort=desc&sort_type=created_utc&after={start}&before={end}&size={size}'
# #     print(url)
#     try:
#         submissions = requests.get(url)
#         submissions = submissions.json()
#         submissions = submissions['data']
#     except:
#         print('sleeping for 1 seconds...')
#         time.sleep(1)
#         submissions = requests.get(url)
#         submissions = submissions.json()
#         submissions = submissions['data']

#     df = pd.DataFrame(columns=['subreddit', 'author', 'date', 'submission', 'num_comments', 'score'])
    
#     for submission in submissions:
#         if 'selftext' in submission: # check if selftext parameter exists
#             text = submission['selftext']
#             if text != "" and  text != '[removed]' and '[deleted]' not in text: # further check if selftext is not empty
#                 try: 
#                     if detect(text) == 'en': # check if text is in english - if the language detected is not in the langdetect library, then continue to the next submission
#                         df = df.append({'subreddit': subreddit, 
#                                   'author': submission['author'], 
#                                   'date': date_start, 
#                                   'submission': submission['title'] + ' ' + submission['selftext'],
#                                   'num_comments': submission['num_comments'], 
#                                   'score': submission['score'],
#                                  }, ignore_index=True)
#                 except:
#                     continue
#     return df

# Find all submissions

From the Reddit Mental Health Dataset (Low et al. (2020). JMIR), we obtained the usernames of authors of SuicideWatch (SW) posts. We then searched for all of their prior and subsequent submissions. For each user we took the first SW submission and obtained the comments for that submission. We also labelled (a) 1 if the subsequent submissions included an SW submission and (b) whether any subsequent submissions were made to mental health or other support-seeking subreddits (personalfinance?)




In [None]:
# Obtain SW submissions from Reddit Mental Health Dataset (Low et al, 2020. JMIR)

sw2018 = pd.read_csv(input_dir+'suicidewatch_2018_features_tfidf_256.csv')
sw2019 = pd.read_csv(input_dir+'suicidewatch_2019_features_tfidf_256.csv')
sw = pd.concat([sw2018,sw2019], axis=0)
sw = sw.reset_index(drop=True)
sw.head(5)

In [None]:
authors = sw.author.unique()
print(len(authors), 'unique authors')

In [None]:
# Download every submission of every SW author. Set run = True to execute
# (the download takes a long time).
run = False

if run:
    all_submissions = []
    could_not_download_all = []
    n_authors = len(authors)
    print('total: ', n_authors)
    for i, author in enumerate(authors):
        if i%50==0:
            print(i)
        try:
            all_submissions_i, could_not_download = search_iteratively(submission_or_comment = 'submission',author_username = author, earliest_date=1119672000)
        except Exception:
            print(f'could not download {author}')
            could_not_download_all.append(author)
        else:
            # extend (not append) so the log is a flat list of URLs / usernames
            could_not_download_all.extend(could_not_download)
            all_submissions.append(all_submissions_i)

        # Save every 1000 users in case it fails, and after the last user so
        # the final partial batch is written too.
        if i%1000==0 or i==n_authors-1:
            checkpoint_stem = input_dir+f'sw_users_all_submissions_{gen_timestamp()}_{i}'
            if all_submissions:  # pd.concat raises on an empty list
                pd.concat(all_submissions).to_csv(checkpoint_stem+'.csv')
            pd.DataFrame(could_not_download_all).to_csv(checkpoint_stem+'_could-not-download.csv')
            all_submissions = []
            could_not_download_all = []





### Compile submissions dfs

In [None]:
# Gather the per-batch checkpoint files written by the download loop.
# Failure logs are written with the suffix 'could-not-download' (hyphens);
# the underscore spelling is accepted too. They must be kept out of `files`,
# otherwise they would be read as if they were submissions.
failure_markers = ('could_not_download', 'could-not-download')
checkpoint_files = sorted(n for n in os.listdir(input_dir) if 'sw_users_all_submissions' in n)
could_not_download = [n for n in checkpoint_files if any(marker in n for marker in failure_markers)]
files = [n for n in checkpoint_files if n not in could_not_download]

all_submissions = pd.concat(
    [pd.read_csv(input_dir+file, index_col=0) for file in files]
)
all_submissions = all_submissions.reset_index(drop=True)

all_submissions

1307323 rows


885889 rows without nans `all_submissions.dropna()`

In [None]:
subs = all_submissions.dropna().shape[0]
subs

In [None]:
all_submissions.dropna().to_csv(input_dir+f'all_subs_{gen_timestamp()}.csv') # all_subs_2021-06-08-08-20-26.csv

In [None]:
all_submissions = pd.read_csv(input_dir+'all_subs_2021-06-08-08-20-26.csv', index_col = 0)

all_submissions

### Descriptive stats

In [None]:
from collections import Counter
posts_per_user = Counter(all_submissions.author.values)
posts_per_user = pd.DataFrame(posts_per_user, index = ['counts']).T
posts_per_user = posts_per_user.sort_values(by='counts')[::-1]
posts_per_user

In [None]:
posts_per_user

In [None]:
posts_per_user_df = posts_per_user.value_counts().reset_index().sort_values('counts')
posts_per_user_df.columns = ['posts','users']
posts_per_user_df

In [None]:
posts_per_user.iloc[:20]

In [None]:


sns.displot(posts_per_user, x='counts', discrete=True)# , bins=posts_per_user.shape[0])
# plt.bar(x=posts_per_user_df.posts,height= posts_per_user_df.users, align='edge')# , bins=posts_per_user.shape[0])
plt.xlabel('Posts')
plt.ylabel('Users')
plt.xlim(0,200)
plt.show()
# plt.hist(posts_per_user, bins=200)
# plt.xlabel('log(Posts per user)')
# plt.xlim(0,600)
# plt.show()


In [None]:
print(posts_per_user.describe().astype(int))


# Find first SW post, and only get comments for that post

In [None]:
 # 1650105 - 1642819

In [None]:

all_submissions_sw = all_submissions[all_submissions.subreddit=='SuicideWatch']
all_submissions_sw = all_submissions_sw.sort_values(by=['author','created_utc'])
all_submissions_sw

In [None]:
# TEST confirm date is oldest to newest: PASSED
[print(timestamp2date(int(n))) for n in all_submissions_sw[all_submissions_sw.author =='--dark--phoenix--'].created_utc.values]

In [None]:
# # #Test: Make sure times are properly sorted: PASSED
# for author in all_submissions_sw.author.unique()[:100]:
#     df_author = all_submissions_sw[all_submissions_sw['author']== author]
#     print('\n\n')
#     [print(n) for n in df_author.full_link.values[:5]]
#     [print(datetime.datetime.fromtimestamp(n, tz=timezone.utc)) for n in df_author.created_utc.values[:5]]

In [None]:
all_submissions_sw_first = all_submissions_sw.drop_duplicates(subset=['author'],keep='first')
all_submissions_sw_first

In [None]:
# Take first SW post
all_submissions_sw_first = all_submissions_sw.drop_duplicates(subset=['author'],keep='first')
all_submissions_sw_first.to_csv(input_dir+f'first_sw_posts_{gen_timestamp()}.csv')


# Find comments

In [None]:
all_submissions_sw_first = pd.read_csv(input_dir+'first_sw_posts_2021-07-18-14-58-03.csv', index_col = 0)
all_submissions_sw_first

In [None]:
# There aren't any posts with more than 250 comments, so I can use the simple (single-request) method to search for comments
[print(n) for n in all_submissions_sw_first[all_submissions_sw_first.num_comments>250].full_link.values]
# '82i8oz'

In [None]:
all_submissions_sw_first['id'][all_submissions_sw_first['num_comments']==0]

In [None]:
# only obtain comments for posts with >0 comments
first_sw_post_ids = all_submissions_sw_first[all_submissions_sw_first.num_comments>0]['id'].values

# exist_ok=True tolerates an existing directory but still reports real
# failures (e.g. missing permissions), unlike a bare try/except around mkdir.
os.makedirs(input_dir+'comments_i/', exist_ok=True)

all_comments = []
print('total: ', len(first_sw_post_ids))

# Index of the first post to process; earlier posts are skipped. Set to 0 to
# start from the beginning.
restart = 15000
for i, post_id in enumerate(first_sw_post_ids[restart:], start=restart):
    if i%100==0:
        print(i)

    comments = search_comments_by_sub_id(submission_id = post_id) # search for comments
    all_comments.append(comments)

    # write a batch file every 1000 posts so a crash loses little work
    if i%1000==0 and i!=restart:
        all_comments_df = pd.concat(all_comments)
        all_comments_df.to_csv(input_dir+f'comments_i/first_sw_comments_{gen_timestamp()}_{i}.csv')
        all_comments = []

# do for last few
if len(all_comments) > 0:
    all_comments_df = pd.concat(all_comments)
    all_comments_df.to_csv(input_dir+f'comments_i/first_sw_comments_{gen_timestamp()}_{i}.csv')



In [None]:
# all_comments_df = pd.concat(all_comments)        
# all_comments_df.to_csv(input_dir+f'comments_i/first_sw_comments_{gen_timestamp()}_{i}.csv')

In [None]:
# comments = search_comments_by_sub_id(submission_id = post_id) # search for comments
# comments

In [None]:
# compile into one df
# compile the per-batch comment files into one df
comments_dir= input_dir+'comments_i/'
# Keep only CSV files (the directory may also hold hidden or checkpoint
# entries) and sort them so the row order of the output is reproducible.
files = sorted(n for n in os.listdir(comments_dir) if n.endswith('.csv'))
all_comments = [pd.read_csv(comments_dir+file, index_col = 0) for file in files]

all_comments_df = pd.concat(all_comments)
all_comments_df.reset_index(drop=True).to_csv(input_dir+f'first_sw_comments_{gen_timestamp()}.csv')

In [None]:
# The file name previously contained an unformatted '{}' placeholder, which can
# never match a file on disk. Load the timestamped compilation that the
# cleaning section of this notebook reads; change the name if you regenerate it.
all_comments = pd.read_csv(input_dir+'first_sw_comments_2021-07-20-13-01-41.csv', index_col=0)
all_comments


In [None]:
# TESTs
assert all_comments[all_comments.subreddit!='SuicideWatch'].shape[0]==0
# all_comments.link_id.unique().shape


# remove deleted and removed posts AND DUPLICATES



In [None]:
input_dir = './data/input/'
subs = pd.read_csv(input_dir+'all_subs_2021-06-08-08-20-26.csv', index_col=0)
first_sw = pd.read_csv(input_dir+'first_sw_posts_2021-07-18-14-58-03.csv', index_col=0)




In [None]:
comments = pd.read_csv(input_dir+'first_sw_comments_2021-07-20-13-01-41.csv', index_col=0)
comments

In [None]:
'''
If the author of a comment removed it themselves, it is shown as “[deleted]”. If the comment was offensive to the mods or broke the rules of the subreddit, it was removed by the moderators, hence “[removed]”.

'''


# Report, for each dataframe and text column, how many rows hold (or start
# with) the '[deleted]' / '[removed]' placeholders. Nothing is filtered here.
authors_to_remove = []
comments_to_remove = []
posts_to_remove = []

dfs = [subs,first_sw, comments]
names = ['subs','first_sw', 'comments']
remove_keys = ['[deleted]', '[removed]']

for df_i, name in zip(dfs, names):
    # comments carry their text in 'body'; submissions in 'title' and 'selftext'
    check_cols = ['author', 'body'] if name == 'comments' else ['author', 'title', 'selftext']

    for key in remove_keys:
        for col in check_cols:
            is_placeholder = df_i[col]==key
            has_placeholder_prefix = df_i[col].astype(str).str.startswith(key)
            n_matches = df_i[is_placeholder | has_placeholder_prefix].shape[0]
            print(name, col, key, n_matches, f'/{df_i.shape[0]}')

#             if name == 'comments':
#                 comments_to_remove.append(df_i[df_i[col]==key])
            
            

In [None]:
comments[comments.author.astype(str).str.startswith('[deleted] ')]

In [None]:
comments[comments.body.astype(str).str.startswith('[deleted] ')]

In [None]:
# These posts were removed by moderators. Since we use text for analysis, we cannot analyze these
# Authors whose first SW post body is exactly '[removed]'
was_removed = first_sw['selftext']=='[removed]'
remove_authors = first_sw[was_removed].author.values

# ids of the first SW posts of those authors, prefixed with 't3_' so they can
# be compared with the link_id of comments
removed_post_ids = first_sw[first_sw['author'].isin(remove_authors)].id.values
remove_comments = np.array(['t3_'+post_id for post_id in removed_post_ids])

print(remove_authors[:10])
print(remove_authors.shape)
print(remove_comments.shape)




In [None]:
subs[subs['selftext'].astype(str).str.startswith('[deleted]')]

In [None]:
# TEST: how many link_ids start with 't3_'?
# Comparing the shape of link_id with the shape of the boolean startswith()
# result always matches, whatever the data, so count the True values instead.
link_ids = comments.link_id
has_t3_prefix = link_ids.astype(str).str.startswith('t3_')
print(link_ids.values.shape)
print(int(has_t3_prefix.sum()), 'of', has_t3_prefix.shape[0], "link_ids start with 't3_'")



In [None]:
remove_comments
print(comments.shape) 
comments = comments[~comments.link_id.isin(remove_comments)] #because some start with t3_ or tN
print(comments.shape) #around 1000


In [None]:
comments.to_csv(input_dir+f'first_sw_comments_{gen_timestamp()}.csv')

In [None]:
# TEST, should be around median of 4 comments per user: PASSED
(103042-102414)/157

In [None]:

# For all submissions and for the first SW submissions: drop the authors whose
# first SW post was removed, drop rows whose author/title/selftext starts with
# '[deleted]', and save the result to a timestamped CSV. `subs` and `first_sw`
# themselves are left unchanged; only the saved files are filtered.
dfs = [subs,first_sw]
names = ['all_submissions','first_sw_submission']
remove_keys = ['[deleted]']
check_cols = ['author', 'title', 'selftext']

for df_i, name in zip(dfs, names):
    print('\n\n=======',name)
    print(df_i.shape)
    df_i = df_i[~df_i.author.isin(remove_authors)]
    print('removed authors')
    print(df_i.shape)
    # The former `if name != 'first_sw_comments'` guard was always true for the
    # two names above, so the filter now runs unconditionally.
    for i in remove_keys:
        for col in check_cols:
            print(i, col)
            # startswith() also matches values exactly equal to the key
            df_i = df_i[~df_i[col].astype(str).str.startswith(i)]
            print(df_i.shape)
    df_i.to_csv(input_dir+f'{name}_{gen_timestamp()}.csv')


In [None]:
#since username and author are removed from both pushshift and Reddit (accessed through permalink/full_link), we keep all comments. Having comments that violate subreddit rules or are offensive can be informative (even if the text is not available)


In [None]:
# The End

In [None]:
# compile into single DF
files = os.listdir(input_dir+'submissions_old/')

all_subs = [all_submissions_sw]
files.sort()
for file in files:
    df_i = pd.read_csv(f'{input_dir}submissions_old/{file}', index_col = 0)
    all_subs.append(df_i)


all_subs = pd.concat(all_subs)

    


In [None]:
# reduce size
# https://pandas.pydata.org/pandas-docs/stable/user_guide/scale.html
all_subs.memory_usage(deep=True)

In [None]:
all_subs = all_subs.reset_index(drop=True)
# Drop the stray '0' column if it is present. errors='ignore' keeps this cell
# from raising KeyError when the column is absent, e.g. when the cell is re-run.
all_subs = all_subs.drop(columns='0', errors='ignore')

# categorical: repeated strings are stored once, which reduces memory usage
cols = ['author','subreddit']
for col in cols:
    all_subs[col] = all_subs[col].astype("category")

all_subs.memory_usage(deep=True)

In [None]:
all_subs.describe()

In [None]:
all_subs.shape

In [None]:
all_subs.drop_duplicates().shape

In [None]:

# all_subs.to_csv(input_dir+'all_subs.csv')


In [None]:
# prob want to remove these or try to redownload with id

necessary_cols = ['author', 'created_utc', 'id', 'subreddit']
df = all_subs[necessary_cols]

all_subs[~df.isnull().any(axis=1)] #all minus those

In [None]:
# numerical
cols = ['num_comments','score']
for col in cols:
    all_subs[col] = all_subs[col].astype("int")

all_subs.memory_usage(deep=True)


# Instead of using Reddit Mental Health Dataset, you can download data yourself

In [None]:

# start = date2timestamp(date_start)
# end = date2timestamp(date_end)
# url = f'https://api.pushshift.io/reddit/search/submission/?subreddit={subreddit}&sort=desc&sort_type=created_utc&after={start}&before={end}&size={size}'


In [None]:
# # Extract out reddit data given specifications of subreddits + pre pandemic distinction, as well as window of days to extract from


# for year in range(2017,2019): #excluding 2020 due to Covid-19
#     print(year,'=======================')
#     days = list_of_days(date(year,1,1), date(year,12,31))
#     days_local = list(days)
#     size = 30 #1000 is the max. amount of submissions you can download per request.
#     size_of_subreddit = 3000 #so with size=100 and size_of_subreddit=10000 you'll get at least 100 days (10000/100) or more per year

#     for subreddit in subreddits:
#         print(subreddit)
#         subreddit_df = pd.DataFrame(columns=['subreddit', 'author', 'date', 'submission']) #empty df
#         while subreddit_df.shape[0] < size_of_subreddit and days_local:
#             # download submissions from a random day until you reach size_of_subreddit. some days will have less than size.         
#             idx = random.randint(0, len(days_local)-1)
#             date_start = days_local.pop(idx) #pop guaranteees it won't repeat days
#             date_end = next_day(date_start)
#             df = scrape_reddit(output_dir, subreddit, date_start, date_end, size = size)
#             subreddit_df = pd.concat([subreddit_df, df])
#             time.sleep(0.1)
# #             print(subreddit)
#             print(subreddit_df.shape)
#         subreddit_df.to_csv(os.path.join(output_dir, f'{subreddit}_{year}.csv'), index=False)

In [None]:
# subreddit_df

In [None]:
# # Extract out reddit data given specifications of subreddits from 2018, as well as window of days to extract from

# days = list_of_days(date(2018,1,1), date(2018,12,31))
# size = 1000

# timeframe = '2018'

# for subreddit in subreddits[3:]:
#   subreddit_df = pd.DataFrame(columns=['subreddit', 'author', 'date', 'submission'])
#   days_local = list(days)
#   while subreddit_df.shape[0] < 30000 and days_local:
#     idx = random.randint(0, len(days_local)-1)
#     date_start = days_local.pop(idx)
#     date_end = next_day(date_start)
#     df = scrape_reddit(output_dir, subreddit, timeframe, date_start, date_end, size = size)
#     subreddit_df = pd.concat([subreddit_df, df])
#     time.sleep(0.5)
#     print(subreddit)
#     print(subreddit_df.shape)
#   subreddit_df.to_csv(os.path.join(output_dir, '{}_{}.csv'.format(subreddit, timeframe)), index=False)

In [None]:
# # Extract out reddit data given specifications of subreddits + submission pandemic distinction, as well as window of days to extract from

# days = list_of_days(date(2018,1,1), date(2018,4,20))
# size = 1000

# timeframe = '2018'

# for subreddit in subreddits:
#   subreddit_df = pd.DataFrame(columns=['subreddit', 'author', 'date', 'submission'])
#   days_local = list(days)
#   for date_start in days_local:
#     date_end = next_day(date_start)
#     df = scrape_reddit(output_dir, subreddit, timeframe, date_start, date_end, size = size)
#     subreddit_df = pd.concat([subreddit_df, df])
#     time.sleep(0.5)
#     print(subreddit)
#     print(subreddit_df.shape)
#   subreddit_df.to_csv(os.path.join(output_dir, '{}_{}.csv'.format(subreddit, timeframe)), index=False)