In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so the scraped data can be saved there.
# NOTE: judging by the recorded output, this cell asks for an interactive Google authorization code.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
# Import packages to use in extracting out data.
import os
import sys
import numpy as np
import pandas as pd
import json
from pprint import pprint
import requests
import time
import random
!pip install langdetect

from langdetect import detect

from datetime import datetime, date, timedelta
import string
from scipy import stats

Collecting langdetect
[?25l  Downloading https://files.pythonhosted.org/packages/56/a3/8407c1e62d5980188b4acc45ef3d94b933d14a2ebc9ef3505f22cf772570/langdetect-1.0.8.tar.gz (981kB)
[K     |████████████████████████████████| 983kB 2.8MB/s 
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.8-cp36-none-any.whl size=993193 sha256=ecf82be0b65ec5b94aca4d52ffabb066afbec0dd19b503821e75429f62c4e40b
  Stored in directory: /root/.cache/pip/wheels/8d/b3/aa/6d99de9f3841d7d3d40a60ea06e6d669e8e5012e6c8b947a57
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.8


In [None]:
# Cell that contains helpful functions for datetime operations and parsing the json output from the url query
def date2timestamp(date):
    '''
    Convert a "YYYY/MM/DD" date string to a Unix timestamp in whole seconds.

    The date is interpreted as midnight UTC, so the result does not depend on
    the timezone of the machine running the notebook. This matches the
    timestamps in this notebook's recorded query URLs, which fall on UTC
    midnight.

    Example: "2019/04/28" -> 1556409600

    Raises ValueError if `date` does not match the "%Y/%m/%d" format.
    '''
    # Local import so this cell does not depend on editing the shared import cell.
    import calendar

    parsed = datetime.strptime(date, "%Y/%m/%d")
    # timegm treats the time tuple as UTC (time.mktime would treat it as local time).
    return calendar.timegm(parsed.timetuple())

def next_day(date):
    '''Return the calendar day following `date`; input and output are "YYYY/MM/DD" strings.'''
    day_format = "%Y/%m/%d"
    following = datetime.strptime(date, day_format) + timedelta(days=1)
    return following.strftime(day_format)

def list_of_days(start_date, end_date):
    '''
    List every day from start_date through end_date (both inclusive), in
    chronological order, with each day rendered via str() and '-' replaced
    by '/' (i.e. "YYYY/MM/DD" for datetime.date inputs).

    Example arguments:
        start_date = date(2019, 1, 19)
        end_date = date(2019, 3, 22)

    An end_date earlier than start_date yields an empty list.
    '''
    span_in_days = (end_date - start_date).days
    return [
        str(start_date + timedelta(days=offset)).replace('-', '/')
        for offset in range(span_in_days + 1)
    ]


In [None]:
# Directory that the scraping cells write their per-subreddit CSV files into.

# The path is hard-coded under /content/drive, so Google Drive must already be mounted there:
output_dir = '/content/drive/My Drive/ML4HC_Final_Project/data/input/raw_post/'

In [None]:
# Method used to get posts
def scrape_reddit(output_dir, subreddit, timeframe, date_start, date_end, size = 1000,
                  max_retries = 3, retry_wait = 30):
    '''
    Query the Pushshift submission-search API for one subreddit and date
    window, and return the usable English self-posts as a DataFrame.

    Parameters
    ----------
    output_dir, timeframe :
        Not used by this function; kept so existing calls keep working.
    subreddit : str
        Subreddit name inserted into the query URL.
    date_start, date_end : str
        "YYYY/MM/DD" strings, converted with date2timestamp and sent as the
        `after` and `before` query parameters.
    size : int
        Value of the `size` query parameter (amount of posts, 1 to 1000).
    max_retries : int
        How many times a failed request (network error, HTTP error status,
        undecodable JSON, missing 'data' key) is retried.
    retry_wait : int or float
        Seconds to sleep before each retry.

    Returns
    -------
    pandas.DataFrame with columns ['subreddit', 'author', 'date', 'post'].
    'date' is `date_start`; 'post' is the title, a space, then the selftext.
    Posts are dropped when selftext is missing, empty, '[removed]', contains
    '[deleted]', is not detected as English, or cannot be processed.

    Raises
    ------
    The exception from the final attempt, once every attempt has failed.
    '''
    start = date2timestamp(date_start)
    end = date2timestamp(date_end)
    # use the pushshift api to extract out data
    url = 'https://api.pushshift.io/reddit/search/submission/?subreddit={}&sort=desc&sort_type=created_utc&after={}&before={}&size={}'.format(subreddit,start, end, size)
    print(url)

    attempts = max(1, max_retries + 1)
    for attempt in range(attempts):
        try:
            # A timeout keeps one hung connection from stalling the whole scrape.
            response = requests.get(url, timeout=60)
            # Surface HTTP errors (e.g. rate limiting) instead of a confusing JSON decode failure.
            response.raise_for_status()
            posts = response.json()['data']
            break
        except Exception:
            # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still stops the run.
            if attempt == attempts - 1:
                raise
            time.sleep(retry_wait)

    columns = ['subreddit', 'author', 'date', 'post']
    # Collect plain dicts and build the frame once; appending row by row is quadratic,
    # and DataFrame.append no longer exists in pandas 2.x.
    records = []
    for post in posts:
        text = post.get('selftext')
        # skip posts without usable body text
        if not isinstance(text, str) or text == "":
            continue
        if text == '[removed]' or '[deleted]' in text:
            continue
        try:
            # keep English posts only; detect() raises on text it cannot classify
            if detect(text) != 'en':
                continue
            records.append({'subreddit': subreddit,
                            'author': post['author'],
                            'date': date_start,
                            'post': post['title'] + ' ' + text})
        except Exception:
            # best effort: a post that cannot be processed is skipped
            continue
    return pd.DataFrame(records, columns=columns)

In [None]:
# Subreddits to scrape. The inline comments below mark where each of the two groups ends;
# the scraping cells iterate over this list (or a slice of it).

subreddits = ['suicidewatch',
              'depression',
              'ptsd',
              'anxiety',
              'socialanxiety',
              'healthanxiety',
              'bipolarreddit',
              'bpd',
              'schizophrenia',
              'paranoia',
              'EDAnonymous',
              'alcoholism',
              'addiction',
              'adhd',
              'mindfulness', 
              'psychosis', 
              'mentalillness',
              'mentalhealth', # end of the subreddits used in both the pre- and mid-pandemic analysis
              'meditation',
              'personalfinance',
              'jokes',
              'teaching',
              'relationships',
              'legaladvice',
              'fitness',
              'parenting',
              'COVID19_support',
              'lonely',
              'autism',
              'divorce',
              'guns',
              'economy',
              'ForeverAlone',
              'politics',
              'ukpolitics',
              'CanadaPolitics',
              'conspiracy',
              'India',
              'Canada',
              'unitedkingdom',
              'UKPersonalFinance',
              'MakeNewFriendsHere' # end of the subreddits used only for the mid-pandemic analysis/clustering
              ]

In [None]:
# Scrape the 'pre' (pre-pandemic) sample. For each subreddit, days from the window below are
# queried in random order until max_posts posts have been kept or the days run out; the result
# is written to '<subreddit>_pre.csv' in output_dir.

days = list_of_days(date(2018,11,1), date(2019,11,1))
size = 1000

timeframe = 'pre'

max_posts = 30000   # stop sampling a subreddit once this many posts have been kept
sampling_seed = 0   # fixed seed so a full re-run of this cell visits days in the same order
columns = ['subreddit', 'author', 'date', 'post']
rng = random.Random(sampling_seed)

# NOTE(review): this line originally read `subreddits[]`, which is a SyntaxError. Iterating the
# whole list is the minimal fix; restore a slice here if only part of the list was intended.
for subreddit in subreddits:
    frames = []
    n_posts = 0
    # A random permutation of the days is equivalent to repeatedly popping a random day.
    for date_start in rng.sample(days, len(days)):
        if n_posts >= max_posts:
            break
        date_end = next_day(date_start)
        try:
            df = scrape_reddit(output_dir, subreddit, timeframe, date_start, date_end, size = size)
        except Exception as err:
            # Days are only a random sample, so skip a failing day rather than abort and
            # lose everything collected so far for this subreddit.
            print('Skipping {} on {}: {!r}'.format(subreddit, date_start, err))
            continue
        if not df.empty:
            frames.append(df)
            n_posts += df.shape[0]
        time.sleep(0.5)
        print(subreddit)
        print((n_posts, len(columns)))
    # Concatenate once per subreddit instead of once per day (avoids quadratic copying).
    subreddit_df = pd.concat(frames) if frames else pd.DataFrame(columns=columns)
    subreddit_df.to_csv(os.path.join(output_dir, '{}_{}.csv'.format(subreddit, timeframe)), index=False)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
https://api.pushshift.io/reddit/search/submission/?subreddit=anxiety&sort=desc&sort_type=created_utc&after=1556409600&before=1556496000&size=1000
anxiety
(7157, 4)
https://api.pushshift.io/reddit/search/submission/?subreddit=anxiety&sort=desc&sort_type=created_utc&after=1567123200&before=1567209600&size=1000
anxiety
(7349, 4)
https://api.pushshift.io/reddit/search/submission/?subreddit=anxiety&sort=desc&sort_type=created_utc&after=1554422400&before=1554508800&size=1000
anxiety
(7522, 4)
https://api.pushshift.io/reddit/search/submission/?subreddit=anxiety&sort=desc&sort_type=created_utc&after=1543881600&before=1543968000&size=1000
anxiety
(7727, 4)
https://api.pushshift.io/reddit/search/submission/?subreddit=anxiety&sort=desc&sort_type=created_utc&after=1559260800&before=1559347200&size=1000
anxiety
(7893, 4)
https://api.pushshift.io/reddit/search/submission/?subreddit=anxiety&sort=desc&sort_type=created_utc&after=15581376

JSONDecodeError: ignored

In [None]:
# Scrape a '2018' sample. For each subreddit, days from calendar year 2018 are queried in random
# order until max_posts posts have been kept or the days run out; the result is written to
# '<subreddit>_2018.csv' in output_dir.

days = list_of_days(date(2018,1,1), date(2018,12,31))
size = 1000

timeframe = '2018'

max_posts = 30000   # stop sampling a subreddit once this many posts have been kept
sampling_seed = 0   # fixed seed so a full re-run of this cell visits days in the same order
columns = ['subreddit', 'author', 'date', 'post']
rng = random.Random(sampling_seed)

# NOTE(review): the [3:] slice skips the first three subreddits; it looks like a leftover from
# resuming an interrupted run -- confirm before relying on a fresh Run All.
for subreddit in subreddits[3:]:
    frames = []
    n_posts = 0
    # A random permutation of the days is equivalent to repeatedly popping a random day.
    for date_start in rng.sample(days, len(days)):
        if n_posts >= max_posts:
            break
        date_end = next_day(date_start)
        try:
            df = scrape_reddit(output_dir, subreddit, timeframe, date_start, date_end, size = size)
        except Exception as err:
            # Days are only a random sample, so skip a failing day rather than abort and
            # lose everything collected so far for this subreddit.
            print('Skipping {} on {}: {!r}'.format(subreddit, date_start, err))
            continue
        if not df.empty:
            frames.append(df)
            n_posts += df.shape[0]
        time.sleep(0.5)
        print(subreddit)
        print((n_posts, len(columns)))
    # Concatenate once per subreddit instead of once per day (avoids quadratic copying).
    subreddit_df = pd.concat(frames) if frames else pd.DataFrame(columns=columns)
    subreddit_df.to_csv(os.path.join(output_dir, '{}_{}.csv'.format(subreddit, timeframe)), index=False)

https://api.pushshift.io/reddit/search/submission/?subreddit=anxiety&sort=desc&sort_type=created_utc&after=1533254400&before=1533340800&size=1000
anxiety
(150, 4)
https://api.pushshift.io/reddit/search/submission/?subreddit=anxiety&sort=desc&sort_type=created_utc&after=1520985600&before=1521072000&size=1000
anxiety
(269, 4)
https://api.pushshift.io/reddit/search/submission/?subreddit=anxiety&sort=desc&sort_type=created_utc&after=1521676800&before=1521763200&size=1000
anxiety
(421, 4)
https://api.pushshift.io/reddit/search/submission/?subreddit=anxiety&sort=desc&sort_type=created_utc&after=1546128000&before=1546214400&size=1000
anxiety
(570, 4)
https://api.pushshift.io/reddit/search/submission/?subreddit=anxiety&sort=desc&sort_type=created_utc&after=1535587200&before=1535673600&size=1000
anxiety
(745, 4)
https://api.pushshift.io/reddit/search/submission/?subreddit=anxiety&sort=desc&sort_type=created_utc&after=1520121600&before=1520208000&size=1000
anxiety
(861, 4)
https://api.pushshift.

JSONDecodeError: ignored

In [None]:
# Scrape every day from 2018/01/01 through 2018/04/20 (no sampling) for every subreddit and
# write the result to '<subreddit>_2018.csv' in output_dir.
# NOTE(review): the cell that samples all of 2018 also uses timeframe '2018', so both cells write
# the same file names and whichever runs last overwrites the other -- confirm this is intended.

days = list_of_days(date(2018,1,1), date(2018,4,20))
size = 1000

timeframe = '2018'

columns = ['subreddit', 'author', 'date', 'post']

for subreddit in subreddits:
    frames = []
    n_posts = 0
    failed_days = []
    for date_start in days:
        date_end = next_day(date_start)
        try:
            df = scrape_reddit(output_dir, subreddit, timeframe, date_start, date_end, size = size)
        except Exception as err:
            # Keep what was already collected; record the day so it can be re-scraped later.
            print('Failed {} on {}: {!r}'.format(subreddit, date_start, err))
            failed_days.append(date_start)
            continue
        if not df.empty:
            frames.append(df)
            n_posts += df.shape[0]
        time.sleep(0.5)
        print(subreddit)
        print((n_posts, len(columns)))
    if failed_days:
        print('{}: {} day(s) could not be scraped: {}'.format(subreddit, len(failed_days), failed_days))
    # Concatenate once per subreddit instead of once per day (avoids quadratic copying).
    subreddit_df = pd.concat(frames) if frames else pd.DataFrame(columns=columns)
    subreddit_df.to_csv(os.path.join(output_dir, '{}_{}.csv'.format(subreddit, timeframe)), index=False)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
(0, 4)
https://api.pushshift.io/reddit/search/submission/?subreddit=COVID19_support&sort=desc&sort_type=created_utc&after=1522886400&before=1522972800&size=1000
COVID19_support
(0, 4)
https://api.pushshift.io/reddit/search/submission/?subreddit=COVID19_support&sort=desc&sort_type=created_utc&after=1522972800&before=1523059200&size=1000
COVID19_support
(0, 4)
https://api.pushshift.io/reddit/search/submission/?subreddit=COVID19_support&sort=desc&sort_type=created_utc&after=1523059200&before=1523145600&size=1000
COVID19_support
(0, 4)
https://api.pushshift.io/reddit/search/submission/?subreddit=COVID19_support&sort=desc&sort_type=created_utc&after=1523145600&before=1523232000&size=1000
COVID19_support
(0, 4)
https://api.pushshift.io/reddit/search/submission/?subreddit=COVID19_support&sort=desc&sort_type=created_utc&after=1523232000&before=1523318400&size=1000
COVID19_support
(0, 4)
https://api.pushshift.io/reddit/search/subm