# Scrape SubReddit Pages and Preliminary cleaning

In [1]:
import pandas as pd
import numpy as np
import requests
import json
import csv
import time
import datetime as dt
import pandas as pd
from pmaw import PushshiftAPI

## Define a function to scrape data from SubReddit pages

PMAW: Under the hood, pmaw makes several API requests to Pushshift that each return a maximum of 100 comments, with requests being subjected to a rate-limit of 60 requests per minute. 


https://medium.com/swlh/how-to-scrape-large-amounts-of-reddit-data-using-pushshift-1d33bde9286


In [2]:
api = PushshiftAPI()

In [3]:
# Function to scrape subreddit submissions (different than comments)

def get_reddit_submissions(subreddit, limit):
    subreddit = subreddit
    limit = limit
    posts = api.search_submissions(subreddit=subreddit, 
                                   limit=limit, 
                                   before=before, 
                                   after=after)
    post_list = [post for post in posts]
    df = pd.DataFrame(post_list)
    return df

## Scrape Data from Pre-COVID Shutdown

Using the Pushshift API, scrape reddit data from before Covid Shutdown for each subredit of interest (Using 3/20/2020 as that was the date New York and California shut down)

In [4]:
before = int(dt.datetime(2020,3,20,0,0).timestamp()) # converts to UTC
after = int(dt.datetime(2017,12,1,0,0).timestamp()) # converts to UTC

### r/mentalhealth

#### Scrape comments

In [5]:
mentalhealth_PreCovid = get_reddit_submissions('mentalhealth', 4000)

Total:: Success Rate: 100.00% - Requests: 40 - Batches: 4 - Items Remaining: 0


In [6]:
# Check the shape to make sure all rows are there
mentalhealth_PreCovid.shape

(4000, 76)

In [13]:
#mentalhealth_PreCovid.head()

#### Save columns of interest

In [7]:
# Keep columns: 'author', 'body', 'created_utc', 'subreddit'
mentalhealth_PreCovid = mentalhealth_PreCovid[['author', 
                                               'title', 
                                               'selftext',
                                               'created_utc', 
                                               'subreddit']]

### r/depression

#### Scrape comments

In [8]:
depression_PreCovid = get_reddit_submissions('depression', 4000)

Total:: Success Rate: 100.00% - Requests: 40 - Batches: 4 - Items Remaining: 0


In [9]:
depression_PreCovid.shape

(4000, 70)

#### Save columns of interest

In [10]:
depression_PreCovid = depression_PreCovid[['author', 
                                           'title', 
                                           'selftext',
                                           'created_utc', 
                                           'subreddit']]

### r/Anxiety

#### Scrape comments

In [11]:
anxiety_PreCovid = get_reddit_submissions('Anxiety', 4000)

Total:: Success Rate: 100.00% - Requests: 40 - Batches: 4 - Items Remaining: 0


In [12]:
anxiety_PreCovid.shape

(4000, 82)

#### Save columns of interest

In [13]:
# Keep columns: 'author', 'body', 'created_utc', 'subreddit'
anxiety_PreCovid = anxiety_PreCovid[['author', 
                                     'title', 
                                     'selftext',
                                     'created_utc', 
                                     'subreddit']]

### r/bipolar

#### Scrape comments

In [14]:
bipolar_PreCovid = get_reddit_submissions('bipolar', 4000)

Total:: Success Rate: 100.00% - Requests: 40 - Batches: 4 - Items Remaining: 0


In [15]:
bipolar_PreCovid.shape

(4000, 81)

#### Save columns of interest

In [16]:
bipolar_PreCovid = bipolar_PreCovid[['author', 
                                     'title', 
                                     'selftext',
                                     'created_utc', 
                                     'subreddit']]

### r/BPD

##### Scrape comments

In [17]:
bpd_PreCovid = get_reddit_submissions('BPD', 4000)

Total:: Success Rate: 100.00% - Requests: 40 - Batches: 4 - Items Remaining: 0


In [18]:
bpd_PreCovid.shape

(4000, 92)

##### Save columns of interest

In [19]:
bpd_PreCovid = bpd_PreCovid[['author', 
                             'title', 
                             'selftext',
                             'created_utc', 
                             'subreddit']]

### r/schizophrenia

In [20]:
schizophrenia_PreCovid = get_reddit_submissions('schizophrenia', 4000)

Total:: Success Rate: 100.00% - Requests: 40 - Batches: 4 - Items Remaining: 0


In [21]:
schizophrenia_PreCovid.shape

(4000, 83)

##### Save columns of interest

In [22]:
schizophrenia_PreCovid = schizophrenia_PreCovid[['author', 
                                                 'title', 
                                                 'selftext',
                                                 'created_utc', 
                                                 'subreddit']]

### r/autism

In [23]:
autism_PreCovid = get_reddit_submissions('autism', 4000)

Total:: Success Rate: 100.00% - Requests: 40 - Batches: 4 - Items Remaining: 0


In [24]:
autism_PreCovid.shape

(4000, 94)

##### Save columns of interest

In [25]:
autism_PreCovid = autism_PreCovid[['author', 
                                   'title', 
                                   'selftext',
                                   'created_utc', 
                                   'subreddit']]

### r/AnorexiaNervosa

In [26]:
anorexia_PreCovid = get_reddit_submissions('AnorexiaNervosa', 4000)

Total:: Success Rate: 100.00% - Requests: 48 - Batches: 5 - Items Remaining: 0


In [27]:
anorexia_PreCovid.shape

(4000, 80)

##### Save columns of interest

In [28]:
anorexia_PreCovid = anorexia_PreCovid[['author', 
                                       'title', 
                                       'selftext',
                                       'created_utc', 
                                       'subreddit']]

### r/Bulimia

In [29]:
bulimia_PreCovid = get_reddit_submissions('Bulimia', 4000)

Total:: Success Rate: 100.00% - Requests: 44 - Batches: 5 - Items Remaining: 0


In [30]:
bulimia_PreCovid.shape

(4000, 76)

##### Save columns of interest

In [31]:
bulimia_PreCovid = bulimia_PreCovid[['author', 
                                     'title', 
                                     'selftext',
                                     'created_utc', 
                                     'subreddit']]

### Combine data

#### Combine data to one large data frame

In [32]:
precovid_frames = [bulimia_PreCovid, 
                   anorexia_PreCovid,
                   autism_PreCovid,
                   schizophrenia_PreCovid,
                   bpd_PreCovid,
                   bipolar_PreCovid,
                   anxiety_PreCovid,
                   depression_PreCovid,
                   mentalhealth_PreCovid]

In [33]:
PreCovid_subreddits = pd.concat(precovid_frames)

#### Remove any 'authors' or 'body' rows where it says removed or deleted

In [34]:
# remove rows where 'author' == 'deleted' as these are posts that were deleted by the author
drop_rows1 = PreCovid_subreddits.loc[PreCovid_subreddits['author'] == '[deleted]'].index

In [35]:
# Replace 'deleted' and 'removed' with null and then re
PreCovid_subreddits.drop(drop_rows1, inplace=True)

In [36]:
PreCovid_subreddits.shape

(29961, 5)

## Scrape Data from Post-COVID Shutdown

In [37]:
before = int(dt.datetime(2021,3,1,0,0).timestamp()) # converts to UTC 
after = int(dt.datetime(2020,3,21,0,0).timestamp()) # converts to UTC # A chosen 'shutdown' date

### r/mentalhealth

In [38]:
mentalhealth_PostCovid = get_reddit_submissions('mentalhealth', 4000)

Total:: Success Rate: 100.00% - Requests: 40 - Batches: 4 - Items Remaining: 0


In [39]:
mentalhealth_PostCovid.shape 

(4000, 74)

##### Save columns of interest

In [40]:
mentalhealth_PostCovid = mentalhealth_PostCovid[['author', 
                                                 'title', 
                                                 'selftext', 
                                                 'created_utc', 
                                                 'subreddit']]

### r/depression

In [41]:
depression_PostCovid = get_reddit_submissions('depression', 4000)

Total:: Success Rate: 95.24% - Requests: 42 - Batches: 5 - Items Remaining: 0


In [42]:
depression_PostCovid.shape

(4000, 66)

##### Save columns of interest

In [43]:
depression_PostCovid = depression_PostCovid[['author', 
                                             'title', 
                                             'selftext', 
                                             'created_utc', 
                                             'subreddit']]

### r/Anxiety

In [44]:
anxiety_PostCovid = get_reddit_submissions('Anxiety', 4000)

Total:: Success Rate: 95.24% - Requests: 42 - Batches: 5 - Items Remaining: 0


In [45]:
anxiety_PostCovid.shape 

(4000, 70)

##### Save columns of interest

In [46]:
anxiety_PostCovid = anxiety_PostCovid[['author', 
                                       'title', 
                                       'selftext', 
                                       'created_utc', 
                                       'subreddit']]

### r/bipolar

In [47]:
bipolar_PostCovid = get_reddit_submissions('bipolar', 4000)

Total:: Success Rate: 74.07% - Requests: 54 - Batches: 6 - Items Remaining: 0


In [48]:
bipolar_PostCovid.shape

(4000, 80)

##### Save columns of interest

In [49]:
bipolar_PostCovid = bipolar_PostCovid[['author', 
                                       'title', 
                                       'selftext',
                                       'created_utc', 
                                       'subreddit']]

### r/BPD

In [50]:
bpd_PostCovid = get_reddit_submissions('BPD', 4000)

Total:: Success Rate: 61.54% - Requests: 65 - Batches: 7 - Items Remaining: 0


In [51]:
bpd_PostCovid.shape

(4000, 83)

##### Save columns of interest

In [52]:
bpd_PostCovid = bpd_PostCovid[['author',
                               'title', 
                               'selftext',
                               'created_utc', 
                               'subreddit']]

### r/schizophrenia

In [53]:
schizophrenia_PostCovid = get_reddit_submissions('schizophrenia', 4000)

Total:: Success Rate: 66.67% - Requests: 60 - Batches: 6 - Items Remaining: 0


In [54]:
schizophrenia_PostCovid.shape 

(4000, 82)

##### Save columns of interest

In [55]:
schizophrenia_PostCovid = schizophrenia_PostCovid[['author', 
                                                   'title', 
                                                   'selftext',
                                                   'created_utc',
                                                   'subreddit']]

### r/autism

In [56]:
autism_PostCovid = get_reddit_submissions('autism', 4000)

Total:: Success Rate: 66.67% - Requests: 60 - Batches: 6 - Items Remaining: 0


In [57]:
autism_PostCovid.shape

(4000, 83)

##### Save columns of interest

In [58]:
autism_PostCovid = autism_PostCovid[['author', 
                                     'title', 
                                     'selftext',
                                     'created_utc', 
                                     'subreddit']]

### r/AnorexiaNervosa

In [59]:
anorexia_PostCovid = get_reddit_submissions('AnorexiaNervosa', 4000)

Total:: Success Rate: 66.67% - Requests: 60 - Batches: 6 - Items Remaining: 0


In [60]:
anorexia_PostCovid.shape

(4000, 83)

##### Save columns of interest

In [61]:
anorexia_PostCovid = anorexia_PostCovid[['author', 
                                         'title', 
                                         'selftext',
                                         'created_utc', 
                                         'subreddit']]

### r/Bulimia

In [62]:
bulimia_PostCovid = get_reddit_submissions('Bulimia', 4000)

Total:: Success Rate: 68.75% - Requests: 64 - Batches: 7 - Items Remaining: 0


In [63]:
bulimia_PostCovid.shape

(4000, 84)

##### Save columns of interest

In [64]:
bulimia_PostCovid = bulimia_PostCovid[['author', 
                                       'title', 
                                       'selftext',
                                       'created_utc', 
                                       'subreddit']]

### Combine data

#### Combine data to one large data frame

In [65]:
postcovid_frames = [bulimia_PostCovid, 
                   anorexia_PostCovid,
                   autism_PostCovid,
                   schizophrenia_PostCovid,
                   bpd_PostCovid,
                   bipolar_PostCovid,
                   anxiety_PostCovid,
                   depression_PostCovid,
                   mentalhealth_PostCovid]

In [66]:
PostCovid_subreddits = pd.concat(postcovid_frames)

#### Remove any 'authors' or 'body' rows where it says removed or deleted

In [67]:
PostCovid_subreddits['author'].value_counts()

[deleted]          1003
Glitterx37           53
justacotton          51
AutoModerator        40
autgeb               33
                   ... 
IIluciferinII         1
jinisin176            1
1531004               1
Imstuck2456           1
AdorableStable1       1
Name: author, Length: 25424, dtype: int64

In [68]:
drop_rows2 = PostCovid_subreddits.loc[PostCovid_subreddits['author'] == '[deleted]'].index

In [69]:
# Replace 'deleted' and 'removed' with null and then re
PostCovid_subreddits.drop(drop_rows2, inplace = True)

In [70]:
PostCovid_subreddits.shape

(28098, 5)

#### Combine pre and post Covid data frames

In [71]:
# add column to each data frame saying 'pre or post covid'
PreCovid_subreddits['timeframe'] = 'pre-covid'
PostCovid_subreddits['timeframe'] = 'post-covid'

In [72]:
# Combine data frames
subreddit_posts_combined = pd.concat([PreCovid_subreddits, PostCovid_subreddits], axis=0)

In [73]:
subreddit_posts_combined.head()

Unnamed: 0,author,title,selftext,created_utc,subreddit,timeframe
2,LunsheaPyralis,Beginning my road to recovery...again,"I am 32, I was diagnosed as anorexic in high s...",1519141081,bulimia,pre-covid
3,lotrwisc,Relapse related to ex,I've had bulimia for 10 years. When I was in c...,1519096140,bulimia,pre-covid
6,collali699,Bulimia is Torture,Many people think it's so nice to eat and not ...,1518983066,bulimia,pre-covid
13,hrbitch,Drinking a lot,I have been purging with laxatives and purging...,1518572435,bulimia,pre-covid
14,firexsign,It sucks being a fat(ter) bulimic,"When I first became bulimic, it was after a pr...",1518569824,bulimia,pre-covid


#### Delete duplicates

In [75]:
subreddit_posts_combined.drop_duplicates(subset='selftext', keep="first", inplace=True)

#### Combine text columns

In [87]:
# Combine title with self text column and then check NAs again
subreddit_posts_combined['text'] = subreddit_posts_combined[['title', 
                                                             'selftext']].stack().groupby(level=0).agg(' : '.join)

# Drop the columns we combined
subreddit_posts_combined.drop(columns = ['title','selftext'], inplace=True)

#### Anonmyize


In [88]:
# Replace unique author id with integer.
subreddit_posts_combined['author'] = subreddit_posts_combined.assign(author=(np.unique(subreddit_posts_combined.author, 
                                          return_inverse=True)[1] + 1).astype(str))

In [89]:
# Add 'sub' infront of integer to remind ourselves that this is an id. 
subreddit_posts_combined['author'] = subreddit_posts_combined['author'].apply(lambda x: f"sub{x}")

#### Save to csv

In [93]:
# Save combined data
subreddit_posts_combined.to_csv('../data/subreddit_posts_combined.csv', 
                           header=True, 
                           index=False,
                           columns=list(subreddit_posts_combined.axes[1]))