# Scrape SubReddit Pages and Preliminary cleaning

In [1]:
import pandas as pd
import numpy as np
import requests
import json
import csv
import time
import datetime as dt
import pandas as pd
from pmaw import PushshiftAPI

## Define a function to scrape data from SubReddit pages

PMAW: Under the hood, pmaw makes several API requests to Pushshift that each return a maximum of 100 comments, with requests being subjected to a rate-limit of 60 requests per minute. 


https://medium.com/swlh/how-to-scrape-large-amounts-of-reddit-data-using-pushshift-1d33bde9286


In [2]:
api = PushshiftAPI()

In [3]:
# Function to scrape subreddit submissions (different than comments)

def get_reddit_submissions(subreddit, limit):
    subreddit = subreddit
    limit = limit
    posts = api.search_submissions(subreddit=subreddit, 
                                   limit=limit, 
                                   before=before, 
                                   after=after)
    post_list = [post for post in posts]
    df = pd.DataFrame(post_list)
    return df

## Scrape Data from Pre-COVID Shutdown

Using the Pushshift API, scrape reddit data from before Covid Shutdown for each subredit of interest (Using 3/20/2020 as that was the date New York and California shut down)

In [4]:
before = int(dt.datetime(2020,3,20,0,0).timestamp()) # converts to UTC # A chosen 'shutdown' date
after = int(dt.datetime(2017,3,20,0,0).timestamp()) # converts to UTC 

### r/mentalhealth

#### Scrape comments

In [5]:
mentalhealth_PreCovid = get_reddit_submissions('mentalhealth', 5000)

Total:: Success Rate: 100.00% - Requests: 50 - Batches: 5 - Items Remaining: 0


In [6]:
# Check the shape to make sure all rows are there
mentalhealth_PreCovid.shape

(5000, 84)

#### Save columns of interest

In [7]:
# Keep columns: 'author', 'body', 'created_utc', 'subreddit'
mentalhealth_PreCovid = mentalhealth_PreCovid[['author', 
                                               'title', 
                                               'selftext',
                                               'created_utc', 
                                               'subreddit']]

#### Combine title with self text column 

In [9]:
mentalhealth_PreCovid['text'] = mentalhealth_PreCovid[['title', 
                                                       'selftext']].stack().groupby(level=0).agg(' : '.join)

# Drop the columns we combined
mentalhealth_PreCovid.drop(columns = ['title','selftext'], inplace=True)

#### Remove any 'authors' or 'body' rows where it says removed or deleted

In [15]:
drop_rows2 = mentalhealth_PreCovid.loc[mentalhealth_PreCovid['author'] == '[deleted]'].index

In [16]:
# Replace 'deleted' and 'removed' with null and then re
mentalhealth_PreCovid.drop(drop_rows2, inplace = True)

#### Drop duplicates

In [20]:
mentalhealth_PreCovid = mentalhealth_PreCovid.drop_duplicates(subset='text', keep="first")

In [21]:
mentalhealth_PreCovid.shape

(4631, 4)

### r/depression

#### Scrape comments

In [27]:
depression_PreCovid = get_reddit_submissions('depression', 5000)

Total:: Success Rate: 74.63% - Requests: 67 - Batches: 7 - Items Remaining: 0


In [28]:
depression_PreCovid.shape

(5000, 78)

#### Save columns of interest

In [29]:
depression_PreCovid = depression_PreCovid[['author', 
                                           'title', 
                                           'selftext',
                                           'created_utc', 
                                           'subreddit']]

#### Combine title with self text column and then check NAs again

In [39]:
depression_PreCovid['text'] = depression_PreCovid[['title', 
                                                   'selftext']].stack().groupby(level=0).agg(' : '.join)

# Drop the columns we combined
depression_PreCovid.drop(columns = ['title','selftext'], inplace=True)

#### Remove any 'authors' or 'body' rows where it says removed or deleted

In [40]:
drop_rows = depression_PreCovid.loc[depression_PreCovid['author'] == '[deleted]'].index

In [41]:
# Replace 'deleted' and 'removed' with null and then re
depression_PreCovid.drop(drop_rows, inplace = True)

#### Drop duplicates

In [42]:
depression_PreCovid = depression_PreCovid.drop_duplicates(subset='text', keep="first")

In [43]:
depression_PreCovid.shape

(4521, 4)

### r/Anxiety

#### Scrape comments

In [44]:
anxiety_PreCovid = get_reddit_submissions('Anxiety', 5000)

Total:: Success Rate: 90.91% - Requests: 55 - Batches: 6 - Items Remaining: 0


In [45]:
anxiety_PreCovid.shape

(5000, 87)

#### Save columns of interest

In [46]:
# Keep columns: 'author', 'body', 'created_utc', 'subreddit'
anxiety_PreCovid = anxiety_PreCovid[['author', 
                                     'title', 
                                     'selftext',
                                     'created_utc', 
                                     'subreddit']]

#### Combine title with self text column

In [49]:
anxiety_PreCovid['text'] = anxiety_PreCovid[['title', 
                                            'selftext']].stack().groupby(level=0).agg(' : '.join)

# Drop the columns we combined
anxiety_PreCovid.drop(columns = ['title','selftext'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


#### Remove any 'authors' or 'body' rows where it says removed or deleted

In [50]:
drop_rows = anxiety_PreCovid.loc[anxiety_PreCovid['author'] == '[deleted]'].index

In [51]:
# Replace 'deleted' and 'removed' with null and then re
anxiety_PreCovid.drop(drop_rows, inplace = True)

#### Drop duplicates

In [52]:
anxiety_PreCovid = anxiety_PreCovid.drop_duplicates(subset='text', keep="first")

In [53]:
anxiety_PreCovid.shape

(4581, 4)

### r/bipolar

#### Scrape comments

In [47]:
bipolar_PreCovid = get_reddit_submissions('bipolar', 5000)

Total:: Success Rate: 100.00% - Requests: 50 - Batches: 5 - Items Remaining: 0


In [48]:
bipolar_PreCovid.shape

(5000, 88)

#### Save columns of interest

In [55]:
bipolar_PreCovid = bipolar_PreCovid[['author', 
                                     'title', 
                                     'selftext',
                                     'created_utc', 
                                     'subreddit']]

#### Combine title with self text column

In [56]:
bipolar_PreCovid['text'] = bipolar_PreCovid[['title', 
                                            'selftext']].stack().groupby(level=0).agg(' : '.join)

# Drop the columns we combined
bipolar_PreCovid.drop(columns = ['title','selftext'], inplace=True)

#### Remove any 'authors' or 'body' rows where it says removed or deleted

In [57]:
drop_rows = bipolar_PreCovid.loc[bipolar_PreCovid['author'] == '[deleted]'].index

In [58]:
# Replace 'deleted' and 'removed' with null and then re
bipolar_PreCovid.drop(drop_rows, inplace = True)

#### Drop duplicates

In [59]:
bipolar_PreCovid = bipolar_PreCovid.drop_duplicates(subset='text', keep="first")

In [60]:
bipolar_PreCovid.shape

(4632, 4)

### r/BPD

##### Scrape comments

In [62]:
bpd_PreCovid = get_reddit_submissions('BPD', 5000)

Total:: Success Rate: 100.00% - Requests: 50 - Batches: 5 - Items Remaining: 0


In [63]:
bpd_PreCovid.shape

(5000, 95)

##### Save columns of interest

In [64]:
bpd_PreCovid = bpd_PreCovid[['author', 
                             'title', 
                             'selftext',
                             'created_utc', 
                             'subreddit']]

#### Combine title with self text column

In [65]:
bpd_PreCovid['text'] = bpd_PreCovid[['title', 
                                            'selftext']].stack().groupby(level=0).agg(' : '.join)

# Drop the columns we combined
bpd_PreCovid.drop(columns = ['title','selftext'], inplace=True)

#### Remove any 'authors' or 'body' rows where it says removed or deleted

In [66]:
drop_rows = bpd_PreCovid.loc[bpd_PreCovid['author'] == '[deleted]'].index

In [67]:
# Replace 'deleted' and 'removed' with null and then re
bpd_PreCovid.drop(drop_rows, inplace = True)

#### Drop duplicates

In [68]:
bpd_PreCovid = bpd_PreCovid.drop_duplicates(subset='text', keep="first")

In [69]:
bpd_PreCovid.shape

(4553, 4)

### r/schizophrenia

In [70]:
schizophrenia_PreCovid = get_reddit_submissions('schizophrenia', 5000)

Checkpoint:: Success Rate: 54.35% - Requests: 92 - Batches: 10 - Items Remaining: 0
Total:: Success Rate: 54.35% - Requests: 92 - Batches: 10 - Items Remaining: 0


In [71]:
schizophrenia_PreCovid.shape

(5000, 89)

##### Save columns of interest

In [72]:
schizophrenia_PreCovid = schizophrenia_PreCovid[['author', 
                                                 'title', 
                                                 'selftext',
                                                 'created_utc', 
                                                 'subreddit']]

#### Combine title with self text column

In [75]:
schizophrenia_PreCovid['text'] = schizophrenia_PreCovid[['title', 
                                                         'selftext']].stack().groupby(level=0).agg(' : '.join)

# Drop the columns we combined
schizophrenia_PreCovid.drop(columns = ['title','selftext'], inplace=True)

#### Remove any 'authors' or 'body' rows where it says removed or deleted

In [76]:
drop_rows = schizophrenia_PreCovid.loc[schizophrenia_PreCovid['author'] == '[deleted]'].index

In [77]:
# Replace 'deleted' and 'removed' with null and then re
schizophrenia_PreCovid.drop(drop_rows, inplace = True)

#### Drop duplicates

In [78]:
schizophrenia_PreCovid = schizophrenia_PreCovid.drop_duplicates(subset='text', keep="first")

In [79]:
schizophrenia_PreCovid.shape

(4776, 4)

### r/autism

In [80]:
autism_PreCovid = get_reddit_submissions('autism', 5000)

Total:: Success Rate: 100.00% - Requests: 50 - Batches: 5 - Items Remaining: 0


In [81]:
autism_PreCovid.shape

(5000, 98)

##### Save columns of interest

In [82]:
autism_PreCovid = autism_PreCovid[['author', 
                                   'title', 
                                   'selftext',
                                   'created_utc', 
                                   'subreddit']]

#### Combine title with self text column

In [83]:
autism_PreCovid['text'] = autism_PreCovid[['title', 
                                           'selftext']].stack().groupby(level=0).agg(' : '.join)

# Drop the columns we combined
autism_PreCovid.drop(columns = ['title','selftext'], inplace=True)

#### Remove any 'authors' or 'body' rows where it says removed or deleted

In [84]:
drop_rows = autism_PreCovid.loc[autism_PreCovid['author'] == '[deleted]'].index

In [85]:
# Replace 'deleted' and 'removed' with null and then re
autism_PreCovid.drop(drop_rows, inplace = True)

#### Drop duplicates

In [86]:
autism_PreCovid = autism_PreCovid.drop_duplicates(subset='text', keep="first")

In [87]:
autism_PreCovid.shape

(4682, 4)

### r/AnorexiaNervosa

In [89]:
anorexia_PreCovid = get_reddit_submissions('AnorexiaNervosa', 5000)

Total:: Success Rate: 100.00% - Requests: 61 - Batches: 7 - Items Remaining: 0


In [90]:
anorexia_PreCovid.shape

(5000, 87)

##### Save columns of interest

In [91]:
anorexia_PreCovid = anorexia_PreCovid[['author', 
                                       'title', 
                                       'selftext',
                                       'created_utc', 
                                       'subreddit']]

#### Combine title with self text column

In [92]:
anorexia_PreCovid['text'] = anorexia_PreCovid[['title', 
                                               'selftext']].stack().groupby(level=0).agg(' : '.join)

# Drop the columns we combined
anorexia_PreCovid.drop(columns = ['title','selftext'], inplace=True)

#### Remove any 'authors' or 'body' rows where it says removed or deleted

In [93]:
drop_rows = anorexia_PreCovid.loc[anorexia_PreCovid['author'] == '[deleted]'].index

In [94]:
# Replace 'deleted' and 'removed' with null and then re
anorexia_PreCovid.drop(drop_rows, inplace = True)

#### Drop duplicates

In [95]:
anorexia_PreCovid = anorexia_PreCovid.drop_duplicates(subset='text', keep="first")

In [96]:
anorexia_PreCovid.shape

(4976, 4)

### r/Bulimia

In [97]:
bulimia_PreCovid = get_reddit_submissions('Bulimia', 5000)

Total:: Success Rate: 100.00% - Requests: 59 - Batches: 6 - Items Remaining: 0


In [98]:
bulimia_PreCovid.shape

(5000, 85)

##### Save columns of interest

In [99]:
bulimia_PreCovid = bulimia_PreCovid[['author', 
                                     'title', 
                                     'selftext',
                                     'created_utc', 
                                     'subreddit']]

#### Combine title with self text column

In [100]:
bulimia_PreCovid['text'] = bulimia_PreCovid[['title', 
                                             'selftext']].stack().groupby(level=0).agg(' : '.join)

# Drop the columns we combined
bulimia_PreCovid.drop(columns = ['title','selftext'], inplace=True)

#### Remove any 'authors' or 'body' rows where it says removed or deleted

In [101]:
drop_rows = bulimia_PreCovid.loc[bulimia_PreCovid['author'] == '[deleted]'].index

In [102]:
# Replace 'deleted' and 'removed' with null and then re
bulimia_PreCovid.drop(drop_rows, inplace = True)

#### Drop duplicates

In [103]:
bulimia_PreCovid = bulimia_PreCovid.drop_duplicates(subset='text', keep="first")

In [104]:
bulimia_PreCovid.shape

(4906, 4)

### Combine data

#### Combine data to one large data frame

In [105]:
precovid_frames = [bulimia_PreCovid, 
                   anorexia_PreCovid,
                   autism_PreCovid,
                   schizophrenia_PreCovid,
                   bpd_PreCovid,
                   bipolar_PreCovid,
                   anxiety_PreCovid,
                   depression_PreCovid,
                   mentalhealth_PreCovid]

In [106]:
PreCovid_subreddits = pd.concat(precovid_frames)

In [107]:
PreCovid_subreddits.shape

(42258, 4)

## Scrape Data from Post-COVID Shutdown

In [108]:
before = int(dt.datetime(2021,3,12,0,0).timestamp()) # converts to UTC 
after = int(dt.datetime(2020,3,20,0,0).timestamp()) # converts to UTC # A chosen 'shutdown' date

### r/mentalhealth

In [109]:
mentalhealth_PostCovid = get_reddit_submissions('mentalhealth', 5000)

Total:: Success Rate: 100.00% - Requests: 50 - Batches: 5 - Items Remaining: 0


In [110]:
mentalhealth_PostCovid.shape 

(5000, 76)

##### Save columns of interest

In [111]:
mentalhealth_PostCovid = mentalhealth_PostCovid[['author', 
                                                 'title', 
                                                 'selftext', 
                                                 'created_utc', 
                                                 'subreddit']]

#### Combine title with self text column

In [112]:
mentalhealth_PostCovid['text'] = mentalhealth_PostCovid[['title', 
                                                         'selftext']].stack().groupby(level=0).agg(' : '.join)

# Drop the columns we combined
mentalhealth_PostCovid.drop(columns = ['title','selftext'], inplace=True)

#### Remove any 'authors' or 'body' rows where it says removed or deleted

In [113]:
drop_rows = mentalhealth_PostCovid.loc[mentalhealth_PostCovid['author'] == '[deleted]'].index

In [114]:
# Replace 'deleted' and 'removed' with null and then re
mentalhealth_PostCovid.drop(drop_rows, inplace = True)

#### Drop duplicates

In [115]:
mentalhealth_PostCovid = mentalhealth_PostCovid.drop_duplicates(subset='text', keep="first")

In [116]:
mentalhealth_PostCovid.shape

(4819, 4)

### r/depression

In [117]:
depression_PostCovid = get_reddit_submissions('depression', 5000)

Total:: Success Rate: 100.00% - Requests: 50 - Batches: 5 - Items Remaining: 0


In [118]:
depression_PostCovid.shape

(5000, 67)

##### Save columns of interest

In [119]:
depression_PostCovid = depression_PostCovid[['author', 
                                             'title', 
                                             'selftext', 
                                             'created_utc', 
                                             'subreddit']]

#### Combine title with self text column

In [120]:
depression_PostCovid['text'] = depression_PostCovid[['title', 
                                                     'selftext']].stack().groupby(level=0).agg(' : '.join)

# Drop the columns we combined
depression_PostCovid.drop(columns = ['title','selftext'], inplace=True)

#### Remove any 'authors' or 'body' rows where it says removed or deleted

In [121]:
drop_rows = depression_PostCovid.loc[depression_PostCovid['author'] == '[deleted]'].index

In [122]:
# Replace 'deleted' and 'removed' with null and then re
depression_PostCovid.drop(drop_rows, inplace = True)

#### Drop duplicates

In [123]:
depression_PostCovid = depression_PostCovid.drop_duplicates(subset='text', keep="first")

In [124]:
depression_PostCovid.shape

(4797, 4)

### r/Anxiety

In [125]:
anxiety_PostCovid = get_reddit_submissions('Anxiety', 5000)

Total:: Success Rate: 65.79% - Requests: 76 - Batches: 8 - Items Remaining: 0


In [126]:
anxiety_PostCovid.shape 

(5000, 70)

##### Save columns of interest

In [127]:
anxiety_PostCovid = anxiety_PostCovid[['author', 
                                       'title', 
                                       'selftext', 
                                       'created_utc', 
                                       'subreddit']]

#### Combine title with self text column

In [129]:
anxiety_PostCovid['text'] = anxiety_PostCovid[['title', 
                                               'selftext']].stack().groupby(level=0).agg(' : '.join)

# Drop the columns we combined
anxiety_PostCovid.drop(columns = ['title','selftext'], inplace=True)

#### Remove any 'authors' or 'body' rows where it says removed or deleted

In [130]:
drop_rows = anxiety_PostCovid.loc[anxiety_PostCovid['author'] == '[deleted]'].index

In [131]:
# Replace 'deleted' and 'removed' with null and then re
anxiety_PostCovid.drop(drop_rows, inplace = True)

#### Drop duplicates

In [132]:
anxiety_PostCovid = anxiety_PostCovid.drop_duplicates(subset='text', keep="first")

In [133]:
anxiety_PostCovid.shape

(4874, 4)

### r/bipolar

In [134]:
bipolar_PostCovid = get_reddit_submissions('bipolar', 5000)

Total:: Success Rate: 66.67% - Requests: 75 - Batches: 8 - Items Remaining: 0


In [135]:
bipolar_PostCovid.shape

(5000, 81)

##### Save columns of interest

In [136]:
bipolar_PostCovid = bipolar_PostCovid[['author', 
                                       'title', 
                                       'selftext',
                                       'created_utc', 
                                       'subreddit']]

#### Combine title with self text column

In [138]:
bipolar_PostCovid['text'] = bipolar_PostCovid[['title', 
                                               'selftext']].stack().groupby(level=0).agg(' : '.join)

# Drop the columns we combined
bipolar_PostCovid.drop(columns = ['title','selftext'], inplace=True)

#### Remove any 'authors' or 'body' rows where it says removed or deleted

In [139]:
drop_rows = bipolar_PostCovid.loc[bipolar_PostCovid['author'] == '[deleted]'].index

In [140]:
# Replace 'deleted' and 'removed' with null and then re
bipolar_PostCovid.drop(drop_rows, inplace = True)

#### Drop duplicates

In [141]:
bipolar_PostCovid = bipolar_PostCovid.drop_duplicates(subset='text', keep="first")

In [142]:
bipolar_PostCovid.shape

(4862, 4)

### r/BPD

In [143]:
bpd_PostCovid = get_reddit_submissions('BPD', 5000)

Checkpoint:: Success Rate: 48.00% - Requests: 100 - Batches: 10 - Items Remaining: 200
Total:: Success Rate: 49.02% - Requests: 102 - Batches: 11 - Items Remaining: 0


In [144]:
bpd_PostCovid.shape

(5000, 81)

##### Save columns of interest

In [145]:
bpd_PostCovid = bpd_PostCovid[['author',
                               'title', 
                               'selftext',
                               'created_utc', 
                               'subreddit']]

#### Combine title with self text column

In [146]:
bpd_PostCovid['text'] = bpd_PostCovid[['title','selftext']].stack().groupby(level=0).agg(' : '.join)

# Drop the columns we combined
bpd_PostCovid.drop(columns = ['title','selftext'], inplace=True)

#### Remove any 'authors' or 'body' rows where it says removed or deleted

In [147]:
drop_rows = bpd_PostCovid.loc[bpd_PostCovid['author'] == '[deleted]'].index

In [148]:
# Replace 'deleted' and 'removed' with null and then re
bpd_PostCovid.drop(drop_rows, inplace = True)

#### Drop duplicates

In [149]:
bpd_PostCovid = bpd_PostCovid.drop_duplicates(subset='text', keep="first")

In [150]:
bpd_PostCovid.shape

(4792, 4)

### r/schizophrenia

In [151]:
schizophrenia_PostCovid = get_reddit_submissions('schizophrenia', 5000)

Total:: Success Rate: 75.76% - Requests: 66 - Batches: 7 - Items Remaining: 0


In [152]:
schizophrenia_PostCovid.shape 

(5000, 82)

##### Save columns of interest

In [153]:
schizophrenia_PostCovid = schizophrenia_PostCovid[['author', 
                                                   'title', 
                                                   'selftext',
                                                   'created_utc',
                                                   'subreddit']]

#### Combine title with self text column

In [155]:
schizophrenia_PostCovid['text'] = schizophrenia_PostCovid[['title',
                                                           'selftext']].stack().groupby(level=0).agg(' : '.join)

# Drop the columns we combined
schizophrenia_PostCovid.drop(columns = ['title','selftext'], inplace=True)

#### Remove any 'authors' or 'body' rows where it says removed or deleted

In [156]:
drop_rows = schizophrenia_PostCovid.loc[schizophrenia_PostCovid['author'] == '[deleted]'].index

In [157]:
# Replace 'deleted' and 'removed' with null and then re
schizophrenia_PostCovid.drop(drop_rows, inplace = True)

#### Drop duplicates

In [158]:
schizophrenia_PostCovid = schizophrenia_PostCovid.drop_duplicates(subset='text', keep="first")

In [160]:
schizophrenia_PostCovid.shape

(4773, 4)

### r/autism

In [161]:
autism_PostCovid = get_reddit_submissions('autism', 5000)

Total:: Success Rate: 96.15% - Requests: 52 - Batches: 6 - Items Remaining: 0


In [162]:
autism_PostCovid.shape

(5000, 82)

##### Save columns of interest

In [163]:
autism_PostCovid = autism_PostCovid[['author', 
                                     'title', 
                                     'selftext',
                                     'created_utc', 
                                     'subreddit']]

#### Combine title with self text column

In [167]:
autism_PostCovid['text'] = autism_PostCovid[['title','selftext']].stack().groupby(level=0).agg(' : '.join)

# Drop the columns we combined
autism_PostCovid.drop(columns = ['title','selftext'], inplace=True)

#### Remove any 'authors' or 'body' rows where it says removed or deleted

In [179]:
drop_rows = autism_PostCovid.loc[autism_PostCovid['author'] == '[deleted]'].index

In [180]:
# Replace 'deleted' and 'removed' with null and then re
autism_PostCovid.drop(drop_rows, inplace = True)

#### Drop duplicates

In [181]:
autism_PostCovid  = autism_PostCovid.drop_duplicates(subset='text', keep="first")

In [182]:
autism_PostCovid .shape

(4597, 4)

### r/AnorexiaNervosa

In [173]:
anorexia_PostCovid = get_reddit_submissions('AnorexiaNervosa', 5000)

Total:: Success Rate: 100.00% - Requests: 51 - Batches: 6 - Items Remaining: 0


In [174]:
anorexia_PostCovid.shape

(5000, 81)

##### Save columns of interest

In [None]:
anorexia_PostCovid = anorexia_PostCovid[['author', 
                                         'title', 
                                         'selftext',
                                         'created_utc', 
                                         'subreddit']]

#### Combine title with self text column

In [None]:
anorexia_PostCovid['text'] = anorexia_PostCovid[['title','selftext']].stack().groupby(level=0).agg(' : '.join)

# Drop the columns we combined
anorexia_PostCovid.drop(columns = ['title','selftext'], inplace=True)

#### Remove any 'authors' or 'body' rows where it says removed or deleted

In [211]:
drop_rows = anorexia_PostCovid.loc[anorexia_PostCovid['author'] == '[deleted]'].index

In [212]:
# Replace 'deleted' and 'removed' with null and then re
anorexia_PostCovid.drop(drop_rows, inplace = True)

#### Drop duplicates

In [214]:
anorexia_PostCovid = anorexia_PostCovid.drop_duplicates(subset='text', keep="first")

In [215]:
anorexia_PostCovid.shape

(4757, 4)

### r/Bulimia

In [183]:
bulimia_PostCovid = get_reddit_submissions('Bulimia', 5000)

Total:: Success Rate: 100.00% - Requests: 65 - Batches: 7 - Items Remaining: 0


In [184]:
bulimia_PostCovid.shape

(5000, 84)

##### Save columns of interest

In [185]:
bulimia_PostCovid = bulimia_PostCovid[['author', 
                                       'title', 
                                       'selftext',
                                       'created_utc', 
                                       'subreddit']]

#### Combine title with self text column

In [186]:
bulimia_PostCovid['text'] = bulimia_PostCovid[['title','selftext']].stack().groupby(level=0).agg(' : '.join)

# Drop the columns we combined
bulimia_PostCovid.drop(columns = ['title','selftext'], inplace=True)

#### Remove any 'authors' or 'body' rows where it says removed or deleted

In [187]:
drop_rows = bulimia_PostCovid.loc[bulimia_PostCovid['author'] == '[deleted]'].index

In [188]:
# Replace 'deleted' and 'removed' with null and then re
bulimia_PostCovid.drop(drop_rows, inplace = True)

#### Drop duplicates

In [189]:
bulimia_PostCovid = bulimia_PostCovid.drop_duplicates(subset='text', keep="first")

In [190]:
bulimia_PostCovid.shape

(4871, 4)

### Combine data

#### Combine data to one large data frame

In [218]:
postcovid_frames = [bulimia_PostCovid, 
                   anorexia_PostCovid,
                   autism_PostCovid,
                   schizophrenia_PostCovid,
                   bpd_PostCovid,
                   bipolar_PostCovid,
                   anxiety_PostCovid,
                   depression_PostCovid,
                   mentalhealth_PostCovid]

In [219]:
PostCovid_subreddits = pd.concat(postcovid_frames)

#### Combine pre and post Covid data frames

In [225]:
# add column to each data frame saying 'pre or post covid'
PreCovid_subreddits['timeframe'] = 'pre-covid'
PostCovid_subreddits['timeframe'] = 'post-covid'

In [226]:
# Combine data frames
subreddit_posts_combined = pd.concat([PreCovid_subreddits, PostCovid_subreddits], axis=0)

#### Anonmyize


In [228]:
# Replace unique author id with integer.
subreddit_posts_combined['author'] = subreddit_posts_combined.assign(author=(np.unique(subreddit_posts_combined.author, 
                                          return_inverse=True)[1] + 1).astype(str))

In [229]:
# Add 'sub' infront of integer to remind ourselves that this is an id. 
subreddit_posts_combined['author'] = subreddit_posts_combined['author'].apply(lambda x: f"sub{x}")

#### Save to csv

In [230]:
# Save combined data
subreddit_posts_combined.to_csv('../data/subreddit_posts_combined.csv',
                                index=False,
                                columns=list(subreddit_posts_combined.axes[1]))