# Using the Pushshift API to Scrape Data from r/parenting and r/pregnant

In [1]:
#Imports
import requests
import pandas as pd 
import numpy as np
import time 

## Initial Scrape for Posts

In [2]:
# Pushshift endpoint for searching Reddit submissions. On its own it covers all of
# Reddit, so the search still has to be narrowed down to a specific subreddit.
api_url = "https://api.pushshift.io/reddit/search/submission"

In [3]:
# Query parameters for the request; the keys can be mapped to the parameters
# listed in the Pushshift API documentation. Includes the page size and a
# timestamp cutoff in addition to the subreddit filter.
params = dict(
    subreddit='pregnant',
    size=500,
    before=1635633944,
)

In [4]:
# Send the request and display the HTTP status code to confirm it went through
res = requests.get(url=api_url, params=params)
res.status_code

200

In [5]:
# Decode the JSON body of the response; the posts sit under its 'data' key
data = res.json()
posts = data["data"]

In [6]:
# Load the scraped posts into a DataFrame and preview the first three rows
df = pd.DataFrame(data=posts)
df.head(n=3)

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_is_blocked,author_patreon_flair,...,whitelist_status,wls,author_flair_template_id,author_flair_text_color,crosspost_parent,crosspost_parent_list,url_overridden_by_dest,post_hint,preview,suggested_sort
0,[],False,401RG,,[],,text,t2_mbjf99i,False,False,...,no_ads,0.0,,,,,,,,
1,[],False,antkcia,,[],,text,t2_4zoalkas,False,False,...,no_ads,0.0,,,,,,,,
2,[],False,xoxoskylor,,[],,text,t2_12r8fegt,False,False,...,no_ads,0.0,,,,,,,,


In [7]:
# Preview the post text column for the first three posts
df.loc[:, ['selftext']].head(n=3)

Unnamed: 0,selftext
0,39w1d. He’s been VERY active lately. Has recen...
1,How much is the norm for newborn studio posed ...
2,***Added trigger warning just in case***\n\nHe...


## Important columns for use in function and/or in data analysis and modeling:
* author (author of the post)
* subreddit (subreddit the posts were scraped from) 
    * will be used for data check and classification modeling
* title (title of the post)
    * use as additional text data if needed
* selftext (the text of the post)
    * main avenue of text data
* created_utc (Unix timestamp of when the post was submitted to Reddit)
    * needed by the Pushshift function so that each consecutive scrape requests only posts created before the earliest `created_utc` returned by the previous pull, which keeps every pull fetching new data.

## Pushshift API Function

In [8]:
#reddit url and params set to scrape 
#function modified from code that Jeffrey Floyd Sent to assist with scraping data 
#Mark Harris also assisted with my understanding of the errors in my initial webscrape attempts that were not consistently pulling unique posts
def get_reddit_submissions(subreddit, size = 100, before = None, timeout = 30):
    """Fetch one page of submissions for a subreddit from the Pushshift API.

    Parameters
    ----------
    subreddit : str
        Name of the subreddit to search, sent as the `subreddit` query parameter.
    size : int, default 100
        Number of submissions requested, sent as the `size` query parameter.
    before : int or str, optional
        Cutoff sent as the `before` query parameter. The notebook passes
        `created_utc` values here, so each call asks for older posts.
    timeout : float, default 30
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang a long scrape.

    Returns
    -------
    pandas.DataFrame
        One row per submission with the columns 'subreddit', 'title',
        'selftext', 'author' and 'created_utc'. An empty page yields an empty
        frame with those columns; a column absent from the response is filled
        with NaN.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status code.
    KeyError
        If the decoded response has no 'data' key.
    """
    # URL used to scrape submission data from Pushshift
    api_url = 'https://api.pushshift.io/reddit/search/submission'
    wanted_columns = ['subreddit', 'title', 'selftext', 'author', 'created_utc']
    params = {
        'subreddit' : subreddit,
        'size' : size,
        'before' : before
    }

    res = requests.get(url=api_url, params=params, timeout=timeout)
    # Fail loudly on an HTTP error instead of trying to parse an error page as JSON
    res.raise_for_status()
    data = res.json()

    posts = data['data']

    # reindex (rather than df[[...]]) keeps the schema stable when the page is
    # empty or a column is missing, instead of raising KeyError
    return pd.DataFrame(posts).reindex(columns=wanted_columns)
    

In [9]:
#function modified from code that Jeffrey Floyd Sent to assist me in this project
def bulk_scrape(sub, loop = 10, utc = None, delay = 2):
    """Collect several consecutive pages of submissions from one subreddit.

    Each pull asks for posts created before the smallest `created_utc` seen in
    the previous pull, so successive pages walk backwards in time.

    Parameters
    ----------
    sub : str
        Subreddit name without the leading 'r/'.
    loop : int, default 10
        Maximum number of pulls (API requests). One pull is always made.
    utc : int or str, optional
        Cutoff passed as `before` to the first pull.
    delay : float, default 2
        Seconds to pause before every pull after the first.

    Returns
    -------
    pandas.DataFrame
        All non-empty pages stacked in the order they were fetched. Row labels
        are not renumbered, so they repeat from page to page.
    """
    page = get_reddit_submissions(subreddit=sub, before=utc)
    # Pages are gathered in a list and concatenated once at the end, which avoids
    # re-copying the whole accumulated frame on every pull
    pages = [page]

    for i in range(loop - 1):
        # An empty page means there is nothing older to fetch. Stopping here also
        # keeps NaN (the min of an empty column) from being sent as `before`.
        if page.empty:
            break

        oldest_utc = page['created_utc'].min()

        # Pause between consecutive requests
        time.sleep(delay)

        page = get_reddit_submissions(subreddit=sub, before=oldest_utc)
        if not page.empty:
            pages.append(page)

        # i + 2 pulls are done at this point: the initial one plus i + 1 follow-ups
        print(f'{sub} progress is {int((i + 2)/loop * 100)}%', end ='\r', flush= True)

    bulk_df = pd.concat(pages, axis=0)

    print(f'r/{sub} data collection has completed! Size is {bulk_df.shape}')
    return bulk_df

## Pushshift API Function Results

### r/pregnant Posts (or submissions)

In [92]:
# Scrape r/pregnant in 1000 pulls, starting from the given timestamp cutoff
preg_sub_df = bulk_scrape(
    sub='pregnant',
    loop=1000,
    utc='1635641743',
)

r/pregnant data collection has completed! Size is (99978, 5)


### r/parenting Posts (or submissions)

In [101]:
# Scrape r/parenting in 1000 pulls, starting from the given timestamp cutoff
parent_sub_df = bulk_scrape(
    sub='parenting',
    loop=1000,
    utc='1635641743',
)

r/parenting data collection has completed! Size is (99988, 5)
