In [4]:
import pandas as pd
import requests
import numpy as np
import time
from datetime import datetime, timedelta
import json 

In [5]:
# subreddit name and per-request result count used to build the exploratory query URL
sub, size = 'rockets', '500'

In [6]:
# Pushshift submission-search endpoint, filled in with the subreddit and page size chosen above
url = 'https://api.pushshift.io/reddit/search/submission?subreddit={}&size={}'.format(sub, size)

In [7]:
# Send the exploratory request. The timeout keeps the cell from hanging
# forever if the API never answers (requests has no default timeout).
r = requests.get(url, timeout=30)
# display the HTTP status code as the cell output
r.status_code

200

In [8]:
# decode the JSON body of the response; the posts are listed under its 'data' key
data = r.json()

In [9]:
# inspect the first returned post to see which fields are available to keep

data['data'][0] 

{'all_awardings': [],
 'allow_live_comments': False,
 'author': 'demon1212',
 'author_flair_css_class': None,
 'author_flair_richtext': [],
 'author_flair_text': None,
 'author_flair_type': 'text',
 'author_fullname': 't2_7is9z',
 'author_patreon_flair': False,
 'awarders': [],
 'can_mod_post': False,
 'contest_mode': False,
 'created_utc': 1571408845,
 'domain': 'i.redd.it',
 'full_link': 'https://www.reddit.com/r/rockets/comments/djo9jy/there_are_some_gms_that_dont_think_hardens_the/',
 'gildings': {},
 'id': 'djo9jy',
 'is_crosspostable': True,
 'is_meta': False,
 'is_original_content': False,
 'is_reddit_media_domain': True,
 'is_robot_indexable': True,
 'is_self': False,
 'is_video': False,
 'link_flair_background_color': '',
 'link_flair_richtext': [],
 'link_flair_text_color': 'dark',
 'link_flair_type': 'text',
 'locked': False,
 'media_only': False,
 'no_follow': True,
 'num_comments': 3,
 'num_crossposts': 0,
 'over_18': False,
 'parent_whitelist_status': 'all_ads',
 'permali

In [11]:
def pushshift(subreddit, post_type='submission', loops=1, size=500, skip=30):
    """Pull Reddit posts for one subreddit using the Pushshift API.

    One GET request is sent per loop. Request ``i`` (0-based) appends
    ``&after=<skip * i + 1>d`` to the search URL, so the day offset starts at
    1 and grows by ``skip`` on every loop. The records from all successful
    requests are combined, reduced to a fixed set of fields, de-duplicated and
    returned as a single DataFrame.

    Parameters
    ----------
    subreddit : str
        Name of the subreddit to search for.
    post_type : {'submission', 'comment'}
        Type of post to search for.
    loops : int
        Number of times to request posts.
    size : int
        Number of posts asked for per request.
    skip : int
        Number of days added to the ``after`` offset on each successive loop.

    Returns
    -------
    pandas.DataFrame or None
        One row per unique post, holding the selected fields plus a
        ``post_type`` column. Fields that no record provided are present as
        all-NaN columns, and an empty frame (with the same columns) is
        returned when nothing was collected. ``None`` is returned, after
        printing a message, when ``post_type`` is not recognised.
    """
    # data fields to return for submissions
    subfields = ['author', 'author_fullname', 'created_utc', 'id', 'num_comments', 'permalink',
                 'score', 'selftext', 'subreddit', 'title', 'url', 'is_self']
    # data fields to return for comments
    comfields = ['author', 'author_fullname', 'body', 'created_utc', 'id', 'parent_id',
                 'permalink', 'score', 'subreddit']

    # check before requesting data
    if post_type not in ['submission', 'comment']:
        print("post_type must be 'submission' or 'comment'")
        return None
    fields = subfields if post_type == 'submission' else comfields

    url_stem = "https://api.pushshift.io/reddit/search/{}/?subreddit={}&size={}".format(post_type, subreddit, size)
    # skip a minimum of 1 day
    after = 1
    # seconds to wait for the API before giving up on a single request
    timeout = 30
    # accumulates the raw post dictionaries from every request
    list_posts = []

    for i in range(loops):
        # add parameters to url to skip posts
        url = '{}&after={}d'.format(url_stem, skip * i + after)
        # monitor status as loops run
        print(i, url)
        res = requests.get(url, timeout=timeout)
        if res.status_code == 200:
            # a payload without a 'data' list simply contributes no posts
            list_posts.extend(res.json().get('data') or [])
        else:
            # keep what was already collected instead of failing the whole pull
            print('request failed with status', res.status_code, '- skipping', url)
        # pause between requests
        time.sleep(1)

    # Select the fields first (the raw records can hold list/dict values that
    # cannot be compared for duplicates). reindex tolerates fields that are
    # absent from every record, and an entirely empty result.
    df_posts = (
        pd.DataFrame(list_posts)
        .reindex(columns=fields)
        .drop_duplicates()
        # add a field identifying submissions or comments
        .assign(post_type=post_type)
    )

    return df_posts

In [45]:
# Make 20 requests for r/rockets submissions, report the frame's dimensions, and save it as CSV.
rockets_subs = pushshift(subreddit='rockets', post_type='submission', skip=1, loops=20)
print('shape', rockets_subs.shape)
rockets_subs.to_csv(path_or_buf='rockets_subs-pushshift.csv')

0 https://api.pushshift.io/reddit/search/submission/?subreddit=rockets&size=500&after=1d
1 https://api.pushshift.io/reddit/search/submission/?subreddit=rockets&size=500&after=2d
2 https://api.pushshift.io/reddit/search/submission/?subreddit=rockets&size=500&after=3d
3 https://api.pushshift.io/reddit/search/submission/?subreddit=rockets&size=500&after=4d
4 https://api.pushshift.io/reddit/search/submission/?subreddit=rockets&size=500&after=5d
5 https://api.pushshift.io/reddit/search/submission/?subreddit=rockets&size=500&after=6d
6 https://api.pushshift.io/reddit/search/submission/?subreddit=rockets&size=500&after=7d
7 https://api.pushshift.io/reddit/search/submission/?subreddit=rockets&size=500&after=8d
8 https://api.pushshift.io/reddit/search/submission/?subreddit=rockets&size=500&after=9d
9 https://api.pushshift.io/reddit/search/submission/?subreddit=rockets&size=500&after=10d
10 https://api.pushshift.io/reddit/search/submission/?subreddit=rockets&size=500&after=11d
11 https://api.pus

In [46]:
# Make 20 requests for r/nba submissions, report the frame's dimensions, and save it as CSV.
nba_subs = pushshift(subreddit='nba', post_type='submission', skip=1, loops=20)
print('shape', nba_subs.shape)
nba_subs.to_csv(path_or_buf='nba_subs-pushshift.csv')

0 https://api.pushshift.io/reddit/search/submission/?subreddit=nba&size=500&after=1d
1 https://api.pushshift.io/reddit/search/submission/?subreddit=nba&size=500&after=2d
2 https://api.pushshift.io/reddit/search/submission/?subreddit=nba&size=500&after=3d
3 https://api.pushshift.io/reddit/search/submission/?subreddit=nba&size=500&after=4d
4 https://api.pushshift.io/reddit/search/submission/?subreddit=nba&size=500&after=5d
5 https://api.pushshift.io/reddit/search/submission/?subreddit=nba&size=500&after=6d
6 https://api.pushshift.io/reddit/search/submission/?subreddit=nba&size=500&after=7d
7 https://api.pushshift.io/reddit/search/submission/?subreddit=nba&size=500&after=8d
8 https://api.pushshift.io/reddit/search/submission/?subreddit=nba&size=500&after=9d
9 https://api.pushshift.io/reddit/search/submission/?subreddit=nba&size=500&after=10d
10 https://api.pushshift.io/reddit/search/submission/?subreddit=nba&size=500&after=11d
11 https://api.pushshift.io/reddit/search/submission/?subreddi

In [47]:
# Make 20 requests for r/rockets comments, report the frame's dimensions, and save it as CSV.
rockets_coms = pushshift(subreddit='rockets', post_type='comment', skip=1, loops=20)
print('shape', rockets_coms.shape)
rockets_coms.to_csv(path_or_buf='rockets_coms-pushshift.csv')

0 https://api.pushshift.io/reddit/search/comment/?subreddit=rockets&size=500&after=1d
1 https://api.pushshift.io/reddit/search/comment/?subreddit=rockets&size=500&after=2d
2 https://api.pushshift.io/reddit/search/comment/?subreddit=rockets&size=500&after=3d
3 https://api.pushshift.io/reddit/search/comment/?subreddit=rockets&size=500&after=4d
4 https://api.pushshift.io/reddit/search/comment/?subreddit=rockets&size=500&after=5d
5 https://api.pushshift.io/reddit/search/comment/?subreddit=rockets&size=500&after=6d
6 https://api.pushshift.io/reddit/search/comment/?subreddit=rockets&size=500&after=7d
7 https://api.pushshift.io/reddit/search/comment/?subreddit=rockets&size=500&after=8d
8 https://api.pushshift.io/reddit/search/comment/?subreddit=rockets&size=500&after=9d
9 https://api.pushshift.io/reddit/search/comment/?subreddit=rockets&size=500&after=10d
10 https://api.pushshift.io/reddit/search/comment/?subreddit=rockets&size=500&after=11d
11 https://api.pushshift.io/reddit/search/comment/?

In [48]:
# Make 20 requests for r/nba comments, report the frame's dimensions, and save it as CSV.
nba_coms = pushshift(subreddit='nba', post_type='comment', skip=1, loops=20)
print('shape', nba_coms.shape)
nba_coms.to_csv(path_or_buf='nba_coms-pushshift.csv')

0 https://api.pushshift.io/reddit/search/comment/?subreddit=nba&size=500&after=1d
1 https://api.pushshift.io/reddit/search/comment/?subreddit=nba&size=500&after=2d
2 https://api.pushshift.io/reddit/search/comment/?subreddit=nba&size=500&after=3d
3 https://api.pushshift.io/reddit/search/comment/?subreddit=nba&size=500&after=4d
4 https://api.pushshift.io/reddit/search/comment/?subreddit=nba&size=500&after=5d
5 https://api.pushshift.io/reddit/search/comment/?subreddit=nba&size=500&after=6d
6 https://api.pushshift.io/reddit/search/comment/?subreddit=nba&size=500&after=7d
7 https://api.pushshift.io/reddit/search/comment/?subreddit=nba&size=500&after=8d
8 https://api.pushshift.io/reddit/search/comment/?subreddit=nba&size=500&after=9d
9 https://api.pushshift.io/reddit/search/comment/?subreddit=nba&size=500&after=10d
10 https://api.pushshift.io/reddit/search/comment/?subreddit=nba&size=500&after=11d
11 https://api.pushshift.io/reddit/search/comment/?subreddit=nba&size=500&after=12d
12 https://

In [49]:
# Stack the comment text and subreddit label from both comment frames
# (rockets first, then nba) under a fresh 0..n-1 index, and write it out without the index.
df = pd.concat(
    [coms[['body', 'subreddit']] for coms in (rockets_coms, nba_coms)],
    ignore_index=True,
)
df.to_csv('comments.csv', index=False)