## Setup

In [1]:
import numpy as np
import pandas as pd
import requests
import time
import datetime as dt
import json

In [2]:
# TODO: avoid skipping posts between requests.
# Idea: instead of stepping back a fixed number of days, set 'after' for each
# request from the last post returned by the previous iteration of the loop?

## Pushshift search function

In [3]:
# References used to create this code: 
# https://github.com/pushshift/api
# Brian Collins, General Assembly instructor

def pushshift(subreddit, post_type='submission', loops=1, size=500, skip=30):
    """Download posts from one subreddit via the Pushshift search API.

    One GET request is issued per loop. Request ``i`` (starting at 0) uses
    ``after=<skip * i + 1>d``, so each request's window starts ``skip`` days
    earlier than the previous one, with a minimum offset of 1 day.

    Parameters
    ----------
    subreddit : str
        Name of the subreddit to search.
    post_type : {'submission', 'comment'}
        Type of post to search for.
    loops : int
        Number of requests to make.
    size : int
        Number of posts per request (max 500 per the Pushshift API guide).
    skip : int
        Number of days added to the ``after`` offset on each loop. Increase
        if too many duplicate posts are returned, decrease to skip fewer
        posts.

    Returns
    -------
    pandas.DataFrame or None
        One row per unique post, holding the fields selected for
        ``post_type`` plus a ``post_type`` column. Fields absent from the
        API response are kept as all-NaN columns, and an empty frame with
        the expected columns is returned when no posts were collected.
        ``None`` is returned (after printing a message) when ``post_type``
        is invalid.

    Raises
    ------
    requests.HTTPError
        If any request comes back with a 4xx/5xx status.
    """
    # data fields to return, keyed by post type
    fields_by_type = {
        'submission': ['author', 'author_fullname', 'created_utc', 'id', 'num_comments',
                       'permalink', 'score', 'selftext', 'subreddit', 'title', 'url',
                       'is_self'],
        'comment': ['author', 'author_fullname', 'body', 'created_utc', 'id', 'parent_id',
                    'permalink', 'score', 'subreddit'],
    }

    # check before building the url or requesting any data
    if post_type not in fields_by_type:
        print("post_type must be 'submission' or 'comment'")
        return None

    url_stem = "https://api.pushshift.io/reddit/search/{}/?subreddit={}&size={}".format(
        post_type, subreddit, size)
    # skip a minimum of 1 day
    after = 1

    # list of dictionaries, one per post
    list_posts = []
    for i in range(loops):
        # add parameter to url to move the search window back on every loop
        url = '{}&after={}d'.format(url_stem, skip * i + after)
        # monitor status as loops run
        print(i, url)
        # a timeout keeps a stalled connection from hanging the notebook
        res = requests.get(url, timeout=30)
        # fail loudly on rate limiting / server errors instead of hitting an
        # opaque JSON or KeyError further down
        res.raise_for_status()
        list_posts.extend(res.json().get('data', []))
        # be polite
        time.sleep(1)

    # reindex (rather than df[cols]) tolerates empty results and fields the
    # API did not return; chaining avoids mutating a slice in place
    df_posts = (
        pd.DataFrame(list_posts)
        .reindex(columns=fields_by_type[post_type])
        .drop_duplicates()
        .assign(post_type=post_type)
    )

    return df_posts

## Get Reddit posts and save to CSV files

In [4]:
# r/cats submissions: 20 requests, each 'after' offset one day further back
cats_subs = pushshift(subreddit='cats', post_type='submission', loops=20, skip=1)
print('shape', cats_subs.shape)
# save to disk (the DataFrame index is written as the first column)
cats_subs.to_csv('cats_subs-pushshift.csv')

0 https://api.pushshift.io/reddit/search/submission/?subreddit=cats&size=500&after=1d
1 https://api.pushshift.io/reddit/search/submission/?subreddit=cats&size=500&after=2d
2 https://api.pushshift.io/reddit/search/submission/?subreddit=cats&size=500&after=3d
3 https://api.pushshift.io/reddit/search/submission/?subreddit=cats&size=500&after=4d
4 https://api.pushshift.io/reddit/search/submission/?subreddit=cats&size=500&after=5d
5 https://api.pushshift.io/reddit/search/submission/?subreddit=cats&size=500&after=6d
6 https://api.pushshift.io/reddit/search/submission/?subreddit=cats&size=500&after=7d
7 https://api.pushshift.io/reddit/search/submission/?subreddit=cats&size=500&after=8d
8 https://api.pushshift.io/reddit/search/submission/?subreddit=cats&size=500&after=9d
9 https://api.pushshift.io/reddit/search/submission/?subreddit=cats&size=500&after=10d
10 https://api.pushshift.io/reddit/search/submission/?subreddit=cats&size=500&after=11d
11 https://api.pushshift.io/reddit/search/submissio

In [5]:
# r/dogs submissions: 21 requests, each 'after' offset 30 days further back
dogs_subs = pushshift(subreddit='dogs', post_type='submission', loops=21, skip=30)
print('shape', dogs_subs.shape)
# save to disk (the DataFrame index is written as the first column)
dogs_subs.to_csv('dogs_subs-pushshift.csv')

0 https://api.pushshift.io/reddit/search/submission/?subreddit=dogs&size=500&after=1d
1 https://api.pushshift.io/reddit/search/submission/?subreddit=dogs&size=500&after=31d
2 https://api.pushshift.io/reddit/search/submission/?subreddit=dogs&size=500&after=61d
3 https://api.pushshift.io/reddit/search/submission/?subreddit=dogs&size=500&after=91d
4 https://api.pushshift.io/reddit/search/submission/?subreddit=dogs&size=500&after=121d
5 https://api.pushshift.io/reddit/search/submission/?subreddit=dogs&size=500&after=151d
6 https://api.pushshift.io/reddit/search/submission/?subreddit=dogs&size=500&after=181d
7 https://api.pushshift.io/reddit/search/submission/?subreddit=dogs&size=500&after=211d
8 https://api.pushshift.io/reddit/search/submission/?subreddit=dogs&size=500&after=241d
9 https://api.pushshift.io/reddit/search/submission/?subreddit=dogs&size=500&after=271d
10 https://api.pushshift.io/reddit/search/submission/?subreddit=dogs&size=500&after=301d
11 https://api.pushshift.io/reddit/s

In [6]:
# r/cats comments: 20 requests, each 'after' offset one day further back
cats_coms = pushshift(subreddit='cats', post_type='comment', loops=20, skip=1)
print('shape', cats_coms.shape)
# save to disk (the DataFrame index is written as the first column)
cats_coms.to_csv('cats_coms-pushshift.csv')

0 https://api.pushshift.io/reddit/search/comment/?subreddit=cats&size=500&after=1d
1 https://api.pushshift.io/reddit/search/comment/?subreddit=cats&size=500&after=2d
2 https://api.pushshift.io/reddit/search/comment/?subreddit=cats&size=500&after=3d
3 https://api.pushshift.io/reddit/search/comment/?subreddit=cats&size=500&after=4d
4 https://api.pushshift.io/reddit/search/comment/?subreddit=cats&size=500&after=5d
5 https://api.pushshift.io/reddit/search/comment/?subreddit=cats&size=500&after=6d
6 https://api.pushshift.io/reddit/search/comment/?subreddit=cats&size=500&after=7d
7 https://api.pushshift.io/reddit/search/comment/?subreddit=cats&size=500&after=8d
8 https://api.pushshift.io/reddit/search/comment/?subreddit=cats&size=500&after=9d
9 https://api.pushshift.io/reddit/search/comment/?subreddit=cats&size=500&after=10d
10 https://api.pushshift.io/reddit/search/comment/?subreddit=cats&size=500&after=11d
11 https://api.pushshift.io/reddit/search/comment/?subreddit=cats&size=500&after=12d

In [7]:
# r/dogs comments: 20 requests, each 'after' offset one day further back
dogs_coms = pushshift(subreddit='dogs', post_type='comment', loops=20, skip=1)
print('shape', dogs_coms.shape)
# save to disk (the DataFrame index is written as the first column)
dogs_coms.to_csv('dogs_coms-pushshift.csv')

0 https://api.pushshift.io/reddit/search/comment/?subreddit=dogs&size=500&after=1d
1 https://api.pushshift.io/reddit/search/comment/?subreddit=dogs&size=500&after=2d
2 https://api.pushshift.io/reddit/search/comment/?subreddit=dogs&size=500&after=3d
3 https://api.pushshift.io/reddit/search/comment/?subreddit=dogs&size=500&after=4d
4 https://api.pushshift.io/reddit/search/comment/?subreddit=dogs&size=500&after=5d
5 https://api.pushshift.io/reddit/search/comment/?subreddit=dogs&size=500&after=6d
6 https://api.pushshift.io/reddit/search/comment/?subreddit=dogs&size=500&after=7d
7 https://api.pushshift.io/reddit/search/comment/?subreddit=dogs&size=500&after=8d
8 https://api.pushshift.io/reddit/search/comment/?subreddit=dogs&size=500&after=9d
9 https://api.pushshift.io/reddit/search/comment/?subreddit=dogs&size=500&after=10d
10 https://api.pushshift.io/reddit/search/comment/?subreddit=dogs&size=500&after=11d
11 https://api.pushshift.io/reddit/search/comment/?subreddit=dogs&size=500&after=12d

## Create csv for analysis of comment body text only

In [8]:
# stack the comment text and its subreddit label from both pulls into one frame
df = pd.concat(
    [coms[['body', 'subreddit']] for coms in (cats_coms, dogs_coms)],
    ignore_index=True,
)
# write without the index so the file holds only the two selected columns
df.to_csv('comments.csv', index=False)