In [4]:
## imports
import json
import pandas as pd
import os
import requests
from datetime import datetime
import time

Below is a function that will pull either Comments or Posts/"submissions" from a specified Reddit, over a designated time period.

The function is heavily inspired by the [following post](https://www.reddit.com/r/pushshift/comments/89pxra/pushshift_api_with_large_amounts_of_data), but altered to be able to save the individual .json files to disc.

In [2]:
def getPushshiftData(sub=None, before=None, after=None, ids=None,getComments=False, getSubmissions=False):
    suffix=''
    norm_before=''
        
    if getComments:
        searchType='comment'
        
    if getSubmissions:
        searchType='submission'
        
    if (before is not None):
        suffix += '&before='+str(before)
        norm_before += str(datetime.utcfromtimestamp(int(before)).strftime('%Y%m%d'))
    if (after is not None):
        suffix += '&after='+str(after)
    if (sub is not None):
        suffix += '&subreddit='+sub
    if (ids is not None):
        suffix += '&ids='+','.join(ids)

    url = 'https://api.pushshift.io/reddit/search/'+searchType+'?sort=desc&size=1000'+suffix
    
    print('loading '+url)
    r = requests.get(url)
    data = json.loads(r.content)
    
    with open(f'../jsons/{searchType}_{norm_before}_{after}_{sub}.json', 'w') as outfile:
        json.dump(data, outfile)
    
    if len(data['data']) > 0:
        prev_end_date = data['data'][-1]['created_utc']
    else:
        prev_end_date = None
    return (data, prev_end_date)

Downloading all comments after 1/1/18 (unix timestamp: 1514764800) for r/TrueFilm

In [12]:
sub='TrueFilm'
(submissions_tmp, prev_end_date) = getPushshiftData(sub=sub, after='1514764800',getComments=True)
submissions = submissions_tmp['data']
while prev_end_date is not None:
    (submissions_tmp, prev_end_date) = getPushshiftData(sub=sub, before=prev_end_date-1, after='1514764800',getComments=True)
    if prev_end_date is not None:
        submissions.extend(submissions_tmp['data'])

loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1000&after=1514764800&subreddit=TrueFilm
loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1000&before=1544490685&after=1514764800&subreddit=TrueFilm
loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1000&before=1544166181&after=1514764800&subreddit=TrueFilm
loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1000&before=1543867550&after=1514764800&subreddit=TrueFilm
loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1000&before=1543526126&after=1514764800&subreddit=TrueFilm
loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1000&before=1543071841&after=1514764800&subreddit=TrueFilm
loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1000&before=1542509260&after=1514764800&subreddit=TrueFilm
loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1000&before=1542119017&after=1514764800&subre

loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1000&before=1515309678&after=1514764800&subreddit=TrueFilm
loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1000&before=1514909924&after=1514764800&subreddit=TrueFilm
loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1000&before=1514764838&after=1514764800&subreddit=TrueFilm


Downloading all posts/submissions after 1/1/18 (unix timestamp: 1514764800) for r/TrueFilm

In [13]:
sub='TrueFilm'
(submissions_tmp, prev_end_date) = getPushshiftData(sub=sub, after='1514764800',getSubmissions=True)
submissions = submissions_tmp['data']
while prev_end_date is not None:
    (submissions_tmp, prev_end_date) = getPushshiftData(sub=sub, before=prev_end_date-1, after='1514764800',getSubmissions=True)
    if prev_end_date is not None:
        submissions.extend(submissions_tmp['data'])

loading https://api.pushshift.io/reddit/search/submission?sort=desc&size=1000&after=1514764800&subreddit=TrueFilm
loading https://api.pushshift.io/reddit/search/submission?sort=desc&size=1000&before=1538115120&after=1514764800&subreddit=TrueFilm
loading https://api.pushshift.io/reddit/search/submission?sort=desc&size=1000&before=1531089562&after=1514764800&subreddit=TrueFilm
loading https://api.pushshift.io/reddit/search/submission?sort=desc&size=1000&before=1523302110&after=1514764800&subreddit=TrueFilm
loading https://api.pushshift.io/reddit/search/submission?sort=desc&size=1000&before=1516057152&after=1514764800&subreddit=TrueFilm
loading https://api.pushshift.io/reddit/search/submission?sort=desc&size=1000&before=1514771886&after=1514764800&subreddit=TrueFilm


Downloading all comments after 1/1/18 (unix timestamp: 1514764800) for r/flicks

In [14]:
sub='Flicks'
(submissions_tmp, prev_end_date) = getPushshiftData(sub=sub, after='1514764800',getComments=True)
submissions = submissions_tmp['data']
while prev_end_date is not None:
    (submissions_tmp, prev_end_date) = getPushshiftData(sub=sub, before=prev_end_date-1, after='1514764800',getComments=True)
    if prev_end_date is not None:
        submissions.extend(submissions_tmp['data'])

loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1000&after=1514764800&subreddit=Flicks
loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1000&before=1543603770&after=1514764800&subreddit=Flicks
loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1000&before=1542295359&after=1514764800&subreddit=Flicks
loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1000&before=1541036336&after=1514764800&subreddit=Flicks
loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1000&before=1539208429&after=1514764800&subreddit=Flicks
loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1000&before=1537930762&after=1514764800&subreddit=Flicks
loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1000&before=1537121171&after=1514764800&subreddit=Flicks
loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1000&before=1536227393&after=1514764800&subreddit=Flicks
lo

Downloading all posts after 1/1/18 (unix timestamp: 1514764800) for r/flicks

In [5]:
sub='Flicks'
(submissions_tmp, prev_end_date) = getPushshiftData(sub=sub, after='1514764800',getComments=True)
submissions = submissions_tmp['data']
while prev_end_date is not None:
    (submissions_tmp, prev_end_date) = getPushshiftData(sub=sub, before=prev_end_date-1, after='1514764800',getComments=True)
    if prev_end_date is not None:
        submissions.extend(submissions_tmp['data'])

loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1000&after=1514764800&subreddit=Flicks
loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1000&before=1543611024&after=1514764800&subreddit=Flicks
loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1000&before=1542301167&after=1514764800&subreddit=Flicks
loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1000&before=1541107695&after=1514764800&subreddit=Flicks
loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1000&before=1539262700&after=1514764800&subreddit=Flicks
loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1000&before=1537950824&after=1514764800&subreddit=Flicks
loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1000&before=1537134906&after=1514764800&subreddit=Flicks
loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1000&before=1536254749&after=1514764800&subreddit=Flicks
lo