# Pulling posts via the PushShift.io API

In [1]:
import requests
import time
import pandas as pd
import json
import pprint
# Pretty-printer with a 4-space indent; not referenced by the cells shown in
# this notebook — presumably kept for ad-hoc inspection of API payloads.
pp = pprint.PrettyPrinter(indent=4)

### A GitHub user has created a Python wrapper for the API: PSAW

https://github.com/dmarx/psaw

In [2]:
from psaw import PushshiftAPI

# Single psaw client instance; the search_submissions calls in the following
# cells go through this object.
api = PushshiftAPI()

In [3]:
# Query submissions through psaw with no subreddit or date filter, then
# materialise the returned iterable into a list.
# NOTE(review): `limit=3000` presumably caps the number of results — confirm
# against the psaw documentation.
gen = api.search_submissions(limit=3000)
results = list(gen)

Using the PushShift and psaw documentation, I tried to create a simple function to pull posts starting from a given date.

In [4]:
import datetime as dt

# Build the cutoff as an explicitly UTC-aware datetime. Calling .timestamp()
# on a naive datetime interprets it in the machine's local timezone, so the
# resulting epoch (and the set of posts returned) would shift with the
# kernel's timezone setting.
start_epoch = int(dt.datetime(2018, 7, 1, tzinfo=dt.timezone.utc).timestamp())

# Ask psaw for r/news submissions created after `start_epoch`, keeping only
# the listed fields, and collect the results into a list.
news_gen = api.search_submissions(
    subreddit='news',
    after=start_epoch,
    limit=10,
    filter=['url', 'author', 'title', 'subreddit', 'id'],
)
news = list(news_gen)

While researching PushShift, I found a more involved function that allows for more customization. I am shamelessly using it here because it is more effective than the simple approach above.

Credit: https://www.reddit.com/r/pushshift/comments/89pxra/pushshift_api_with_large_amounts_of_data/

In [5]:
def getPushshiftData(sub=None, before=None, after=None, ids=None, getSubmissions=True, getComments=False, timeout=60):
    """Fetch one page of submissions or comments from the Pushshift search API.

    Parameters
    ----------
    sub : str, optional
        Subreddit name, sent as the ``subreddit`` query parameter.
    before, after : int or str, optional
        Bounds sent (via ``str()``) as the ``before`` / ``after`` query
        parameters. Callers in this notebook pass epoch seconds or relative
        strings such as ``'60d'``.
    ids : iterable, optional
        IDs to look up; each is converted with ``str()`` and the results are
        comma-joined into the ``ids`` query parameter.
    getSubmissions, getComments : bool
        The comment endpoint is queried when ``getComments`` is true or
        ``getSubmissions`` is false; otherwise the submission endpoint.
    timeout : float, optional
        Seconds passed to ``requests.get`` so a stalled connection cannot
        block the notebook indefinitely.

    Returns
    -------
    tuple
        ``(data, prev_end_date)``: the decoded JSON response, and the
        ``created_utc`` of the last item in ``data['data']`` (``None`` when
        that list is empty).

    Raises
    ------
    requests.HTTPError
        If the API responds with a 4xx/5xx status (e.g. when rate limited).
    """
    # Comments take precedence whenever they are requested or submissions
    # are switched off.
    searchType = 'comment' if (getComments or not getSubmissions) else 'submission'

    suffix = ''
    if before is not None:
        suffix += '&before=' + str(before)
    if after is not None:
        suffix += '&after=' + str(after)
    if sub is not None:
        suffix += '&subreddit=' + sub
    if ids is not None:
        # str() each id so integer ids work as well as string ids.
        suffix += '&ids=' + ','.join(str(i) for i in ids)

    url = 'https://api.pushshift.io/reddit/search/' + searchType + '?sort=desc&size=1500' + suffix
    print('loading ' + url)

    r = requests.get(url, timeout=timeout)
    # Surface HTTP failures directly instead of letting an error page fail
    # later as a JSON decoding error or a missing 'data' key.
    r.raise_for_status()
    data = json.loads(r.content)

    items = data['data']
    prev_end_date = items[-1]['created_utc'] if len(items) > 0 else None
    return (data, prev_end_date)

In [6]:
# Subreddit for this pass; the paging cells that follow read `sub` as well.
sub = 'news'

# First page of submissions; `prev_end_date` is the cursor the paging loop uses.
submissions_tmp, prev_end_date = getPushshiftData(after='60d', sub=sub)
submissions = submissions_tmp['data']

loading https://api.pushshift.io/reddit/search/submission?sort=desc&size=1500&after=60d&subreddit=news


In [7]:
# Page backwards through r/news submissions until an empty page comes back.
# Note: to keep the "loading" output short, this cell was changed to pull only
# 1 day of posts; the data used in the project itself covered 60 days.
while prev_end_date is not None:
    submissions_tmp, prev_end_date = getPushshiftData(after='1d', before=prev_end_date - 1, sub=sub)
    if prev_end_date is None:
        # Empty page: nothing left to append.
        break
    submissions.extend(submissions_tmp['data'])

loading https://api.pushshift.io/reddit/search/submission?sort=desc&size=1500&before=1536553451&after=1d&subreddit=news
loading https://api.pushshift.io/reddit/search/submission?sort=desc&size=1500&before=1536525316&after=1d&subreddit=news


In [8]:
# First page of comments for the current `sub`; `prev_end_date` becomes the
# cursor for the comment paging loop.
comments_tmp, prev_end_date = getPushshiftData(getComments=True, after='60d', sub=sub)
comments = comments_tmp['data']

loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1500&after=60d&subreddit=news


In [9]:
# Page backwards through r/news comments until an empty page comes back.
# Note: to keep the "loading" output short, this cell was changed to pull only
# 1 day of posts; the data used in the project itself covered 60 days.
while prev_end_date is not None:
    comments_tmp, prev_end_date = getPushshiftData(
        getComments=True, after='1d', before=prev_end_date - 1, sub=sub
    )
    if prev_end_date is None:
        # Empty page: nothing left to append.
        break
    comments.extend(comments_tmp['data'])

loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1500&before=1536608355&after=1d&subreddit=news
loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1500&before=1536604640&after=1d&subreddit=news
loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1500&before=1536600030&after=1d&subreddit=news
loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1500&before=1536595812&after=1d&subreddit=news
loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1500&before=1536593690&after=1d&subreddit=news
loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1500&before=1536591282&after=1d&subreddit=news
loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1500&before=1536588154&after=1d&subreddit=news
loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1500&before=1536583844&after=1d&subreddit=news
loading https://api.pushshift.io/reddit/search/comment?sort=desc

Save the raw posts and comments as json files, then repeat the same process for the next subreddit.

In [10]:
# Persist the raw r/news comments as JSON.
with open('ps_news_1', 'w+') as comments_file:
    json.dump(comments, comments_file)

In [11]:
# Persist the raw r/news submissions as JSON.
with open('ps_news_posts_1', 'w+') as posts_file:
    json.dump(submissions, posts_file)

In [12]:
# Same submission pull as for r/news, now targeting r/upliftingnews.
sub = 'upliftingnews'

submissions_tmp, prev_end_date = getPushshiftData(after='1d', sub=sub)
submissions_up = submissions_tmp['data']

# Keep stepping the `before` cursor back until an empty page is returned.
while prev_end_date is not None:
    submissions_tmp, prev_end_date = getPushshiftData(after='1d', before=prev_end_date - 1, sub=sub)
    if prev_end_date is None:
        break
    submissions_up.extend(submissions_tmp['data'])

loading https://api.pushshift.io/reddit/search/submission?sort=desc&size=1500&after=1d&subreddit=upliftingnews
loading https://api.pushshift.io/reddit/search/submission?sort=desc&size=1500&before=1536527064&after=1d&subreddit=upliftingnews


In [13]:
# Comment pull for the current `sub`: first page, then page backwards.
comments_tmp, prev_end_date = getPushshiftData(getComments=True, after='1d', sub=sub)
comments_up = comments_tmp['data']

while prev_end_date is not None:
    comments_tmp, prev_end_date = getPushshiftData(
        getComments=True, after='1d', before=prev_end_date - 1, sub=sub
    )
    if prev_end_date is None:
        # Empty page: nothing left to append.
        break
    comments_up.extend(comments_tmp['data'])

loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1500&after=1d&subreddit=upliftingnews
loading https://api.pushshift.io/reddit/search/comment?sort=desc&size=1500&before=1536525924&after=1d&subreddit=upliftingnews


In [14]:
# Persist the raw r/upliftingnews comments as JSON.
with open('ps_upnews_2', 'w+') as comments_file:
    json.dump(comments_up, comments_file)

In [15]:
# Persist the raw r/upliftingnews submissions as JSON.
with open('ps_upnews_posts_2', 'w+') as posts_file:
    json.dump(submissions_up, posts_file)

Checking the length of the data that was pulled, as well as checking for duplicates.

In [16]:
# Number of r/news submissions collected.
len(submissions)

1263

In [17]:
# Number of r/news comments collected.
len(comments)

20801

In [18]:
# Distinct submission titles in r/news; a count below len(submissions)
# means some titles repeat.
len({post['title'] for post in submissions})

1225

In [19]:
# Number of r/upliftingnews submissions collected.
len(submissions_up)

71

In [20]:
# Number of r/upliftingnews comments collected.
len(comments_up)

797

In [21]:
# Distinct submission titles in r/upliftingnews; a count below
# len(submissions_up) means some titles repeat.
len({post['title'] for post in submissions_up})

69

In the full pull used for this project (the cell outputs above reflect the shortened 1-day run), there are roughly 80k posts and 250k comments for r/news and only 7k posts and 180k comments for r/upliftingnews. The dataset will certainly be unbalanced if the pulled data is used as-is.