In [1]:
import datetime as dt
import requests
import pandas as pd
import csv

In [2]:
def extract(json, out_dict):
    """ Append the fields of every submission in a pushshift response to out_dict.

        inputs:
            json: the decoded response retrieved from the pushshift.io api; it is
                indexed as json['data'] and each entry is read as a dictionary
            out_dict: a dictionary of lists containing at least the six keys
                'title', 'score', 'id', 'comments', 'created' and 'body'
        output:
            none, the input dictionary is modified in place, receiving one new
                entry per key for each submission in json['data']
        raises:
            KeyError: if 'data', a required submission field, or one of the six
                keys of out_dict is missing; nothing is appended for the
                offending submission, so the lists always stay the same length
    """
    columns = ('title', 'score', 'id', 'comments', 'created', 'body')

    for item in json['data']:
        # Read the whole row before touching out_dict: appending field by field
        # would leave the columns misaligned if a later lookup raised KeyError.
        row = (
            item['title'],
            item['score'],
            item['id'],
            item['num_comments'],
            item['created_utc'],
            # 'selftext' is the only optional field; default to an empty body
            item.get('selftext', ''),
        )
        targets = [out_dict[name] for name in columns]

        for target, value in zip(targets, row):
            target.append(value)

In [3]:
def scrape(out_dict, end, iterations, interval, interval_divisor):
    """ Walk backwards in time from 'end', collecting submissions that mention cryptocurrency.

        inputs:
            out_dict: a dictionary containing at least the six keys called in 'extract(json, outdict)'
                **will be modified to contain all the information scraped
            end: a datetime object delineating the latest day in time to be scraped
            iterations: the number of steps back in time to take
            interval: a timedelta object representing the step size covered by each iteration
            interval_divisor: integer describing the number of chunks into which to split any time
                interval during which 500 or more mentions of cryptocurrency occur.
                Must be at least 2: with a value of 1 the recursive call repeats the
                exact same request and never terminates
        output:
            error_log: a list of urls whose json files weren't retrieved
    """
    error_log = []
    for _ in range(iterations):
        begin = end - interval

        url = 'https://api.pushshift.io/reddit/search/submission/?q=cryptocurrency&size=500'
        url += '&before={}'.format(int(end.timestamp()))
        url += '&after={}'.format(int(begin.timestamp()))

        try:
            try:
                json = requests.get(url).json()
            except ValueError:
                # the body could not be decoded as json; give the request one more chance
                json = requests.get(url).json()

        except ValueError:
            # Both attempts failed. Record the url and move on WITHOUT touching 'json':
            # it is either unbound (first step) or still holds the previous window's
            # response, which must not be extracted a second time.
            error_log.append(url)

        else:
            # this api returns a maximum of 500 entries at a time, so if our json is that large,
            # we need to iterate over that interval in smaller steps
            if len(json['data']) == 500:
                print(end)

                # re-scrape the same window [begin, end] as 'interval_divisor' smaller steps
                error_log.extend(scrape(out_dict,
                                        end,
                                        interval_divisor,
                                        (interval/interval_divisor),
                                        interval_divisor))

            else:
                extract(json, out_dict)

        end = begin

    return error_log

In [4]:
# one empty column per field that 'extract' fills in
data_dict = {field: [] for field in ("title", "score", "id", "comments", "created", "body")}

# latest point in time to scrape; 'scrape' steps backwards from here
end = dt.datetime.today()

# width of the time window requested per step; can be tuned to speed up the
# process depending on how often the query is mentioned on reddit
interval = dt.timedelta(days=1)

# number of pieces a window is split into when it hits the 500-entry api limit
interval_divisor = 2

# number of steps to take back in time, i.e. how far back you want to scrape
iterations = 3265

error_log = scrape(out_dict=data_dict,
                   end=end,
                   iterations=iterations,
                   interval=interval,
                   interval_divisor=interval_divisor)

2019-07-31 15:28:51.966577
2019-07-30 15:28:51.966577
2019-07-29 15:28:51.966577
2019-07-28 15:28:51.966577


KeyboardInterrupt: 

In [4]:
# sanity check: every field should report the same number of entries
for field, entries in data_dict.items():
    print(field, len(entries))

title 598247
score 598247
id 598247
comments 598247
created 598247
body 598247


In [5]:
print(len(error_log))

# request every url that failed during the scrape once more and
# add whatever it returns to the dictionary
for failed_url in error_log:
    response = requests.get(failed_url)
    extract(response.json(), data_dict)

0


In [39]:
# turn the scraped columns into a table and save it as a csv without the row index
data = pd.DataFrame(data=data_dict)

data.to_csv(path_or_buf='../data/reddit.csv', index=False)