In [9]:
import datetime as dt
import requests
import pandas as pd
import csv

In [10]:
def extract(json, out_dict):
    """ Append the fields of every submission in a pushshift.io response to out_dict.

        inputs:
            json: the decoded json response retrieved from the pushshift.io api;
                its 'data' entry is iterated as the collection of submissions
            out_dict: a dictionary of lists containing at least the six keys
                'title', 'score', 'id', 'comments', 'created' and 'body'
        output:
            none, the input dictionary is modified in place, each of the six
                lists receiving one value per submission
        raises:
            KeyError: if 'data' is missing, or if a submission lacks any field
                other than 'selftext' (a missing 'selftext' is stored as '').
                Nothing is appended for the offending submission, so the six
                lists always keep equal lengths.
    """
    for item in json['data']:
        # Read every field before appending anything: a submission with a
        # missing key must not leave some lists one element longer than others.
        row = {
            'title': item['title'],
            'score': item['score'],
            'id': item['id'],
            'comments': item['num_comments'],
            'created': item['created_utc'],
            # 'selftext' is the only optional field; default to an empty body
            'body': item.get('selftext', ''),
        }
        for key, value in row.items():
            out_dict[key].append(value)

In [11]:
def scrape_news(out_dict, end, iterations, interval, interval_divisor, timeout=30):
    """ Step backwards in time from 'end', collecting submissions from the 'news'
        subreddit that match the query 'bitcoin' with a score above 2.

        inputs:
            out_dict: a dictionary containing at least the six keys called in 'extract(json, outdict)'
                **will be modified to contain all the information scraped
            end: a datetime object delineating the latest day in time to be scraped
            iterations: the number of steps back in time to take
            interval: a timedelta object representing the step size covered by each iteration
            interval_divisor: integer describing the number of chunks into which to split any time
                interval for which the api returns a full page of 500 results
            timeout: seconds to wait for the api before giving up on a request
        output:
            error_log: a list of urls whose json files weren't retrieved
    """
    error_log = []
    for _ in range(iterations):
        begin = end - interval

        url = 'https://api.pushshift.io/reddit/search/submission/?q=bitcoin&subreddit=news&score=>2&size=500'
        url += '&before={}'.format(int(end.timestamp()))
        url += '&after={}'.format(int(begin.timestamp()))

        try:
            # A timeout keeps one stalled connection from hanging the whole scrape.
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            payload = response.json()
            submissions = payload['data']
        except (requests.exceptions.RequestException, ValueError, KeyError, TypeError):
            # Network failure, HTTP error status, undecodable body, or a body
            # without a 'data' entry: record the url so it can be retried later.
            error_log.append(url)
        else:
            if len(submissions) == 500:
                # A full page means the window was probably truncated by the
                # size=500 limit, so re-scrape it in smaller chunks. Splitting
                # only helps when there is more than one chunk and each chunk
                # spans at least one second (the url timestamps are whole seconds).
                if interval_divisor > 1 and interval.total_seconds() >= interval_divisor:
                    print(end)
                    error_log.extend(scrape_news(out_dict,
                                                 end,
                                                 interval_divisor,
                                                 (interval / interval_divisor),
                                                 interval_divisor,
                                                 timeout))
                else:
                    # Cannot subdivide any further; keep the page that was
                    # returned rather than recursing without end.
                    print('window ending {} cannot be split further; '
                          'keeping the 500 results returned'.format(end))
                    extract(payload, out_dict)
            else:
                extract(payload, out_dict)

        end = begin

    return error_log

In [12]:
# newest point in time to scrape; scrape_news walks backwards from here
end = dt.datetime.today()

# number of steps back in time to take
iterations = 10

# span of one step; can be tuned to speed up the process depending on how
# often the query is mentioned in the news subreddit
interval = dt.timedelta(weeks=52)

# number of chunks a step is split into when it returns a full page of results
interval_divisor = 2

# one empty list per field that extract() fills in
data_dict = {
    field: []
    for field in ("title", "score", "id", "comments", "created", "body")
}

error_log = scrape_news(
    out_dict=data_dict,
    end=end,
    iterations=iterations,
    interval=interval,
    interval_divisor=interval_divisor,
)

In [17]:
# sanity check: every field should have collected the same number of items
for field, values in data_dict.items():
    print(field, len(values))

title 763
score 763
id 763
comments 763
created 763
body 763


In [18]:
print(len(error_log))

# adding the info from the urls in the error log to the dictionary;
# a url that fails again is collected instead of aborting the loop part-way
retry_failures = []
for url in error_log:
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        payload = response.json()
    except (requests.exceptions.RequestException, ValueError):
        retry_failures.append(url)
        continue
    extract(payload, data_dict)

# only reported when something is still missing
if retry_failures:
    print('still failing:', len(retry_failures))

0


In [15]:
# number of scraped submissions whose score is greater than 2
count = sum(1 for score in data_dict['score'] if score > 2)
print(count)

763


In [16]:
# build a table with one column per field, then save it as a csv
# without the row index
data = pd.DataFrame(data=data_dict)
data.to_csv(path_or_buf='../../data/reddit_news.csv', index=False)