In [1]:
import datetime as dt
import requests
import pandas as pd
import csv

ModuleNotFoundError: No module named 'tensorflow'

In [1]:
def extract(json, out_dict):
    """ Copy the fields of interest from a pushshift.io response into out_dict.

        inputs:
            json: the decoded json response retrieved from the pushshift.io api;
                its 'data' entry is iterated as a sequence of submission dictionaries
            out_dict: a dictionary containing at least the six keys 'title', 'score',
                'id', 'comments', 'created' and 'body', each mapping to a list
        output:
            none, the input dictionary is modified in place, receiving one new
                entry per column for every submission in json['data']
        raises:
            KeyError if a submission lacks one of the required fields or out_dict
                lacks one of the six columns; the offending submission is not
                partially written, so the columns always keep equal lengths
    """
    # (column in out_dict, key in the submission) for the fields that must be present
    required = (('title', 'title'),
                ('score', 'score'),
                ('id', 'id'),
                ('comments', 'num_comments'),
                ('created', 'created_utc'))

    for item in json['data']:
        # read every value before appending anything: a missing key then raises
        # without leaving some columns one entry longer than the others
        row = [(column, item[key]) for column, key in required]
        # 'selftext' is optional; submissions without it get an empty body
        row.append(('body', item.get('selftext', '')))

        # likewise resolve all target lists before mutating any of them
        targets = [(out_dict[column], value) for column, value in row]
        for target, value in targets:
            target.append(value)

In [2]:
def scrape(out_dict, end, iterations, interval, interval_divisor):
    """ Collect submissions matching the query 'cryptocurrency' from the pushshift.io
        api, walking backwards in time from 'end' one 'interval' at a time.

        inputs:
            out_dict: a dictionary containing at least the six keys called in 'extract(json, outdict)'
                **will be modified to contain all the information scraped
            end: a datetime object delineating the latest point in time to be scraped
            iterations: the number of steps back in time to take
            interval: a timedelta object representing the step size covered by each iteration
            interval_divisor: integer number of equal chunks into which to split any
                interval for which the api returns a full page of 500 results
        output:
            error_log: a list of urls whose json files weren't retrieved, including
                those that failed inside subdivided intervals
    """
    error_log = []
    for _ in range(iterations):
        begin = end - interval

        url = 'https://api.pushshift.io/reddit/search/submission/?q=cryptocurrency&size=500'
        url += '&before={}'.format(int(end.timestamp()))
        url += '&after={}'.format(int(begin.timestamp()))

        # a response body that isn't valid json raises ValueError; give the
        # request one retry before giving up on this window
        payload = None
        for _attempt in range(2):
            try:
                payload = requests.get(url).json()
                break
            except ValueError:
                continue

        if payload is None:
            # both attempts failed: record the url and skip the window, instead of
            # falling through to an unbound (or previous window's) payload
            error_log.append(url)
        elif (len(payload['data']) == 500
                and interval_divisor > 1
                and (interval / interval_divisor).total_seconds() >= 1):
            # this api returns a maximum of 500 entries at a time, so if our json is that large,
            # we need to iterate over that interval in smaller steps
            print(end)

            error_log.extend(scrape(out_dict,
                                    end,
                                    interval_divisor,
                                    (interval / interval_divisor),
                                    interval_divisor))
        else:
            # either a partial page, or a full page that can't be split any further
            # (divisor of 1, or chunks shorter than the one-second resolution of the
            # url timestamps) -- keep what was returned rather than recursing forever
            extract(payload, out_dict)

        end = begin

    return error_log

In [3]:
# one empty column per field that extract() fills in
data_dict = {field: [] for field in ("title", "score", "id", "comments", "created", "body")}

# the scrape walks backwards in time, starting from the moment this cell runs
end = dt.datetime.today()

# window covered by a single request; can be tuned to speed up the process
# depending on how often the query is mentioned on reddit
interval = dt.timedelta(days=1)

# a window that comes back with a full page is re-scraped in this many pieces
interval_divisor = 2

# number of windows to step back; depends on how far back in time you want to scrape
iterations = 3265

error_log = scrape(out_dict=data_dict,
                   end=end,
                   iterations=iterations,
                   interval=interval,
                   interval_divisor=interval_divisor)

Iteration:  0
2019-07-25 13:02:50.218282
Iteration:  0
2019-07-24 13:02:50.218282
Iteration:  0
2019-07-23 13:02:50.218282
Iteration:  0
2019-07-22 13:02:50.218282
Iteration:  0
2019-07-19 13:02:50.218282
Iteration:  0
2019-07-18 13:02:50.218282
Iteration:  0
2019-07-17 13:02:50.218282
Iteration:  0
2019-07-17 13:02:50.218282
Iteration:  0
2019-07-16 13:02:50.218282
Iteration:  0
2019-07-16 13:02:50.218282
Iteration:  0
2019-07-15 13:02:50.218282
Iteration:  0
2019-07-15 13:02:50.218282
Iteration:  0
2019-07-14 13:02:50.218282
Iteration:  0
2019-07-13 13:02:50.218282
Iteration:  0
2019-07-12 13:02:50.218282
Iteration:  0
2019-07-11 13:02:50.218282
Iteration:  0
2019-07-10 13:02:50.218282
Iteration:  0
2019-07-09 13:02:50.218282
Iteration:  0
2019-07-08 13:02:50.218282
Iteration:  0
2019-07-08 13:02:50.218282
Iteration:  0
2019-07-07 13:02:50.218282
Iteration:  0
2019-07-06 13:02:50.218282
Iteration:  0
2019-07-05 13:02:50.218282
Iteration:  0
2019-07-04 13:02:50.218282
Iteration:  0
20

2018-12-12 13:02:50.218282
Iteration:  0
2018-12-11 13:02:50.218282
Iteration:  0
2018-12-11 13:02:50.218282
Iteration:  0
2018-12-10 13:02:50.218282
Iteration:  0
2018-12-10 13:02:50.218282
Iteration:  0
2018-12-09 13:02:50.218282
Iteration:  0
2018-12-08 13:02:50.218282
Iteration:  0
2018-12-07 13:02:50.218282
Iteration:  0
2018-12-07 13:02:50.218282
Iteration:  0
2018-12-06 13:02:50.218282
Iteration:  0
2018-12-06 13:02:50.218282
Iteration:  0
2018-12-05 13:02:50.218282
Iteration:  0
2018-12-05 13:02:50.218282
Iteration:  0
2018-12-04 13:02:50.218282
Iteration:  0
2018-12-04 13:02:50.218282
Iteration:  0
2018-12-03 13:02:50.218282
Iteration:  0
2018-12-03 13:02:50.218282
Iteration:  0
2018-12-02 13:02:50.218282
Iteration:  0
2018-12-01 13:02:50.218282
Iteration:  0
2018-11-30 13:02:50.218282
Iteration:  0
2018-11-30 13:02:50.218282
Iteration:  0
2018-11-29 13:02:50.218282
Iteration:  0
2018-11-29 13:02:50.218282
Iteration:  0
2018-11-28 13:02:50.218282
Iteration:  0
2018-11-28 13:02

2018-08-27 13:02:50.218282
Iteration:  0
2018-08-27 13:02:50.218282
Iteration:  0
2018-08-26 13:02:50.218282
Iteration:  0
2018-08-25 13:02:50.218282
Iteration:  0
2018-08-24 13:02:50.218282
Iteration:  0
2018-08-24 13:02:50.218282
Iteration:  0
2018-08-23 13:02:50.218282
Iteration:  0
2018-08-23 13:02:50.218282
Iteration:  0
2018-08-22 13:02:50.218282
Iteration:  0
2018-08-22 13:02:50.218282
Iteration:  0
2018-08-21 13:02:50.218282
Iteration:  0
2018-08-21 13:02:50.218282
Iteration:  0
2018-08-20 13:02:50.218282
Iteration:  0
2018-08-20 13:02:50.218282
Iteration:  0
2018-08-19 13:02:50.218282
Iteration:  0
2018-08-18 13:02:50.218282
Iteration:  0
2018-08-17 13:02:50.218282
Iteration:  0
2018-08-16 13:02:50.218282
Iteration:  0
2018-08-16 13:02:50.218282
Iteration:  0
2018-08-15 13:02:50.218282
Iteration:  0
2018-08-15 13:02:50.218282
Iteration:  0
2018-08-14 13:02:50.218282
Iteration:  0
2018-08-14 13:02:50.218282
Iteration:  0
2018-08-13 13:02:50.218282
Iteration:  0
2018-08-13 13:02

2018-05-09 13:02:50.218282
Iteration:  0
2018-05-09 13:02:50.218282
Iteration:  0
2018-05-09 01:02:50.218282
Iteration:  0
2018-05-08 13:02:50.218282
Iteration:  0
2018-05-08 13:02:50.218282
Iteration:  0
2018-05-07 13:02:50.218282
Iteration:  0
2018-05-07 13:02:50.218282
Iteration:  0
2018-05-06 13:02:50.218282
Iteration:  0
2018-05-05 13:02:50.218282
Iteration:  0
2018-05-04 13:02:50.218282
Iteration:  0
2018-05-04 13:02:50.218282
Iteration:  0
2018-05-03 13:02:50.218282
Iteration:  0
2018-05-02 13:02:50.218282
Iteration:  0
2018-05-02 13:02:50.218282
Iteration:  0
2018-05-01 13:02:50.218282
Iteration:  0
2018-05-01 13:02:50.218282
Iteration:  0
2018-04-30 13:02:50.218282
Iteration:  0
2018-04-30 13:02:50.218282
Iteration:  0
2018-04-29 13:02:50.218282
Iteration:  0
2018-04-28 13:02:50.218282
Iteration:  0
2018-04-27 13:02:50.218282
Iteration:  0
2018-04-26 13:02:50.218282
Iteration:  0
2018-04-26 13:02:50.218282
Iteration:  0
2018-04-25 13:02:50.218282
Iteration:  0
2018-04-25 13:02

2018-01-29 13:02:50.218282
Iteration:  0
2018-01-29 13:02:50.218282
Iteration:  0
2018-01-28 13:02:50.218282
Iteration:  0
2018-01-27 13:02:50.218282
Iteration:  0
2018-01-27 13:02:50.218282
Iteration:  0
2018-01-27 01:02:50.218282
Iteration:  0
2018-01-26 13:02:50.218282
Iteration:  0
2018-01-26 13:02:50.218282
Iteration:  0
2018-01-26 01:02:50.218282
Iteration:  0
2018-01-25 13:02:50.218282
Iteration:  0
2018-01-25 13:02:50.218282
Iteration:  0
2018-01-25 01:02:50.218282
Iteration:  0
2018-01-24 13:02:50.218282
Iteration:  0
2018-01-24 13:02:50.218282
Iteration:  0
2018-01-23 13:02:50.218282
Iteration:  0
2018-01-23 13:02:50.218282
Iteration:  0
2018-01-22 13:02:50.218282
Iteration:  0
2018-01-22 13:02:50.218282
Iteration:  0
2018-01-21 13:02:50.218282
Iteration:  0
2018-01-21 13:02:50.218282
Iteration:  0
2018-01-20 13:02:50.218282
Iteration:  0
2018-01-19 13:02:50.218282
Iteration:  0
2018-01-19 13:02:50.218282
Iteration:  0
2018-01-19 01:02:50.218282
Iteration:  0
2018-01-18 13:02

In [4]:
# sanity check: every column must hold the same number of entries,
# otherwise the dictionary can't be turned into a table
for field, values in data_dict.items():
    print(field, len(values))

title 598247
score 598247
id 598247
comments 598247
created 598247
body 598247


In [5]:
# how many request urls failed during the scrape
print(len(error_log))

# fetch each failed url once more and add its submissions to the dictionary
for failed_url in error_log:
    extract(requests.get(failed_url).json(), data_dict)

0


In [39]:
# turn the scraped columns into a table and save it as a csv, without the row index
data = pd.DataFrame(data=data_dict)
data.to_csv(path_or_buf='reddit_since10.csv', index=False)

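**News from Reddit:** the same scrape, restricted to submissions in r/news that mention "bitcoin" and have a score above 2.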

In [10]:
def scrape_news(out_dict, end, iterations, interval, interval_divisor):
    """ Collect submissions from the 'news' subreddit matching the query 'bitcoin'
        (with the api's 'score=>2' filter) from the pushshift.io api, walking
        backwards in time from 'end' one 'interval' at a time.

        inputs:
            out_dict: a dictionary containing at least the six keys called in 'extract(json, outdict)'
                **will be modified to contain all the information scraped
            end: a datetime object delineating the latest point in time to be scraped
            iterations: the number of steps back in time to take
            interval: a timedelta object representing the step size covered by each iteration
            interval_divisor: integer number of equal chunks into which to split any
                interval for which the api returns a full page of 500 results
        output:
            error_log: a list of urls whose json files weren't retrieved, including
                those that failed inside subdivided intervals
    """
    error_log = []
    for _ in range(iterations):
        begin = end - interval

        url = 'https://api.pushshift.io/reddit/search/submission/?q=bitcoin&subreddit=news&score=>2&size=500'
        url += '&before={}'.format(int(end.timestamp()))
        url += '&after={}'.format(int(begin.timestamp()))

        # a response body that isn't valid json raises ValueError; give the
        # request one retry before giving up on this window
        payload = None
        for _attempt in range(2):
            try:
                payload = requests.get(url).json()
                break
            except ValueError:
                continue

        if payload is None:
            # both attempts failed: record the url and skip the window, instead of
            # falling through to an unbound (or previous window's) payload
            error_log.append(url)
        elif (len(payload['data']) == 500
                and interval_divisor > 1
                and (interval / interval_divisor).total_seconds() >= 1):
            # a full page means the window may hold more than 500 entries, so
            # re-scrape it in smaller steps and keep any failures from those steps
            print(end)
            error_log.extend(scrape_news(out_dict,
                                         end,
                                         interval_divisor,
                                         (interval / interval_divisor),
                                         interval_divisor))
        else:
            # either a partial page, or a full page that can't be split any further
            # (divisor of 1, or chunks shorter than the one-second resolution of the
            # url timestamps) -- keep what was returned rather than recursing forever
            extract(payload, out_dict)

        end = begin

    return error_log

In [11]:
# one empty column per field that extract() fills in
news_dict = {field: [] for field in ("title", "score", "id", "comments", "created", "body")}

# ten windows of 52 weeks each, walking backwards from the moment this cell runs;
# a window that comes back with a full page is re-scraped in two halves
end = dt.datetime.today()
interval = dt.timedelta(weeks=52)
interval_divisor = 2
iterations = 10

error_log = scrape_news(out_dict=news_dict,
                        end=end,
                        iterations=iterations,
                        interval=interval,
                        interval_divisor=interval_divisor)

# save the scraped news submissions as a csv, without the row index
data = pd.DataFrame(data=news_dict)
data.to_csv(path_or_buf='reddit_news.csv', index=False)

Iteration:  0


In [12]:
# number of news submissions collected (extract() appends one title per submission)
print(len(news_dict['title']))

770
