In [None]:
import itertools
import json
import math
import os
import time
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import praw
import requests

In [None]:
def make_request(uri, max_retries = 5):
    """GET ``uri`` and return its JSON-decoded body, retrying on failure.

    At most ``max_retries`` attempts are made in total (a single attempt when
    ``max_retries`` is 1 or less). Every attempt except the last is preceded
    by a one-second pause and, if it fails, followed by another one-second
    pause before the next try.

    Parameters
    ----------
    uri : str
        Fully formatted URL to request.
    max_retries : int
        Total number of attempts before giving up.

    Returns
    -------
    The parsed JSON document (whatever ``json.loads`` yields for the body).

    Raises
    ------
    AssertionError
        If the final attempt answers with a status other than 200.
    Exception
        Whatever the final attempt raised (connection error, timeout,
        invalid JSON, ...).
    """
    # Without a timeout a stalled connection would block the notebook forever.
    REQUEST_TIMEOUT_SECS = 30

    def fire_away(uri):
        response = requests.get(uri, timeout=REQUEST_TIMEOUT_SECS)
        if response.status_code != 200:
            # Raised explicitly instead of via `assert` so the check survives
            # `python -O`; the exception type is kept for backward compatibility.
            raise AssertionError(
                'GET {} returned HTTP {}'.format(uri, response.status_code))
        return json.loads(response.content)

    for _ in range(max_retries - 1):
        try:
            time.sleep(1)
            return fire_away(uri)
        except Exception:
            # `Exception` (not a bare `except:`) so that KeyboardInterrupt and
            # SystemExit stop the run immediately instead of being retried.
            time.sleep(1)
    # Last attempt: let any failure propagate to the caller.
    return fire_away(uri)

In [None]:
def pull_posts_for(subreddit, start_at, end_at):
    """Collect submission stubs for ``subreddit`` between two epoch timestamps.

    Pages through the Pushshift submission search, ``SIZE`` results at a
    time, and keeps requesting while the previous page came back full.

    Parameters
    ----------
    subreddit : str
        Subreddit name inserted into the query string.
    start_at, end_at : int
        Values passed as the ``after`` / ``before`` query parameters.

    Returns
    -------
    list of dict
        One ``{'id', 'created_utc', 'prefix'}`` dict per returned submission.
        Consecutive pages deliberately overlap by ``OVERLAP_SECS`` seconds, so
        the list may contain duplicate ids; callers de-duplicate.

    Notes
    -----
    NOTE(review): paging assumes the API returns results oldest-first and
    treats ``after`` as exclusive - confirm against the Pushshift API.
    NOTE(review): 't4_' is kept as in the original; Reddit's fullname prefix
    for submissions is usually 't3_' - confirm this is intended.
    """
    SIZE = 500
    OVERLAP_SECS = 10
    URI_TEMPLATE = r'https://api.pushshift.io/reddit/search/submission?subreddit={}&after={}&before={}&size={}'

    def map_posts(posts):
        return [{
            'id': post['id'],
            'created_utc': post['created_utc'],
            'prefix': 't4_'
        } for post in posts]

    def fetch_page(after):
        uri = URI_TEMPLATE.format(subreddit, after, end_at, SIZE)
        return map_posts(make_request(uri)['data'])

    cursor = start_at
    post_collections = fetch_page(cursor)
    n = len(post_collections)
    while n == SIZE:
        last_created = post_collections[-1]['created_utc']
        new_start_at = last_created - OVERLAP_SECS
        if new_start_at <= cursor:
            # A full page fitted inside the overlap window: re-requesting from
            # `last - overlap` would return the same page forever. Drop the
            # overlap for this step (posts sharing the exact last timestamp
            # may be missed, which is preferable to never terminating).
            new_start_at = last_created
        if new_start_at <= cursor:
            # Still no forward progress - give up rather than spin.
            break
        cursor = new_start_at

        more_posts = fetch_page(cursor)
        n = len(more_posts)
        post_collections.extend(more_posts)

    return post_collections

In [None]:
def give_me_intervals(start_at, number_of_days_per_interval = 3):
    """Yield consecutive ``(start, end)`` epoch-second windows from ``start_at`` to now.

    The first window is ``(start_at, start_at + period)``. Each following
    window starts one second after the previous end and ends exactly one
    period after the previous end. The last window yielded is the first one
    whose end lies past the current time.

    Parameters
    ----------
    start_at : int or float
        Unix timestamp (seconds) at which the first window begins.
    number_of_days_per_interval : int or float
        Window length in days; must be positive.

    Yields
    ------
    tuple of (int, int)
        Start and end of each window, truncated to whole seconds.

    Raises
    ------
    ValueError
        If ``number_of_days_per_interval`` is not positive (the windows would
        never advance and the generator would never finish).
    """
    if number_of_days_per_interval <= 0:
        raise ValueError('number_of_days_per_interval must be positive')

    SECONDS_PER_DAY = 86400
    padding = 1

    # time.time() is the real Unix epoch. datetime.utcnow().timestamp() is
    # not: utcnow() is naive and timestamp() reads naive values as local
    # time, which skews the result by the machine's UTC offset.
    end_at = math.ceil(time.time())

    period = SECONDS_PER_DAY * number_of_days_per_interval
    end = start_at + period
    yield (int(start_at), int(end))
    while end <= end_at:
        start_at = end + padding
        end = end + period
        yield int(start_at), int(end)
## Try the generator out: weekly windows covering the last 365 days.
# time.time() gives the real Unix epoch; datetime.utcnow().timestamp() would
# be read as local time and skew the start by the machine's UTC offset.
start_at = math.floor(time.time() - timedelta(days=365).total_seconds())
list(give_me_intervals(start_at, 7))

In [None]:
# Set Reddit API credentials.
# SECURITY: the client id and secret are read from the environment instead of
# being hardcoded in the notebook. Export REDDIT_CLIENT_ID and
# REDDIT_CLIENT_SECRET before starting the kernel; a missing variable raises
# KeyError naming it. Any credentials previously committed in this notebook
# should be treated as leaked and rotated.
reddit = praw.Reddit(
     client_id=os.environ["REDDIT_CLIENT_ID"],
     client_secret=os.environ["REDDIT_CLIENT_SECRET"],
     user_agent="image-scraper"
)

In [None]:
# Output directory for the CSV exports. Replace with your directory location.
csv_dir = '../Data/Reddit_Comments/'

# Output directory for the parquet exports.
parquet_dir = '../Data/Parquet/'

# Create both directories up front: to_csv / to_parquet do not create missing
# parent directories and would otherwise fail only after hours of scraping.
for _output_dir in (csv_dir, parquet_dir):
    os.makedirs(_output_dir, exist_ok=True)

In [None]:
# Subreddits to scrape, in the order they are processed.
sub_list = [
    'Cryptocurrency',
    'Altcoin',
    'Bitcoin',
    'Ethereum',
    'BasicAttentionToken',
    'Best_of_Crypto',
    'BitcoinMarkets',
    'Blockchain',
    'CryptoMarkets',
    'CryptoTechnology',
    'CryptoTrade',
    'Algorand',
    'Tezos',
    'cosmosnetwork',
    'Polkadot',
    'Cardano',
    'Ankr',
]

In [None]:
# Attributes copied off each PRAW object into the exported tables.
POST_FIELDS = ['id', 'subreddit', 'author', 'created_utc', 'title', 'selftext',
               'score', 'num_comments', 'url', 'permalink']
COMMENT_FIELDS = ['id', 'link_id', 'parent_id', 'subreddit', 'author',
                  'created_utc', 'body', 'score', 'permalink']

# Pause between Reddit fetches. The name is kept from the original cell, but
# the pause is now taken once per submission (where the network requests
# happen) rather than once per comment: walking the already-expanded comment
# list makes no request, so per-comment sleeps only added hours of idle time.
TIMEOUT_AFTER_COMMENT_IN_SECS = .350


def _to_record(praw_obj, fields):
    """Copy ``fields`` off a PRAW object into a plain dict of exportable values."""
    record = {}
    for field in fields:
        value = getattr(praw_obj, field, None)
        # author / subreddit are PRAW model objects (author is None for
        # deleted accounts); store their names instead of the objects.
        if field in ('author', 'subreddit') and value is not None:
            value = str(value)
        record[field] = value
    return record


for subreddit in sub_list:
    # time.time() is the real Unix epoch; datetime.utcnow().timestamp() would
    # be read as local time and skew the window by the UTC offset.
    start_at = math.floor(time.time() - timedelta(days=365).total_seconds())

    # Collect submission ids from Pushshift, one weekly window at a time.
    posts = []
    for interval_start, interval_end in give_me_intervals(start_at, 7):
        posts.extend(pull_posts_for(subreddit, interval_start, interval_end))
        time.sleep(.500)

    # Pull the full posts and comment trees from Reddit.
    posts_from_reddit = []
    comments_from_reddit = []
    for submission_id in np.unique([post['id'] for post in posts]):
        submission = reddit.submission(id=submission_id)
        posts_from_reddit.append(submission)
        submission.comments.replace_more(limit=None)
        comments_from_reddit.extend(submission.comments.list())

        if TIMEOUT_AFTER_COMMENT_IN_SECS > 0:
            time.sleep(TIMEOUT_AFTER_COMMENT_IN_SECS)

    # Build the tables from plain records. Passing the PRAW objects straight
    # to pd.DataFrame yields a single object column whose text form is only
    # the id, and which parquet cannot serialise.
    p = pd.DataFrame(
        [_to_record(post, POST_FIELDS) for post in posts_from_reddit],
        columns=POST_FIELDS)
    c = pd.DataFrame(
        [_to_record(comment, COMMENT_FIELDS) for comment in comments_from_reddit],
        columns=COMMENT_FIELDS)

    # Date string for the output file names
    timestr = time.strftime("%Y%m%d")

    # Save dataframes to csv files
    p.to_csv(csv_dir + subreddit + 'posts_'  + timestr + '.csv', index = False)
    c.to_csv(csv_dir + subreddit + 'comments_'  + timestr + '.csv', index = False)

    # Save dataframes to parquet files
    p.to_parquet(parquet_dir + subreddit + 'posts_' + timestr + '.parquet', engine='fastparquet')
    c.to_parquet(parquet_dir + subreddit + 'comments_' + timestr + '.parquet', engine='fastparquet')

    # Breathe between subreddits.
    time.sleep(5)

In [None]:
# Single-subreddit run: collect the last 365 days of submission ids.
subreddit = 'Siacoin'
# time.time() is the real Unix epoch; datetime.utcnow().timestamp() would be
# read as local time and skew the window by the machine's UTC offset.
start_at = math.floor(time.time() - timedelta(days=365).total_seconds())
posts = []
for interval_start, interval_end in give_me_intervals(start_at, 7):
    posts.extend(pull_posts_for(subreddit, interval_start, interval_end))
    time.sleep(.500)
## ~ 4306 rows pulled
print(len(posts))
## ~ 4306 distinct submission ids
print(len(np.unique([ post['id'] for post in posts ])))

In [None]:
## WARNING: REDDIT WILL THROTTLE YOU IF YOU ARE ANNOYING! BE KIND!
# The pause is taken once per submission, which is where the network requests
# happen (fetching the submission and expanding its "more comments" stubs).
# Walking the already-expanded comment list makes no request, so sleeping per
# comment only added hours of idle time. The constant keeps its original name.
TIMEOUT_AFTER_COMMENT_IN_SECS = .350
posts_from_reddit = []
comments_from_reddit = []
for submission_id in np.unique([ post['id'] for post in posts ]):
    submission = reddit.submission(id=submission_id)
    posts_from_reddit.append(submission)
    # Resolve every MoreComments placeholder so list() returns the full tree.
    submission.comments.replace_more(limit=None)
    comments_from_reddit.extend(submission.comments.list())

    if TIMEOUT_AFTER_COMMENT_IN_SECS > 0:
        time.sleep(TIMEOUT_AFTER_COMMENT_IN_SECS)
## ~ 4306
print(len(posts_from_reddit))
## ~ 35216
print(len(comments_from_reddit))

In [None]:
# Attributes copied off each PRAW object into the exported tables.
POST_FIELDS = ['id', 'subreddit', 'author', 'created_utc', 'title', 'selftext',
               'score', 'num_comments', 'url', 'permalink']
COMMENT_FIELDS = ['id', 'link_id', 'parent_id', 'subreddit', 'author',
                  'created_utc', 'body', 'score', 'permalink']


def _to_record(praw_obj, fields):
    """Copy ``fields`` off a PRAW object into a plain dict of exportable values."""
    record = {}
    for field in fields:
        value = getattr(praw_obj, field, None)
        # author / subreddit are PRAW model objects (author is None for
        # deleted accounts); store their names instead of the objects.
        if field in ('author', 'subreddit') and value is not None:
            value = str(value)
        record[field] = value
    return record


# Create dataframes from plain records. Passing the PRAW objects straight to
# pd.DataFrame yields a single object column whose text form is only the id,
# and which parquet cannot serialise.
p = pd.DataFrame(
    [_to_record(post, POST_FIELDS) for post in posts_from_reddit],
    columns=POST_FIELDS)
c = pd.DataFrame(
    [_to_record(comment, COMMENT_FIELDS) for comment in comments_from_reddit],
    columns=COMMENT_FIELDS)

# Create date string for the output file names
timestr = time.strftime("%Y%m%d")

# Save dataframes to csv files
p.to_csv(csv_dir + subreddit + 'posts_' + timestr + '.csv', index = False)
c.to_csv(csv_dir + subreddit + 'comments_' + timestr + '.csv', index = False)

# Save dataframes to parquet files. Posts and comments get distinct names,
# matching the multi-subreddit cell; the previously commented-out version
# wrote both tables to the same path.
p.to_parquet(parquet_dir + subreddit + 'posts_' + timestr + '.parquet', engine='fastparquet')
c.to_parquet(parquet_dir + subreddit + 'comments_' + timestr + '.parquet', engine='fastparquet')