## Data Collection from Reddit Cryptocurrency Subreddits

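Reference for the Pushshift-based approach used below: [How to scrape Reddit using pushshift.io via Python](https://medium.com/@pasdan/how-to-scrap-reddit-using-pushshift-io-via-python-a3ebcc9b83f4)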

In [2]:
import itertools
import json
import math
import os
import time
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import praw
import requests

In [3]:
def make_request(uri, max_retries = 5, timeout = 30):
    """GET ``uri`` and return its JSON-decoded body, retrying on failure.

    Args:
        uri: Fully formatted URL to request.
        max_retries: Total number of attempts. Values of 1 or less result in
            a single attempt.
        timeout: Seconds to wait for the server before an attempt is treated
            as failed, so a stalled connection cannot hang the notebook.

    Returns:
        The decoded JSON payload of the first successful (HTTP 200) response.

    Raises:
        AssertionError: If the final attempt returns a non-200 status.
        Exception: Whatever the final attempt raises (network error, invalid
            JSON, ...). Failures of earlier attempts are retried instead.
    """
    def fire_away(uri):
        response = requests.get(uri, timeout=timeout)
        # Raised explicitly (rather than via ``assert``) so the check survives
        # ``python -O``; the exception type is kept for backward compatibility.
        if response.status_code != 200:
            raise AssertionError(
                'GET {} returned HTTP {}'.format(uri, response.status_code))
        return json.loads(response.content)

    # All attempts but the last are guarded; the last one runs unguarded so
    # its error reaches the caller.
    for _ in range(max_retries - 1):
        time.sleep(1)
        try:
            return fire_away(uri)
        except Exception:
            # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
            # SystemExit propagate, so interrupting the kernel actually stops.
            time.sleep(1)
    return fire_away(uri)

In [4]:
def pull_posts_for(subreddit, start_at, end_at):
    """Collect submission stubs for ``subreddit`` from the Pushshift search API.

    Args:
        subreddit: Subreddit name substituted into the query string.
        start_at: Lower bound passed as the ``after`` query parameter.
        end_at: Upper bound passed as the ``before`` query parameter.

    Returns:
        A list of dicts with keys ``'id'``, ``'created_utc'`` and ``'prefix'``,
        one per distinct submission id, in the order they were received.
    """
    SIZE = 500
    OVERLAP_IN_SECS = 10
    URI_TEMPLATE = r'https://api.pushshift.io/reddit/search/submission?subreddit={}&after={}&before={}&size={}'

    def map_posts(posts):
        # NOTE(review): Reddit's fullname prefix for submissions is 't3_'
        # ('t4_' denotes messages). The value is left unchanged because code
        # outside this view may rely on it -- confirm.
        return [{
            'id': post['id'],
            'created_utc': post['created_utc'],
            'prefix': 't4_'
        } for post in posts]

    def fetch_page(after):
        uri = URI_TEMPLATE.format(subreddit, after, end_at, SIZE)
        return map_posts(make_request(uri)['data'])

    post_collections = fetch_page(start_at)
    seen_ids = {post['id'] for post in post_collections}
    n = len(post_collections)

    # NOTE(review): paging continues only while a page holds exactly SIZE
    # rows; if the server caps responses below SIZE, follow-up pages are never
    # requested -- confirm the API's current limit.
    while n == SIZE:
        # Step back a few seconds so posts sharing the boundary timestamp are
        # not skipped; the overlap is de-duplicated below.
        new_start_at = post_collections[-1]['created_utc'] - OVERLAP_IN_SECS
        more_posts = fetch_page(new_start_at)
        n = len(more_posts)

        fresh_posts = [post for post in more_posts if post['id'] not in seen_ids]
        if not fresh_posts:
            # A page with nothing new means the cursor cannot advance;
            # stop instead of requesting the same page forever.
            break
        seen_ids.update(post['id'] for post in fresh_posts)
        post_collections.extend(fresh_posts)

    return post_collections

In [5]:
def give_me_intervals(start_at, number_of_days_per_interval = 3, end_at = None):
    """Yield consecutive ``(start, end)`` windows of epoch seconds.

    The first window starts at ``start_at``. Each following window starts one
    second after the previous window's end. Windows keep being produced while
    the previous window's end is less than or equal to ``end_at``, so the
    last window may extend past ``end_at``.

    Args:
        start_at: Epoch seconds at which the first window begins.
        number_of_days_per_interval: Length of each window in days; must be
            positive.
        end_at: Epoch seconds to cover up to. Defaults to the current time.

    Yields:
        ``(start, end)`` tuples of ints.

    Raises:
        ValueError: If ``number_of_days_per_interval`` is not positive, which
            would otherwise make the generator loop forever.
    """
    if number_of_days_per_interval <= 0:
        raise ValueError('number_of_days_per_interval must be positive')

    if end_at is None:
        # time.time() is already seconds since the epoch in UTC. The previous
        # datetime.utcnow().timestamp() interpreted the naive UTC wall-clock
        # as local time, shifting "now" by the machine's UTC offset.
        end_at = math.ceil(time.time())

    ## 1 day = 86400,
    period = (86400 * number_of_days_per_interval)
    padding = 1
    end = start_at + period
    yield (int(start_at), int(end))
    while end <= end_at:
        start_at = end + padding
        end = (start_at - padding) + period
        yield int(start_at), int(end)
## Smoke-test the interval generator over the trailing 365 days.
# time.time() is already a UTC epoch value; datetime.utcnow().timestamp()
# would treat the naive UTC wall-clock as local time and shift the result by
# the machine's UTC offset.
start_at = math.floor(time.time() - timedelta(days=365).total_seconds())
list(give_me_intervals(start_at, 7))

[(1599607893, 1600212693),
 (1600212694, 1600817493),
 (1600817494, 1601422293),
 (1601422294, 1602027093),
 (1602027094, 1602631893),
 (1602631894, 1603236693),
 (1603236694, 1603841493),
 (1603841494, 1604446293),
 (1604446294, 1605051093),
 (1605051094, 1605655893),
 (1605655894, 1606260693),
 (1606260694, 1606865493),
 (1606865494, 1607470293),
 (1607470294, 1608075093),
 (1608075094, 1608679893),
 (1608679894, 1609284693),
 (1609284694, 1609889493),
 (1609889494, 1610494293),
 (1610494294, 1611099093),
 (1611099094, 1611703893),
 (1611703894, 1612308693),
 (1612308694, 1612913493),
 (1612913494, 1613518293),
 (1613518294, 1614123093),
 (1614123094, 1614727893),
 (1614727894, 1615332693),
 (1615332694, 1615937493),
 (1615937494, 1616542293),
 (1616542294, 1617147093),
 (1617147094, 1617751893),
 (1617751894, 1618356693),
 (1618356694, 1618961493),
 (1618961494, 1619566293),
 (1619566294, 1620171093),
 (1620171094, 1620775893),
 (1620775894, 1621380693),
 (1621380694, 1621985493),
 

In [6]:
# Set Reddit API Credentials
# Secrets are read from the environment so they are never stored in the
# notebook. Set REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET before starting the
# kernel; a missing variable raises KeyError here rather than failing later.
# NOTE(review): the client id/secret that were previously hard-coded in this
# cell should be treated as exposed and regenerated in the Reddit app settings.
reddit = praw.Reddit(
     client_id=os.environ["REDDIT_CLIENT_ID"],
     client_secret=os.environ["REDDIT_CLIENT_SECRET"],
     user_agent=os.environ.get("REDDIT_USER_AGENT", "image-scraper")
)

In [None]:
# Subreddits to scrape, one per line.
sub_list = [
    'Cryptocurrency',
    'Altcoin',
    'Best_of_Crypto',
    'BitcoinMarkets',
    'Blockchain',
    'CryptoMarkets',
    'CryptoTechnology',
    'CryptoTrade',
]

In [None]:
# For every subreddit: pull the last 365 days of submission ids from Pushshift
# in 7-day windows, then fetch each submission and its comments through PRAW.
for x in sub_list:
    subreddit = x
    # time.time() is already a UTC epoch value; datetime.utcnow().timestamp()
    # would treat the naive UTC wall-clock as local time and shift the window
    # by the machine's UTC offset.
    start_at = math.floor(time.time() - timedelta(days=365).total_seconds())
    posts = []
    for interval in give_me_intervals(start_at, 7):
        pulled_posts = pull_posts_for(
            subreddit, interval[0], interval[1])

        posts.extend(pulled_posts)
        # Pause between Pushshift windows.
        time.sleep(.500)

    ## WARNING: REDDIT WILL THROTTLE YOU IF YOU ARE ANNOYING! BE KIND!
    TIMEOUT_AFTER_COMMENT_IN_SECS = .350
    # Both lists are reset for each subreddit; only their sizes are reported.
    posts_from_reddit = []
    comments_from_reddit = []
    for submission_id in np.unique([ post['id'] for post in posts ]):
        submission = reddit.submission(id=submission_id)
        posts_from_reddit.append(submission)
        submission.comments.replace_more(limit=None)
        for comment in submission.comments.list():
            comments_from_reddit.append(comment)

            # NOTE(review): this pause runs once per collected comment, after
            # replace_more() has already been called for the submission --
            # confirm whether a per-submission pause was intended.
            if TIMEOUT_AFTER_COMMENT_IN_SECS > 0:
                time.sleep(TIMEOUT_AFTER_COMMENT_IN_SECS)

    print(len(posts_from_reddit))

    print(len(comments_from_reddit))

In [7]:
# Pull one year of submission stubs for a single subreddit in 7-day windows.
subreddit = 'Siacoin'
# time.time() is already a UTC epoch value; datetime.utcnow().timestamp()
# would treat the naive UTC wall-clock as local time and shift the window by
# the machine's UTC offset.
start_at = math.floor(time.time() - timedelta(days=365).total_seconds())
posts = []
for interval in give_me_intervals(start_at, 7):
    pulled_posts = pull_posts_for(
        subreddit, interval[0], interval[1])

    posts.extend(pulled_posts)
    # Pause between Pushshift windows.
    time.sleep(.500)
## number of post stubs pulled
print(len(posts))
## number of distinct submission ids among them
print(len(np.unique([ post['id'] for post in posts ])))

2379
2379


In [8]:
## WARNING: REDDIT WILL THROTTLE YOU IF YOU ARE ANNOYING! BE KIND!
TIMEOUT_AFTER_COMMENT_IN_SECS = .350
posts_from_reddit, comments_from_reddit = [], []

# Visit each distinct submission id once.
unique_submission_ids = np.unique([ post['id'] for post in posts ])
for submission_id in unique_submission_ids:
    submission = reddit.submission(id=submission_id)
    posts_from_reddit.append(submission)

    # Expand the comment tree before flattening it with .list().
    submission.comments.replace_more(limit=None)
    for comment in submission.comments.list():
        comments_from_reddit.append(comment)

        # Pause after every collected comment unless the delay is disabled.
        if TIMEOUT_AFTER_COMMENT_IN_SECS <= 0:
            continue
        time.sleep(TIMEOUT_AFTER_COMMENT_IN_SECS)

## number of submissions fetched
print(len(posts_from_reddit))
## number of comments collected across all submissions
print(len(comments_from_reddit))

In [None]:
# Export the "top" listing of every subreddit in sub_list to a dated CSV file.

# Define the name of the directory to be created. Replace with your directory location.
csv_dir = '../Data/Top/'

# Define directory for parquet file.
parquet_dir = '../Data/Parquet/'

# Create the CSV directory up front so to_csv() does not fail on a fresh checkout.
os.makedirs(csv_dir, exist_ok=True)

for x in sub_list:

    # Define name of subreddit
    sub = x

    # Sort Submissions by: rising, new, hot, gilded, controversial, top
    # Replace 'top' with any of the sorting options from above
    # Max limit of 1000 requests
    # Estimated time of 9 minutes to run this cell

    # One record per submission; the DataFrame is built once after the loop.
    d = []

    for submission in reddit.subreddit(sub).top(limit=None):
        d.append(
            {
                'Title': submission.title,
                'Score': submission.score,
                'Id':  submission.id,
                # NOTE(review): this stores the submission's comments object
                # itself, not comment text, so the CSV column will hold its
                # string representation -- confirm this is intended.
                'Comments': submission.comments,
                'Created_utc': submission.created_utc,
                'Number of Comments': submission.num_comments,
                'Upvote Percentage': submission.upvote_ratio,
                'Text': submission.selftext
                # Other attributes that can be added here: url, author,
                # is_original_content, link_flair_text, name, locked, over_18.
            }
        )

    df = pd.DataFrame(d)

    # Create date string for csv file name
    timestr = time.strftime("%Y%m%d")

    # Save dataframe to csv file
    df.to_csv(csv_dir + sub + '_' + timestr + '.csv', index = False)

    # Save to parquet file
    #df.to_parquet(parquet_dir + sub + '_' + timestr + '.parquet', engine='fastparquet')

    # Sleep 10 seconds to interrupt api connection
    # (was a bare sleep(10), which is undefined: only the time module is imported)
    time.sleep(10)