## Data Collection from Reddit Cryptocurrency Subreddits

https://medium.com/@pasdan/how-to-scrap-reddit-using-pushshift-io-via-python-a3ebcc9b83f4

https://medium.com/swlh/how-to-scrape-large-amounts-of-reddit-data-using-pushshift-1d33bde9286



In [10]:
import math
import json
import requests
import itertools
import numpy as np
import time
from datetime import datetime, timedelta
import praw
import pandas as pd

In [3]:
def make_request(uri, max_retries = 5):
    def fire_away(uri):
        response = requests.get(uri)
        assert response.status_code == 200
        return json.loads(response.content)
    current_tries = 1
    while current_tries < max_retries:
        try:
            time.sleep(1)
            response = fire_away(uri)
            return response
        except:
            time.sleep(1)
            current_tries += 1
    return fire_away(uri)

In [4]:
def pull_posts_for(subreddit, start_at, end_at):
    
    def map_posts(posts):
        return list(map(lambda post: {
            'id': post['id'],
            'created_utc': post['created_utc'],
            'prefix': 't4_'
        }, posts))
    
    SIZE = 500
    URI_TEMPLATE = r'https://api.pushshift.io/reddit/search/submission?subreddit={}&after={}&before={}&size={}'
    
    post_collections = map_posts( \
        make_request( \
            URI_TEMPLATE.format( \
                subreddit, start_at, end_at, SIZE))['data'])
    n = len(post_collections)
    while n == SIZE:
        last = post_collections[-1]
        new_start_at = last['created_utc'] - (10)
        
        more_posts = map_posts( \
            make_request( \
                URI_TEMPLATE.format( \
                    subreddit, new_start_at, end_at, SIZE))['data'])
        
        n = len(more_posts)
        post_collections.extend(more_posts)

    return post_collections

In [5]:
def give_me_intervals(start_at, number_of_days_per_interval = 3):
    
    end_at = math.ceil(datetime.utcnow().timestamp())
        
    ## 1 day = 86400,
    period = (86400 * number_of_days_per_interval)
    end = start_at + period
    yield (int(start_at), int(end))
    padding = 1
    while end <= end_at:
        start_at = end + padding
        end = (start_at - padding) + period
        yield int(start_at), int(end)
## test out the solution,
start_at = math.floor(\
     (datetime.utcnow() - timedelta(days=365)).timestamp())
list(give_me_intervals(start_at, 7))

[(1599607893, 1600212693),
 (1600212694, 1600817493),
 (1600817494, 1601422293),
 (1601422294, 1602027093),
 (1602027094, 1602631893),
 (1602631894, 1603236693),
 (1603236694, 1603841493),
 (1603841494, 1604446293),
 (1604446294, 1605051093),
 (1605051094, 1605655893),
 (1605655894, 1606260693),
 (1606260694, 1606865493),
 (1606865494, 1607470293),
 (1607470294, 1608075093),
 (1608075094, 1608679893),
 (1608679894, 1609284693),
 (1609284694, 1609889493),
 (1609889494, 1610494293),
 (1610494294, 1611099093),
 (1611099094, 1611703893),
 (1611703894, 1612308693),
 (1612308694, 1612913493),
 (1612913494, 1613518293),
 (1613518294, 1614123093),
 (1614123094, 1614727893),
 (1614727894, 1615332693),
 (1615332694, 1615937493),
 (1615937494, 1616542293),
 (1616542294, 1617147093),
 (1617147094, 1617751893),
 (1617751894, 1618356693),
 (1618356694, 1618961493),
 (1618961494, 1619566293),
 (1619566294, 1620171093),
 (1620171094, 1620775893),
 (1620775894, 1621380693),
 (1621380694, 1621985493),
 

In [6]:
# Set Reddit API Credentials
reddit = praw.Reddit(
     client_id="3gRvZgbUzCz_Tg",
     client_secret="BuNkAnAhJFZhtDMw303NyxiIonIkpg",
     user_agent="image-scraper"
)

In [11]:
# Define the name of the directory to be created. Replace with your directory location.
csv_dir = '../Data/Reddit_Comments/'

# Define directory for parquet file.
parquet_dir = '../Data/Parquet/'

In [None]:
sub_list = ['Cryptocurrency', 'Altcoin', 'Bitcoin', 'Ethereum', 'BasicAttentionToken', 'Best_of_Crypto', 'BitcoinMarkets', 
            'Blockchain', 'CryptoMarkets', 'CryptoTechnology', 'CryptoTrade', 'Algorand', 'Tezos', 'cosmosnetwork',
            'Polkadot', 'Cardano', 'Ankr']

In [None]:
for x in sub_list:
    subreddit = x
    start_at = math.floor(\
    (datetime.utcnow() - timedelta(days=365)).timestamp())
    posts = []
    for interval in give_me_intervals(start_at, 7):
        pulled_posts = pull_posts_for(
            subreddit, interval[0], interval[1])
    
        posts.extend(pulled_posts)
        time.sleep(.500)

    # Pull reddit posts and comments
    TIMEOUT_AFTER_COMMENT_IN_SECS = .350
    posts_from_reddit = []
    comments_from_reddit = []
    for submission_id in np.unique([ post['id'] for post in posts ]):
        submission = reddit.submission(id=submission_id)
        posts_from_reddit.append(submission)
        submission.comments.replace_more(limit=None)
        for comment in submission.comments.list():
            comments_from_reddit.append(comment)
        
            if TIMEOUT_AFTER_COMMENT_IN_SECS > 0:
                time.sleep(TIMEOUT_AFTER_COMMENT_IN_SECS)

    # Create DataFrame
    p = pd.DataFrame(posts_from_reddit)
    c = pd.DataFrame(comments_from_reddit)
    
    # Create date string for csv file name
    timestr = time.strftime("%Y%m%d")
    
    # Save dataframe to csv file
    p.to_csv(csv_dir + subreddit + 'posts_'  + timestr + '.csv', index = False)
    c.to_csv(csv_dir + subreddit + 'comments_'  + timestr + '.csv', index = False)

    # Save dataframe to parquet file
    p.to_parquet(parquet_dir + subreddit + 'posts_' + timestr + '.parquet', engine='fastparquet')
    c.to_parquet(parquet_dir + subreddit + 'comments_' + timestr + '.parquet', engine='fastparquet')

    time.sleep(5)

In [7]:
subreddit = 'Siacoin'
start_at = math.floor(\
    (datetime.utcnow() - timedelta(days=365)).timestamp())
posts = []
for interval in give_me_intervals(start_at, 7):
    pulled_posts = pull_posts_for(
        subreddit, interval[0], interval[1])
    
    posts.extend(pulled_posts)
    time.sleep(.500)
## ~ 4306
print(len(posts))
## ~ 4306
print(len(np.unique([ post['id'] for post in posts ])))

2379
2379


In [8]:
## WARNING: REDDIT WILL THROTTLE YOU IF YOU ARE ANNOYING! BE KIND!
TIMEOUT_AFTER_COMMENT_IN_SECS = .350
posts_from_reddit = []
comments_from_reddit = []
for submission_id in np.unique([ post['id'] for post in posts ]):
    submission = reddit.submission(id=submission_id)
    posts_from_reddit.append(submission)
    submission.comments.replace_more(limit=None)
    for comment in submission.comments.list():
        comments_from_reddit.append(comment)
        
        if TIMEOUT_AFTER_COMMENT_IN_SECS > 0:
            time.sleep(TIMEOUT_AFTER_COMMENT_IN_SECS)
## ~ 4306
print(len(posts_from_reddit))
## ~ 35216
print(len(comments_from_reddit))

2379
14860


In [19]:
# Create dataframe
p = pd.DataFrame(posts_from_reddit)
c = pd.DataFrame(comments_from_reddit)
    
# Create date string for csv file name
timestr = time.strftime("%Y%m%d")
    
# Save dataframe to csv file
p.to_csv(csv_dir + subreddit + 'posts_' + timestr + '.csv', index = False)
c.to_csv(csv_dir + subreddit + 'comments_' + timestr + '.csv', index = False)

# Save dataframe to parquet file
#p.to_parquet(parquet_dir + subreddit + '_' + timestr + '.parquet', engine='fastparquet')
#c.to_parquet(parquet_dir + subreddit + '_' + timestr + '.parquet', engine='fastparquet')

In [20]:
#!pip install pmaw

Collecting pmaw
  Downloading pmaw-1.1.0-py3-none-any.whl (18 kB)
Installing collected packages: pmaw
Successfully installed pmaw-1.1.0


In [21]:
import pandas as pd
from pmaw import PushshiftAPI
import datetime as dt

In [63]:
# Set date intervals
before = int(dt.datetime(2021,9,1,0,0).timestamp())
after = int(dt.datetime(2021,8,1,0,0).timestamp())

In [72]:
date1 = dt.date(2004, 9, 25)
date2 = dt.date(2004, 10, 8)
day = dt.timedelta(days=1)

while date1 <= date2:
    date1 = date1 + day
    print(date1.strftime('%Y.%m.%d'))

2004.09.26
2004.09.27
2004.09.28
2004.09.29
2004.09.30
2004.10.01
2004.10.02
2004.10.03
2004.10.04
2004.10.05
2004.10.06
2004.10.07
2004.10.08
2004.10.09


In [49]:
api = PushshiftAPI()
subreddit="Cryptocurrency"
limit=100000
comments = api.search_comments(subreddit=subreddit, limit=limit, before=before, after=after)
print(f'Retrieved {len(comments)} comments from Pushshift')

Total:: Success Rate: 95.15% - Requests: 1052 - Batches: 106 - Items Remaining: 0
Retrieved 100000 comments from Pushshift


In [50]:
# Create dataframe
comments_df = pd.DataFrame(comments)

In [51]:
pd.set_option('max_columns', None)

In [52]:
# preview the comments data
comments_df.head(5)

Unnamed: 0,all_awardings,associated_award,author,author_flair_background_color,author_flair_css_class,author_flair_richtext,author_flair_template_id,author_flair_text,author_flair_text_color,author_flair_type,author_fullname,author_patreon_flair,author_premium,awarders,body,collapsed_because_crowd_control,collapsed_reason_code,comment_type,created_utc,gildings,id,is_submitter,link_id,locked,no_follow,parent_id,permalink,retrieved_on,score,send_replies,stickied,subreddit,subreddit_id,top_awarded_type,total_awards_received,treatment_tags,media_metadata,distinguished,archived,body_sha1,can_gild,collapsed,collapsed_reason,controversiality,gilded,retrieved_utc,score_hidden,subreddit_name_prefixed,subreddit_type,author_cakeday,edited
0,[],,mazz0r2000,,Warning-level3,"[{'e': 'text', 't': '4 - 5 years account age. ...",,4 - 5 years account age. 63 - 125 comment karma.,dark,richtext,t2_4bnyfpj,False,False,[],"This is true, but that doesn't mean we should ...",,,,1628336879,{},h81oipu,False,t3_ozrsml,False,True,t3_ozrsml,/r/CryptoCurrency/comments/ozrsml/cryptocurren...,1628428000.0,3,True,False,CryptoCurrency,t5_2wlj3,,0,[],,,,,,,,,,,,,,,
1,[],,never_trust_a_whale,,Transitioning,"[{'e': 'text', 't': ' '}]",,,dark,richtext,t2_96ekeodg,False,False,[],"1/3 today, 3x in few months maybe. Hodl strong...",,,,1628336877,{},h81oin3,False,t3_ozrjtd,False,True,t3_ozrjtd,/r/CryptoCurrency/comments/ozrjtd/yesterday_i_...,1628428000.0,0,True,False,CryptoCurrency,t5_2wlj3,,0,[],,,,,,,,,,,,,,,
2,[],,National-Ad7627,,Transitioning,"[{'e': 'text', 't': ' '}]",,,dark,richtext,t2_7pjcxti7,False,False,[],Bro. you are right. but we have accumulation p...,,,,1628336877,{},h81oimh,False,t3_ozinnf,False,True,t1_h81objv,/r/CryptoCurrency/comments/ozinnf/daily_discus...,1628428000.0,1,True,False,CryptoCurrency,t5_2wlj3,,0,[],,,,,,,,,,,,,,,
3,[],,the-zoo-keeper29,,Transitioning,"[{'e': 'text', 't': ' '}]",,,dark,richtext,t2_435bezv3,False,False,[],It’s a beautiful thing to watch,,,,1628336877,{},h81oimb,False,t3_ozq8zv,False,True,t3_ozq8zv,/r/CryptoCurrency/comments/ozq8zv/eth_crosses_...,1628428000.0,1,True,False,CryptoCurrency,t5_2wlj3,,0,[],,,,,,,,,,,,,,,
4,[],,adithya_chittem,,,[],,,,text,t2_1o5hvyyx,False,False,[],Genuinely insane because eth still has so much...,,,,1628336876,{},h81oilb,False,t3_ozq8zv,False,True,t3_ozq8zv,/r/CryptoCurrency/comments/ozq8zv/eth_crosses_...,1628428000.0,2,True,False,CryptoCurrency,t5_2wlj3,,0,[],,,,,,,,,,,,,,,


In [56]:
comments_df.columns

Index(['all_awardings', 'associated_award', 'author',
       'author_flair_background_color', 'author_flair_css_class',
       'author_flair_richtext', 'author_flair_template_id',
       'author_flair_text', 'author_flair_text_color', 'author_flair_type',
       'author_fullname', 'author_patreon_flair', 'author_premium', 'awarders',
       'body', 'collapsed_because_crowd_control', 'collapsed_reason_code',
       'comment_type', 'created_utc', 'gildings', 'id', 'is_submitter',
       'link_id', 'locked', 'no_follow', 'parent_id', 'permalink',
       'retrieved_on', 'score', 'send_replies', 'stickied', 'subreddit',
       'subreddit_id', 'top_awarded_type', 'total_awards_received',
       'treatment_tags', 'media_metadata', 'distinguished', 'archived',
       'body_sha1', 'can_gild', 'collapsed', 'collapsed_reason',
       'controversiality', 'gilded', 'retrieved_utc', 'score_hidden',
       'subreddit_name_prefixed', 'subreddit_type', 'author_cakeday',
       'edited'],
      dtype='o

In [57]:
comments_df.shape

(100000, 51)

In [55]:
comments_df.to_csv('../Data/Reddit_Comments/Cryptocurrency_09012021.csv', header=True, index=False, columns=list(comments_df.axes[1]))

In [58]:
#!pip install pyarrow

Collecting pyarrow
  Downloading pyarrow-5.0.0-cp38-cp38-win_amd64.whl (14.5 MB)
Installing collected packages: pyarrow
Successfully installed pyarrow-5.0.0


In [59]:
# Write Pandas Dataframe to parquet
import pyarrow as pa
import pyarrow.parquet as pq

In [61]:
# Convert DataFrame to Apache Arrow Table
table = pa.Table.from_pandas(comments_df)

In [None]:
# Parquet write table
#pq.write_table(table, 'file_name.parquet')

In [62]:
# Parquet with GZIP compression
pq.write_table(table, '../Data/Reddit_Comments/Cryptocurrency_09012021.parquet', compression='GZIP')