In [92]:
import datetime
import json
import os
import re
import time

import pandas as pd
import praw
import requests

In [127]:
# strptime/strftime pattern shared by every date string in this script,
# e.g. 1988-02-18 18:00:00
date_pattern = '%Y-%m-%d %H:%M:%S'

# earliest date to search from; must be a string written in date_pattern
limit_date = '2018-06-18 00:00:00'

# width, in minutes, of each time window queried. Scraping time is inversely
# proportional to this number (wider windows mean fewer requests).
# expects an int (the derived interval is used as a range() step)
minutes = 60

# subset of useful columns kept from the submissions data
sub_subset = [
    'author',
    'created_utc',
    'full_link',
    'id',
    'num_comments',
    'permalink',
    'retrieved_on',
    'subreddit',
    'subreddit_id',
    'title',
    'timestamp',
]

# convert a timestamp string into Epoch Unix seconds
# visit https://www.epochconverter.com for further documentation
# expects a string following the format set above in date_pattern;
# time.mktime() interprets the parsed value in the machine's local time zone
def get_epoch(date_time):
    parsed = time.strptime(date_time, date_pattern)
    return int(time.mktime(parsed))

# converts an interval given in minutes into whole seconds.
# expects an int/float; always returns an int so the result can be used
# directly as a range() step (range() raises TypeError on floats)
def min_interval(minutes):
    return int(round(minutes * 60))

# turns an Epoch Unix value into a datetime object
# (datetime.fromtimestamp returns a naive datetime in local time)
def get_date(submission):
    return datetime.datetime.fromtimestamp(submission)

# gets string-formatted current time in the machine's local time zone.
# get_epoch() parses strings with time.mktime(), which assumes local time, so
# the string has to be built with localtime() (not gmtime()) for it to
# round-trip to the real current epoch.
now = time.strftime(date_pattern, time.localtime())

# creates list of Epoch Unix notation times: one split point every `minutes`
# minutes, starting at limit_date and stopping before now
time_splits = list(range(get_epoch(limit_date), get_epoch(now), min_interval(minutes)))
# number of split points; consecutive pairs of them form the query windows,
# so anything newer than the last split point is not queried
length = len(time_splits)

# Pushshift query configuration
# API reference: https://github.com/pushshift/api

# endpoint that every submission search is sent to
base_url = 'https://api.pushshift.io/reddit/search/submission/?'

# name of the subreddit to scrape (case insensitive)
subreddit = 'nootropics'

# maximum number of search results requested per iteration [1-500]
size = 500

# empty pandas DataFrame that will hold the scraped submissions
sub_df = pd.DataFrame()

# loops through consecutive (after, before) time windows and collects results.
# there is one window per pair of neighbouring split points, i.e. length - 1.
n_windows = max(length - 1, 0)
frames = []
total_len = 0

for i in range(n_windows):
    # time queries
    after = time_splits[i]
    before = time_splits[i + 1]

    # make get request for data; requests builds and escapes the query string.
    # the timeout keeps one stalled connection from hanging the whole scrape.
    r = requests.get(
        base_url,
        params={'subreddit': subreddit, 'after': after, 'before': before, 'size': size},
        timeout=30,
    )
    # fail loudly on an HTTP error instead of on an unexpected response body
    r.raise_for_status()

    # keep each non-empty batch; the frames are combined once after the loop
    json_data = r.json()['data']
    if json_data:
        frames.append(pd.DataFrame(json_data))
    total_len += len(json_data)

    print('cur_iter:', '{} / {}'.format(i + 1, n_windows), 'scraped_len:', len(json_data), 'total_len:', total_len)

print('submission scraping done')

# build the table in one step with a clean 0..n-1 index (concatenating inside
# the loop would re-copy every previously scraped row on each iteration).
# with no results at all, fall back to an empty table that still has the
# expected columns so the steps below keep working.
if frames:
    sub_df = pd.concat(frames, ignore_index=True)
else:
    sub_df = pd.DataFrame(columns=sub_subset)

# get datetime
sub_df['timestamp'] = sub_df['created_utc'].apply(get_date)

# export csv
sub_df[sub_subset].to_csv('./submission_raw_data.csv', index=False)
print('submission file saved')

# start reddit instance with dev app permissions.
# credentials are read from environment variables so that no secret lives in
# the source file; a missing variable raises KeyError naming that variable.
# NOTE(review): the client secret and account password that used to be
# hard-coded here were exposed in the file and should be rotated.
reddit = praw.Reddit(client_id=os.environ['REDDIT_CLIENT_ID'],
                     client_secret=os.environ['REDDIT_CLIENT_SECRET'],
                     user_agent='ped-subr',
                     username=os.environ['REDDIT_USERNAME'],
                     password=os.environ['REDDIT_PASSWORD'])

# dict of parallel lists used to store scraped data: index k of every list
# describes the same comment
comments_dict = {'submission_id': [], 'body': [], 'comment_id': [], 'parent_id': [],
                 'created_utc': [], 'author': [], 'score': []}

# loop over submission ids scraped above and get comments
list_ids = sub_df['id']
length_ids = len(list_ids)

for x, submission in enumerate(list_ids, start=1):
    # request submission
    get_submission = reddit.submission(id=submission)

    # handle replace more: limit=None asks PRAW to resolve every placeholder
    # see https://praw.readthedocs.io/en/latest/tutorials/comments.html#the-replace-more-method for further documentation
    get_submission.comments.replace_more(limit=None)

    # get flattened list of comments (all levels/tiers)
    list_comments = get_submission.comments.list()
    length_comments = len(list_comments)

    # total_len is reported before this submission's comments are added
    print('cur_iter: {} / {}'.format(x, length_ids), 'scraped_len:', length_comments,
          'total_len:', len(comments_dict['submission_id']))

    # extract data from response and pass it to dict format
    for comment in list_comments:
        comments_dict['submission_id'].append(submission)
        comments_dict['body'].append(comment.body)
        comments_dict['comment_id'].append(comment.id)
        comments_dict['parent_id'].append(comment.parent_id)
        comments_dict['created_utc'].append(comment.created_utc)
        # store the author's name as plain text rather than the PRAW object;
        # PRAW gives None when the author is no longer available
        author = comment.author
        comments_dict['author'].append(None if author is None else str(author))
        comments_dict['score'].append(comment.score)

# load the scraped comment lists into a pandas DataFrame
comm_df = pd.DataFrame(comments_dict)

print('comment scraping done')
# set proper indexes for rows
comm_df = comm_df.reset_index(drop=True)

# get datetime
comm_df['timestamp'] = comm_df['created_utc'].apply(get_date)

# remove the "t<digit>_" type prefix from the parent id.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, which would leave the prefix in place.
comm_df['parent_id_edit'] = comm_df['parent_id'].str.replace(r't\d_', '', regex=True)

# export csv
comm_df.to_csv('./comment_raw_data.csv', index=False)
print('comment file saved')

cur_iter: 1 / 45 scraped_len: 0 total_len: 0
cur_iter: 2 / 45 scraped_len: 1 total_len: 1
cur_iter: 3 / 45 scraped_len: 1 total_len: 2
cur_iter: 4 / 45 scraped_len: 1 total_len: 3
cur_iter: 5 / 45 scraped_len: 0 total_len: 3
cur_iter: 6 / 45 scraped_len: 1 total_len: 4
cur_iter: 7 / 45 scraped_len: 2 total_len: 6
cur_iter: 8 / 45 scraped_len: 4 total_len: 10
cur_iter: 9 / 45 scraped_len: 1 total_len: 11
cur_iter: 10 / 45 scraped_len: 1 total_len: 12
cur_iter: 11 / 45 scraped_len: 4 total_len: 16
cur_iter: 12 / 45 scraped_len: 0 total_len: 16
cur_iter: 13 / 45 scraped_len: 1 total_len: 17
cur_iter: 14 / 45 scraped_len: 2 total_len: 19
cur_iter: 15 / 45 scraped_len: 2 total_len: 21
cur_iter: 16 / 45 scraped_len: 1 total_len: 22
cur_iter: 17 / 45 scraped_len: 3 total_len: 25
cur_iter: 18 / 45 scraped_len: 1 total_len: 26
cur_iter: 19 / 45 scraped_len: 3 total_len: 29
cur_iter: 20 / 45 scraped_len: 2 total_len: 31
cur_iter: 21 / 45 scraped_len: 1 total_len: 32
cur_iter: 22 / 45 scraped_len

Unnamed: 0,author,created_utc,full_link,id,num_comments,permalink,retrieved_on,subreddit,subreddit_id,title,url,timestamp
0,Vargox,1529298102,https://www.reddit.com/r/Nootropics/comments/8...,8rwxj8,11,/r/Nootropics/comments/8rwxj8/experience_so_fa...,1529298103,Nootropics,t5_2r81c,Experience so far with phenylpiracetam,https://www.reddit.com/r/Nootropics/comments/8...,2018-06-18 01:01:42
1,peptidehunter,1529302572,https://www.reddit.com/r/Nootropics/comments/8...,8rxaxx,70,/r/Nootropics/comments/8rxaxx/congress_is_cons...,1529302573,Nootropics,t5_2r81c,Congress is considering a bill that would expa...,https://www.washingtonpost.com/news/wonk/wp/20...,2018-06-18 02:16:12
2,tronatula,1529308358,https://www.reddit.com/r/Nootropics/comments/8...,8rxr6h,10,/r/Nootropics/comments/8rxr6h/concerta_is_the_...,1529308360,Nootropics,t5_2r81c,"Concerta is the only ADHD drug in Vietnam, it ...",https://www.reddit.com/r/Nootropics/comments/8...,2018-06-18 03:52:38
3,pieandablowie,1529313196,https://www.reddit.com/r/Nootropics/comments/8...,8ry3qc,15,/r/Nootropics/comments/8ry3qc/expired_aniracet...,1529313197,Nootropics,t5_2r81c,"Expired Aniracetam, looking for a European sup...",https://www.reddit.com/r/Nootropics/comments/8...,2018-06-18 05:13:16
4,priyankasharma5490,1529317165,https://www.reddit.com/r/Nootropics/comments/8...,8ryf07,2,/r/Nootropics/comments/8ryf07/best_place_to_bu...,1529317167,Nootropics,t5_2r81c,Best Place To Buy Sun Modalert online - Modale...,https://www.worldpharmameds.com/product/modale...,2018-06-18 06:19:25
5,MrNeurotypical,1529319090,https://www.reddit.com/r/Nootropics/comments/8...,8rykld,17,/r/Nootropics/comments/8rykld/diy_telomerase_a...,1529319091,Nootropics,t5_2r81c,DIY Telomerase activators?,https://www.reddit.com/r/Nootropics/comments/8...,2018-06-18 06:51:30
6,tehothealkaloid,1529319904,https://www.reddit.com/r/Nootropics/comments/8...,8ryn88,2,/r/Nootropics/comments/8ryn88/nootropics_for_t...,1529319905,Nootropics,t5_2r81c,Nootropics for TBI's?,https://www.reddit.com/r/Nootropics/comments/8...,2018-06-18 07:05:04
7,theibbster,1529320498,https://www.reddit.com/r/Nootropics/comments/8...,8ryp81,5,/r/Nootropics/comments/8ryp81/gaining_from_exp...,1529320499,Nootropics,t5_2r81c,Gaining from explaining: Learning improves fro...,https://www.sciencedirect.com,2018-06-18 07:14:58
8,radonase,1529320962,https://www.reddit.com/r/Nootropics/comments/8...,8ryqon,1,/r/Nootropics/comments/8ryqon/which_racetam_do...,1529320962,Nootropics,t5_2r81c,Which racetam + dosage for verbal fluency/silv...,https://www.reddit.com/r/Nootropics/comments/8...,2018-06-18 07:22:42
9,EastHelp,1529322339,https://www.reddit.com/r/Nootropics/comments/8...,8ryv4h,30,/r/Nootropics/comments/8ryv4h/cordyceps_amazin...,1529322340,Nootropics,t5_2r81c,Cordyceps - amazing stuff,https://www.reddit.com/r/Nootropics/comments/8...,2018-06-18 07:45:39
