In [1]:
import math
import json
import requests
import itertools
import numpy as np
import time
import pandas as pd
from datetime import datetime, timedelta, date

In [2]:
def make_request(uri, max_retries = 5):
    def fire_away(uri):
        response = requests.get(uri)
        assert response.status_code == 200
        return json.loads(response.content)
    current_tries = 1
    while current_tries < max_retries:
        try:
            time.sleep(1)
            response = fire_away(uri)
            return response
        except:
            time.sleep(1)
            current_tries += 1
    return fire_away(uri)

In [3]:
def pull_comments_for(subreddit, start_at, end_at):
    
    def map_comments(comments):
        return list(map(lambda comment: {
            'post_id': comment['link_id'],
            'created_utc': comment['created_utc'],
            'comment_body': comment['body'],
            'comment_author': comment['author'],
            'comment_id': comment['id']}, comments))
    
    SIZE = 500
    URI_TEMPLATE = r'https://api.pushshift.io/reddit/comment/search/?subreddit={}&after={}&before={}&size={}&filter=author,body,created_utc,link_id,id'
    
    comment_collections = map_comments( \
        make_request(URI_TEMPLATE.format(subreddit, start_at, end_at, SIZE))['data'])
    n = len(comment_collections)
    while n == SIZE:
        last = comment_collections[-1]
        new_start_at = last['created_utc'] - (10)
        
        more_comments = map_comments( \
            make_request(URI_TEMPLATE.format(subreddit, new_start_at, end_at, SIZE))['data'])
        
        n = len(more_comments)
        comment_collections.extend(more_comments)
    return comment_collections


In [4]:
def give_me_intervals(start_at, number_of_days_per_interval = 2):
    
    end_at = math.ceil(datetime.utcnow().timestamp())
    print('Unix start time:',end_at)
    period = (86400 * number_of_days_per_interval)
    end = start_at + period
    yield (int(start_at), int(end))
    padding = 1
    while end <= end_at:
        start_at = end + padding
        end = (start_at - padding) + period
        yield int(start_at), int(end)
        
start_at = int(date(2018,1,1).strftime("%s")) #start of 2018     
intervals = give_me_intervals(start_at, 1)

In [5]:
subreddit = 'Singapore'
comments_list = []
author_list = []
post_id_list = []
created_utc_list = []
comment_id_list = []
for interval in intervals:
    pulled_comments = pull_comments_for(subreddit, interval[0], interval[1])
    comments_list.extend([x['comment_body'] for x in pulled_comments])
    author_list.extend([x['comment_author'] for x in pulled_comments])
    post_id_list.extend([x['post_id'] for x in pulled_comments])
    created_utc_list.extend([x['created_utc'] for x in pulled_comments])
    comment_id_list.extend([x['comment_id'] for x in pulled_comments])
    print('Comment count:',len(comments_list),end='\r')
    time.sleep(.100)

Unix start time: 1556320934
Comment count: 1205099

In [6]:
df = pd.DataFrame({'Post Id':post_id_list, 'Comment Author':author_list,'Comment':comments_list,'Comment Date':created_utc_list, 'Comment ID':comment_id_list})
df.head(20)


Unnamed: 0,Post Id,Comment Author,Comment,Comment Date,Comment ID
0,t3_7n4f8o,Pesantkie,Happy New Year guys!! \n\nWatching Taipei 101 ...,1514736068,drzz8xv
1,t3_7n4f8o,Clinching97,Happy New Year Redditporeans!,1514736081,drzz994
2,t3_7n875r,sense_make,2017: Get 2B license and a motorcycle\n2018: G...,1514736090,drzz9hu
3,t3_7n4f8o,Undikaze,Happy new year!! My new year resolution for re...,1514736102,drzz9rs
4,t3_7n4f8o,trendyapple,happy new year guys,1514736126,drzzad2
5,t3_7n97hy,AutoModerator,Your submission has been removed because it do...,1514736172,drzzbkv
6,t3_7n97jm,AutoModerator,Your submission has been removed because it do...,1514736188,drzzbyi
7,t3_7n4f8o,potatomaster420,happy 18th!,1514736209,drzzcgw
8,t3_7n97kt,megaband,"Happy new year, my fellow yellows",1514736225,drzzcuu
9,t3_7n4f8o,CheeeeezyCrust,"To those alone, happy new year to you on behal...",1514736231,drzzd02


In [7]:
df.to_csv('~/Desktop/Project_Forum/reddit_data_raw.csv', index = False)