In [1]:
# {
#     'id': s.id,
#     'author': s.author.name,
#     'created_utc': s.created_utc,
#     'link_flair_text': s.link_flair_text,
#     'author_flair_text': s.author_flair_text,
#     'num_comments': s.num_comments,
#     'score': s.score,
#     'ups': s.ups,
#     'downs': s.downs,
#     'selftext': s.selftext,
#     'title': s.title,
#     'gilded': s.gilded,
#     'subreddit': s.subreddit.name,
#     'name': s.name
# }

In [None]:
# {
#     'id': c.id,
#     'author': c.author.name,
#     'created_utc': c.created_utc,
#     'link_id': c.link_id,
#     'link_flair_text': c.submission.link_flair_text,
#     'author_flair_text': c.author_flair_text,
#     'score': c.score,
#     'ups': c.ups,
#     'downs': c.downs,
#     'body': c.body,
#     'gilded': c.gilded,
#     'subreddit': c.subreddit.name,
#     'name': c.name
# }


In [5]:
import rethinkdb as r
import delorean
from bigquery import get_client
from delorean import stops, parse, Delorean, epoch
from datetime import timedelta
from retrying import retry
import itertools
import time
import datetime
import sys
import os
import praw
import logging; logger=logging.getLogger(); logger.setLevel('INFO'); logging.basicConfig()

# Global helpers
c = r.connect(os.getenv('RETHINKDB_HOST', 'rethinkdb.reddit-analyzer'), os.getenv('RETHINKDB_PORT', 28015))
UA = '/r/%s flair analyzer by /u/coffenbacher for https://charlesoffenbacher.com' % sys.argv[1]
pr = praw.Reddit(user_agent=UA)
db = r.db('reddit')


client = get_client(os.getenv('BIGQUERY_PROJECT_ID'), 
                    service_account=os.getenv('BIGQUERY_SERVICEACCOUNT_ID'),
                    private_key_file='../'+os.getenv('BIGQUERY_PRIVATEKEY_FILE'), 
                    readonly=False)

# Helper
def grouper(n, iterable, fillvalue=None):
    args = [iter(iterable)] * n
    return ([e for e in t if e != None] for t in itertools.izip_longest(*args))

def get_submissions_between_epochs(start, end, subreddit):
    logger.info('Running search for range %s->%s' % (epoch(start).datetime.strftime('%x'), epoch(end).datetime.strftime('%x')))
    query = 'timestamp:%d..%d' % (start, end)
    return pr.search(query, subreddit=subreddit, sort='new', limit=1000, syntax='cloudsearch')

def get_bq_submission(s):
    return {
            'id': s.id,
            'author': s.author.name,
            'created_utc': s.created_utc,
            'link_flair_text': s.link_flair_text,
            'author_flair_text': s.author_flair_text,
            'num_comments': s.num_comments,
            'score': s.score,
            'ups': s.ups,
            'downs': s.downs,
            'selftext': s.selftext,
            'title': s.title,
            'gilded': s.gilded,
            'subreddit': s.subreddit.name,
            'name': s.name
        }

def get_bq_comment(comment):
    return {
            'id': comment.id,
            'author': comment.author.name,
            'created_utc': comment.created_utc,
            'link_id': comment.link_id,
            'link_flair_text': comment.submission.link_flair_text,
            'author_flair_text': comment.author_flair_text,
            'score': comment.score,
            'ups': comment.ups,
            'downs': comment.downs,
            'body': comment.body,
            'gilded': comment.gilded,
            'subreddit': comment.subreddit.name,
            'name': comment.name
           }

def extract_data(subreddit, epoch_increment=86400):
    current_progress = db.table('progress').get('current_%s' % subreddit).run(c)
    start = current_progress.get('epoch')
    end = start + epoch_increment
    
    submissions = []
    for s in get_submissions_between_epochs(start, end, subreddit):
        logger.info('Inserting submission %s' % s.id)
        submissions.append(get_bq_submission(s))
        
        logger.info('Getting comments for submission %s' % s.id)
        comments = []
        s.replace_more_comments(limit=None)
        for comment in praw.helpers.flatten_tree(s.comments):
            if isinstance(comment, praw.objects.Comment) and comment.author:
                print comment.id, comment.author.name
                comments.append(get_bq_comment(comment))

        for bq_comments in grouper(450, comments):
            logger.info('Inserting %s comments' % len(bq_comments))
            client.push_rows('reddit', '%s_comments' % subreddit, bq_comments, 'id')
    for bq_submissions in grouper(450, submissions):
        logger.info('Inserting %s submissions' % len(bq_submissions))
        client.push_rows('reddit', '%s_submissions' % subreddit, bq_submissions, 'id')
            
    db.table('progress').insert({'id': 'current_%s' % subreddit, 'epoch': end}, conflict="replace").run(c)
    db.table('progress').insert({'subreddit': subreddit, 'epoch': end, 'dt': r.now()}).run(c)
    
    
# if __name__ == "__main__":
#     logger.info('Extracting data for subreddit %s' % sys.argv[1])
#     extract_data(sys.argv[1])
#     time.sleep(int(os.getenv('DELAY', 0)))
while 1:
    extract_data('cfb')

INFO:root:Running search for range 09/21/10->09/22/10
INFO:requests.packages.urllib3.connectionpool:Starting new HTTPS connection (1): api.reddit.com
INFO:root:Inserting submission dh6e5
INFO:root:Getting comments for submission dh6e5
INFO:root:Inserting 2 comments
INFO:googleapiclient.discovery:URL being requested: POST https://www.googleapis.com/bigquery/v2/projects/stable-balancer-95721/datasets/reddit/tables/cfb_comments/insertAll?alt=json
INFO:oauth2client.client:Attempting refresh to obtain initial access_token
INFO:oauth2client.client:Refreshing access_token
INFO:root:Inserting submission dh209
INFO:root:Getting comments for submission dh209


c106kha czntix05
c109djz yep_yeppers
c1069sj sushionagrill
c106loa

INFO:root:Inserting 2 comments
INFO:googleapiclient.discovery:URL being requested: POST https://www.googleapis.com/bigquery/v2/projects/stable-balancer-95721/datasets/reddit/tables/cfb_comments/insertAll?alt=json
INFO:root:Inserting submission dgxw0
INFO:root:Getting comments for submission dgxw0
INFO:root:Inserting submission dgvuu
INFO:root:Getting comments for submission dgvuu


 sushionagrill
c1044lf ms3000
c10462k RobbStark
c104xjz blueboybob
c1059b2 RobbStark
c1041xz Fergi
c104sa5 hesnothere
c105pro Arronwy
c10556x tandembandit
c10567j

INFO:root:Inserting 13 comments
INFO:googleapiclient.discovery:URL being requested: POST https://www.googleapis.com/bigquery/v2/projects/stable-balancer-95721/datasets/reddit/tables/cfb_comments/insertAll?alt=json
INFO:root:Inserting submission dgvtr
INFO:root:Getting comments for submission dgvtr
INFO:root:Inserting submission dgvtg
INFO:root:Getting comments for submission dgvtg


 wcalvert
c105poh Arronwy
c10570h tandembandit
c104q7n corkill
c108vje mjxl47
c1047e6

INFO:root:Inserting 1 comments
INFO:googleapiclient.discovery:URL being requested: POST https://www.googleapis.com/bigquery/v2/projects/stable-balancer-95721/datasets/reddit/tables/cfb_comments/insertAll?alt=json
INFO:root:Inserting 6 submissions
INFO:googleapiclient.discovery:URL being requested: POST https://www.googleapis.com/bigquery/v2/projects/stable-balancer-95721/datasets/reddit/tables/cfb_submissions/insertAll?alt=json
INFO:root:Running search for range 09/22/10->09/23/10
INFO:root:Inserting submission dhhpx
INFO:root:Getting comments for submission dhhpx


 pthoresen
c108dn6

INFO:root:Inserting 3 comments
INFO:googleapiclient.discovery:URL being requested: POST https://www.googleapis.com/bigquery/v2/projects/stable-balancer-95721/datasets/reddit/tables/cfb_comments/insertAll?alt=json
INFO:root:Inserting submission dhhm5
INFO:root:Getting comments for submission dhhm5


 redbenn
c108ovd levimills
c10lnbx BuckeyeCIC
c108m76 buttlordZ
c108hbg audiostatic82
c108jyz domcolosi
c108t4i mjxl47
c108lcm CreepyOkie
c108ozu OU405
c109oo1 m1ss1ontomars2k4
c10aukh wcalvert
c108no4 sushionagrill
c108pcv OU405
c1096vu xsvfan
c109rke hesnothere
c108swe mjxl47
c109fx8 osfn8
c108o47 TurkishRambo30
c108tcg mjxl47
c1096gq AU2010
c109foh mjxl47
c10l6me AlexisDeTocqueville
c108hak sushionagrill
c108s94 sushionagrill
c108vgu sushionagrill
c108wsx sushionagrill
c109c1s sushionagrill
c108t0p DeStijl
c108v7s sushionagrill
c108sf6 sushionagrill
c108ka5 domcolosi
c10bica wilk
c108ku9

INFO:root:Inserting 32 comments
INFO:googleapiclient.discovery:URL being requested: POST https://www.googleapis.com/bigquery/v2/projects/stable-balancer-95721/datasets/reddit/tables/cfb_comments/insertAll?alt=json
INFO:root:Inserting submission dhfen
INFO:root:Getting comments for submission dhfen


 sushionagrill
c1090tt domcolosi
c109qea caboosian
c10as7p

INFO:root:Inserting 1 comments
INFO:googleapiclient.discovery:URL being requested: POST https://www.googleapis.com/bigquery/v2/projects/stable-balancer-95721/datasets/reddit/tables/cfb_comments/insertAll?alt=json
INFO:root:Inserting 3 submissions
INFO:googleapiclient.discovery:URL being requested: POST https://www.googleapis.com/bigquery/v2/projects/stable-balancer-95721/datasets/reddit/tables/cfb_submissions/insertAll?alt=json
INFO:root:Running search for range 09/23/10->09/24/10
INFO:root:Inserting submission dhzws
INFO:root:Getting comments for submission dhzws


 wjg10
c10c62b CEOofEarthMITTROMNEY
c10eube Firehunter
c10cr3g LiptonCB
c10dd0z ED21
c10dlcu LiptonCB
c10f32z ED21
c10era2 ClemsonPoker
c10fjmj LiptonCB
c10fm79 ClemsonPoker
c10foaz LiptonCB
c10g4cy ClemsonPoker
c10gmxt ms3000
c10jv0h ClemsonPoker
c10cykr sirgippy
c10czsn Yesh
c10jtb3 sushionagrill
c10dk2o sushionagrill
c10f41b YeaISeddit
c10fg00 sushionagrill
c10fbzl sushionagrill
c10ficw fidler
c10j3zq sushionagrill
c10j3jf sushionagrill
c10stlu fidler
c10fk0h YeaISeddit
c10flae fidler
c10izye sushionagrill
c10i6xl sushionagrill
c10fwfb YeaISeddit
c10dmws Ologhai
c10eqxj ClemsonPoker
c10f550 YeaISeddit
c10bvam ms3000
c10ef50 ZAHANMA
c10djk8 sushionagrill
c10c16u yoraylee
c10djxo sushionagrill
c10igcb sushionagrill
c10c312 tme001
c10eeo6 JdoubleE5000
c10ipeu sushionagrill
c10ld8x JdoubleE5000
c10ldiz sushionagrill
c10eshj ClemsonPoker
c10f3q6 JdoubleE5000
c10fe8f ClemsonPoker
c10cg6v Arronwy
c10den2 troymcdavis
c10erup ClemsonPoker
c10dp9d Arronwy
c10jcfc troymcdavis
c10jcu8 Arronwy
c

INFO:root:Inserting 86 comments
INFO:googleapiclient.discovery:URL being requested: POST https://www.googleapis.com/bigquery/v2/projects/stable-balancer-95721/datasets/reddit/tables/cfb_comments/insertAll?alt=json
INFO:root:Inserting submission dhve4
INFO:root:Getting comments for submission dhve4


 bball0718
c10d8t0 treanorj
c10ckqc diatonic
c10coc2 PhilyCheese
c10cqvv LiptonCB
c10d28n ms3000
c10d775 wjg10
c10dja8 sushionagrill
c10ezz9 wjg10
c10ffu7 sushionagrill
c10dia9

INFO:root:Inserting 3 comments
INFO:googleapiclient.discovery:URL being requested: POST https://www.googleapis.com/bigquery/v2/projects/stable-balancer-95721/datasets/reddit/tables/cfb_comments/insertAll?alt=json
INFO:root:Inserting submission dhv5f
INFO:root:Getting comments for submission dhv5f


 Ologhai
c10eo66 yoraylee
c10gc06 Ologhai
c10b4rd kagrocery
c10bad5 magusg
c10bms0 RogerMexico
c10drff eggery
c10dtvp eggery
c10e171 eggery
c10bda2 yep_yeppers
c10mpl5

INFO:root:Inserting 8 comments
INFO:googleapiclient.discovery:URL being requested: POST https://www.googleapis.com/bigquery/v2/projects/stable-balancer-95721/datasets/reddit/tables/cfb_comments/insertAll?alt=json
INFO:root:Inserting submission dhull
INFO:root:Getting comments for submission dhull


 fuckyou_space
c10ax0b Fergi
c10k5fd

INFO:root:Inserting 6 comments
INFO:googleapiclient.discovery:URL being requested: POST https://www.googleapis.com/bigquery/v2/projects/stable-balancer-95721/datasets/reddit/tables/cfb_comments/insertAll?alt=json
INFO:root:Inserting submission dhsqn
INFO:root:Getting comments for submission dhsqn


 Xulton
c10av57 diatonic
c10b8bi OU405
c10cn1r demeteloaf
c10c3ff Jortsfan
c10d9m9

INFO:root:Inserting 1 comments
INFO:googleapiclient.discovery:URL being requested: POST https://www.googleapis.com/bigquery/v2/projects/stable-balancer-95721/datasets/reddit/tables/cfb_comments/insertAll?alt=json
INFO:root:Inserting 5 submissions
INFO:googleapiclient.discovery:URL being requested: POST https://www.googleapis.com/bigquery/v2/projects/stable-balancer-95721/datasets/reddit/tables/cfb_submissions/insertAll?alt=json
INFO:root:Running search for range 09/24/10->09/25/10
INFO:root:Inserting submission dikz5
INFO:root:Getting comments for submission dikz5


 dolderer
c10gpe0

INFO:root:Inserting 9 comments
INFO:googleapiclient.discovery:URL being requested: POST https://www.googleapis.com/bigquery/v2/projects/stable-balancer-95721/datasets/reddit/tables/cfb_comments/insertAll?alt=json
INFO:root:Inserting submission dihfz
INFO:root:Getting comments for submission dihfz


 SlurmsMackenzie
c10h17d Homestar
c10j6gh Jakegarr
c10hj50 gbacon
c10k6mf gbacon
c10h72r Jakegarr
c10hwvz steamfolk
c10k6q0 gbacon
c10gzt3 ED21
c10g62x ClemsonPoker


KeyboardInterrupt: 