In [1]:
import praw
import pandas as pd
import configparser
import numpy as np
import datetime as dt
import pickle

# Setup Config for Reddit Data
# Store username/password data in reddit.config file
config = configparser.ConfigParser()
config.read('../reddit.config')

reddit = praw.Reddit(client_id=config['REDDIT_CONFIG']['client_id'],
                     client_secret=config['REDDIT_CONFIG']['client_secret'],
                     password=config['REDDIT_CONFIG']['password'],
                     user_agent=config['REDDIT_CONFIG']['user_agent'],
                     username=config['REDDIT_CONFIG']['username'])


def parse_comment(comment):
    try:
        author = comment.author.name
        author_flair = comment.author_flair_text
    except:
        author = 'None'
        author_flair = 'None'
    score = comment.score
    
    comment_id = comment
    comment_name = comment.name
    comment_fullname = comment.fullname
    comment_is_root = comment.is_root
    comment_parent = comment.parent()
    comment_approved_at_utc = comment.approved_at_utc
    comment_approved_by = comment.approved_by
    comment_created = comment.created
    comment_created_utc = comment.created_utc
    comment_created_utc_datetime = dt.datetime.fromtimestamp(comment.created_utc)
    comment_created_utc_date = created_utc_datetime.strftime(format='%d-%m-%y')
    comment_created_utc_time = created_utc_datetime.strftime(format='%H:%M:%S')
    comment_banned_at_utc = comment.banned_at_utc
    comment_banned_by = comment.banned_by
    comment_depth = comment.depth
    comment_num_reports = comment.num_reports
    comment_body = comment.body
    comment_body_parsed = comment.body.replace('\n',' ').replace('\t',' ').replace(',',' ')
    # Submission Details
    submission_id = comment.submission.id
    submission_title = comment.submission.title
    submission_created_utc = comment.submission.created_utc
    
    data = [author,
            author_flair,
            score,
            comment_id,
            comment_name,
            comment_fullname,
            comment_is_root,
            comment_parent,
            comment_approved_at_utc,
            comment_approved_by,
            comment_created,
            comment_created_utc,
            comment_created_utc_datetime,
            comment_created_utc_date,
            comment_created_utc_time,
            comment_banned_at_utc,
            comment_banned_by,
            comment_depth,
            comment_num_reports,
            comment_body,
            comment_body_parsed,
            submission_id,
            submission_title,
            submission_created_utc]
    
    return data
    
def pull_comments_and_save(submission_id):
    submission = reddit.submission(id=submission_id)
    submission_pickle_filename = 'data/{}_submission_pickle.p'.format(submission_id)
    pickle.dump(submission,open( submission_pickle_filename, "wb" ))
    game_comments = pd.DataFrame(columns=['author',
                                            'author_flair',
                                            'score',
                                            'comment_id',
                                            'comment_name',
                                            'comment_fullname',
                                            'comment_is_root',
                                            'comment_parent',
                                            'comment_approved_at_utc',
                                            'comment_approved_by',
                                            'comment_created',
                                            'comment_created_utc',
                                            'comment_created_utc_datetime',
                                            'comment_created_utc_date',
                                            'comment_created_utc_time',
                                            'comment_banned_at_utc',
                                            'comment_banned_by',
                                            'comment_depth',
                                            'comment_num_reports',
                                            'comment_body',
                                            'comment_body_parsed',
                                            'submission_id',
                                            'submission_title',
                                            'submission_created_utc'])
    print("Pulling Comments for Thread {} with {} Comments".format(submission.title, submission.num_comments))
    submission.comments.replace_more(limit=None)

    comment_queue = submission.comments[:]  # Seed with top-level
    comment_number = 1
    while comment_queue:
        comment = comment_queue.pop(0)
        try:
            data = parse_comment(comment)
            # Save Results
            game_comments.loc[comment_number] = data
        except:
            pass

        comment_queue.extend(comment.replies)
        comment_number += 1
    
    now = dt.datetime.now()
    print("Complete at: {}".format(now.strftime(format='%d-%m-%y %H:%M:%S')))
    return game_comments

games = pd.read_csv('Game_Thread_List_Only.csv')
threads = games['Thread ID'].tolist()

for submissionid in threads:
    try:
        df = pull_comments_and_save(submissionid)
        df.to_csv('data/{}.csv'.format(submissionid))
        df.to_pickle('data/{}.pickle'.format(submissionid))
    except:
        print("Pull for {} failed!!".format(submissionid))

In [2]:
thread_id = '7n9q5s'
df = pull_comments_and_save(thread_id)
df.to_csv('data/{}.csv'.format(submissionid))
df.to_pickle('data/{}.pickle'.format(submissionid))

Pulling Comments for Thread Game Thread: Washington Redskins (7-8) at New York Giants (2-13) with 612 Comments
Complete at: 03-02-18 16:27:22


NameError: name 'submissionid' is not defined

In [144]:
df = pull_comments_and_save(thread_id)

Pulling Comments for Thread Game Thread: Washington Redskins (7-8) at New York Giants (2-13) with 612 Comments
Complete at: 03-02-18 14:31:36


In [146]:
df.columns

Index(['author', 'author_flair', 'score', 'comment_id', 'comment_name',
       'comment_fullname', 'comment_is_root', 'comment_parent',
       'comment_approved_at_utc', 'comment_approved_by', 'comment_created',
       'comment_created_utc', 'comment_created_utc_datetime',
       'comment_created_utc_date', 'comment_created_utc_time',
       'comment_banned_at_utc', 'comment_banned_by', 'comment_depth',
       'comment_num_reports', 'comment_body', 'comment_body_parsed',
       'submission_id', 'submission_title', 'submission_created_utc'],
      dtype='object')

In [132]:
print(comment)
print(comment.name)
print(comment.fullname)
print(comment.is_root)
print(comment.parent())
print(comment.approved_at_utc)
print(comment.approved_by)
print(comment.created)
print(comment.created_utc)
created_utc_datetime = dt.datetime.fromtimestamp(comment.created_utc)
created_utc_date = created_utc_datetime.strftime(format='%d-%m-%y')
created_utc_time = created_utc_datetime.strftime(format='%H:%M:%S')
print(comment.banned_at_utc)
print(comment.banned_by)
print(comment.depth)
print(comment.num_reports)
print(comment.body)
print(comment.body.replace('\n',' ').replace('\t',' ').replace(',',' '))
# Submission Details
print(comment.submission.id)
print(comment.submission.title)
print(comment.submission.created_utc)

dl4xlrs
t1_dl4xlrs
t1_dl4xlrs
True
6rggif
None
None
1501836185.0
1501807385.0
None
None
0
None
I feel like I'm watching a video game with great graphics and physics but it doesn't have NFL licensing so they can't use real players likenesses
I feel like I'm watching a video game with great graphics and physics but it doesn't have NFL licensing so they can't use real players likenesses
6rggif
Game Thread: Arizona Cardinals (0-0) at Dallas Cowboys (0-0)
1501803005.0


In [10]:
submission = reddit.submission(id='6rggif')

In [16]:
comment = submission.comments[0]

# Try to look up timestamps after the fact

In [51]:
game_comments = pd.read_pickle('data/Data_notimestamps/5sa4s4.pickle')

In [64]:
game_comments['Com32ment Name'][1].split('_')[1]

'dddh0kf'

In [72]:
def get_utc(row):
    created_utc = reddit.comment(row['Comment Name'][1].split('_')[1]).created_utc
    return created_utc

In [77]:
game_comments_small = game_comments.head(50)

# Test Script

In [4]:
import praw
import pandas as pd
import configparser
import numpy as np
import datetime as dt
import pickle
import logging

logging.basicConfig(filename='pull_comments.log',
                    level=logging.DEBUG,
                    format='%(asctime)s %(message)s',
                    datefmt='%m/%d/%Y %I:%M:%S %p')


# Setup Config for Reddit Data
# Store username/password data in reddit.config file
config = configparser.ConfigParser()
config.read('../reddit.config')

reddit = praw.Reddit(client_id=config['REDDIT_CONFIG']['client_id'],
                     client_secret=config['REDDIT_CONFIG']['client_secret'],
                     password=config['REDDIT_CONFIG']['password'],
                     user_agent=config['REDDIT_CONFIG']['user_agent'],
                     username=config['REDDIT_CONFIG']['username'])


def parse_comment(comment):
    try:
        author = comment.author.name
        author_flair = comment.author_flair_text
    except:
        author = 'None'
        author_flair = 'None'
    score = comment.score

    comment_id = comment
    comment_name = comment.name
    comment_fullname = comment.fullname
    comment_is_root = comment.is_root
    comment_parent = comment.parent()
    comment_approved_at_utc = comment.approved_at_utc
    comment_approved_by = comment.approved_by
    comment_created = comment.created
    comment_created_utc = comment.created_utc
    comment_created_utc_datetime = dt.datetime.fromtimestamp(comment.created_utc)
    comment_created_utc_date = comment_created_utc_datetime.strftime(format='%d-%m-%y')
    comment_created_utc_time = comment_created_utc_datetime.strftime(format='%H:%M:%S')
    comment_banned_at_utc = comment.banned_at_utc
    comment_banned_by = comment.banned_by
    comment_depth = comment.depth
    comment_num_reports = comment.num_reports
    comment_body = comment.body
    comment_body_parsed = comment.body.replace('\n',' ').replace('\t',' ').replace(',',' ')
    # Submission Details
    submission_id = comment.submission.id
    submission_title = comment.submission.title
    submission_created_utc = comment.submission.created_utc

    data = [author,
            author_flair,
            score,
            comment_id,
            comment_name,
            comment_fullname,
            comment_is_root,
            comment_parent,
            comment_approved_at_utc,
            comment_approved_by,
            comment_created,
            comment_created_utc,
            comment_created_utc_datetime,
            comment_created_utc_date,
            comment_created_utc_time,
            comment_banned_at_utc,
            comment_banned_by,
            comment_depth,
            comment_num_reports,
            comment_body,
            comment_body_parsed,
            submission_id,
            submission_title,
            submission_created_utc]

    return data

def pull_comments_and_save(submission_id):
    submission = reddit.submission(id=submission_id)
    game_comments = pd.DataFrame(columns=['author',
                                            'author_flair',
                                            'score',
                                            'comment_id',
                                            'comment_name',
                                            'comment_fullname',
                                            'comment_is_root',
                                            'comment_parent',
                                            'comment_approved_at_utc',
                                            'comment_approved_by',
                                            'comment_created',
                                            'comment_created_utc',
                                            'comment_created_utc_datetime',
                                            'comment_created_utc_date',
                                            'comment_created_utc_time',
                                            'comment_banned_at_utc',
                                            'comment_banned_by',
                                            'comment_depth',
                                            'comment_num_reports',
                                            'comment_body',
                                            'comment_body_parsed',
                                            'submission_id',
                                            'submission_title',
                                            'submission_created_utc'])
    logging.info("ROBSLOG Pulling Comments for Thread {} with {} Comments".format(submission.title, submission.num_comments))
    submission.comments.replace_more(limit=None)

    submission_pickle_filename = 'data/{}_submission_pickle.p'.format(submission_id)
    pickle.dump(submission,open( submission_pickle_filename, "wb" ))

    comment_queue = submission.comments[:]  # Seed with top-level
    comment_number = 1

    while comment_queue:
        comment = comment_queue.pop(0)
        try:
            data = parse_comment(comment)
            # Save Results
            game_comments.loc[comment_number] = data
        except:
            pass
            logging.warning('ROBSLOG Count pull comment{}'.format(comment))

        comment_queue.extend(comment.replies)
        comment_number += 1

    now = dt.datetime.now()
    logging.info("ROBSLOG Complete at: {}".format(now.strftime(format='%d-%m-%y %H:%M:%S')))
    return game_comments

games = pd.read_csv('Game_Thread_List_Only.csv')
threads = games['Thread ID'].tolist()

threads = ['6xbdec']
for submissionid in threads:
    try:
        df = pull_comments_and_save(submissionid)
        df.to_csv('data/{}_parsed_comments.csv'.format(submissionid))
        df.to_pickle('data/{}_parsed_comments.pickle'.format(submissionid))
    except:
        logging.warning("ROBSLOG Pull for {} failed!!".format(submissionid))

In [7]:
games = games.sort_values('Number of Comments')