In [69]:
import pandas as pd
import numpy as np
import requests
import time

# thanks to https://stackoverflow.com/questions/5105517/deep-copy-of-a-dict-in-python for copy.deepcopy
import copy

In [81]:
# Notebook-wide configuration values
# How many submissions per subreddit we aim to collect so the models have enough to work with
DESIRED_POSTS_PER_SUBREDDIT = 10_000

# The subreddit's lifetime is sliced into this many equal periods for sampling
NUM_EPOCH_PERIODS = 20

# Base URL of PushShift's submission search endpoint
PUSHSHIFT_BASE_URL = 'https://api.pushshift.io/reddit/search/submission'

# Pause (passed to time.sleep) taken around HTTP requests so PushShift's throttling isn't triggered
REQUEST_SLEEP_TIME = 2

# Per-submission fields to pull back - subreddit, timestamp, title and text are the essentials,
# the remaining ones are extras that might prove interesting
SUBMISSION_FIELDS = [
    'subreddit',
    'id',
    'title',
    'author',
    'created_utc',
    'score',
    'selftext',
    'num_comments',
]

# NOTE: every query template below restricts results to "self" posts (not a bare link elsewhere)
# and leaves out posts that are just videos.

# Template for a count-only metadata query - callers add the subreddit and optional 'before'/'after' epochs
METADATA_BASE_PARAMS = dict(
    metadata=True,
    size=0,
    is_self=True,
    is_video=False,
)

# Template for finding one end of the subreddit's lifetime - callers add the subreddit and a sort direction
LIFETIME_PERIOD_BASE_PARAMS = dict(
    fields=['created_utc'],
    size=1,
    sort_type='created_utc',
    is_self=True,
    is_video=False,
)

# Template for fetching the submissions themselves - callers add the subreddit and an 'after' epoch
RETRIEVE_SUBS_BASE_PARAMS = dict(
    fields=SUBMISSION_FIELDS,
    is_self=True,
    is_video=False,
)

# Upper limit on the number of submissions asked for in a single request
MAX_SUBS_PER_REQUEST = 250

In [58]:
# The subreddits we want to compare.
# Kept as a list so more can be added later if this becomes a multi-class problem.
subreddits = [
    'jazz',
    'classicalmusic',
]

In [59]:
# Little utility function just to check our HTTP error code and throw an exception if it's not 200
def safe_request(url, params):
    """Issue a throttled GET request and fail loudly if it does not succeed.

    Sleeps for REQUEST_SLEEP_TIME before every call, then performs the GET and
    checks the HTTP status.

    Parameters
    ----------
    url : str
        Address to request.
    params : dict
        Query-string parameters handed to ``requests.get``.

    Returns
    -------
    The response object returned by ``requests.get``; only returned when the
    status check passed.

    Raises
    ------
    requests.RequestException
        Re-raised unchanged (after being printed) when the request itself
        fails or ``raise_for_status()`` rejects the HTTP status.
    """
    # Wait first so back-to-back calls are spaced out
    time.sleep(REQUEST_SLEEP_TIME)

    try:
        response = requests.get(url, params=params)
        # raise_for_status() has no useful return value; it signals failure by raising
        response.raise_for_status()
    except requests.RequestException as err:
        # Report the real cause, then let it propagate - returning here would hand
        # callers a missing or failed response that they would go on to parse
        print(f'Request to {url} failed: {err}')
        raise

    return response

In [68]:
# Function to retrieve metadata in preparation for sampling
# Returns a list of dictionaries of the form {period: int, start_epoch: int, next_epoch: int, num_posts: int, total_posts: int}
# (next_epoch is 0 for the final period, which has no upper bound)
# Also validates that we have enough posts overall for modeling
# Note this doesn't deal with the max chunk size issue - that's in the main retrieve function

def retrieve_period_metadata(subreddit):
    """Gather per-period post counts for one subreddit ahead of sampling.

    Queries the lifetime post count, the first and latest post timestamps,
    splits that span into NUM_EPOCH_PERIODS evenly spaced periods and asks
    for the post count of each one.

    Parameters
    ----------
    subreddit : str
        Name of the subreddit to inspect.

    Returns
    -------
    list of dict
        One dict per period with the keys 'period' (1-based), 'start_epoch',
        'next_epoch' (0 for the final period), 'num_posts' and 'total_posts'.

    Raises
    ------
    AssertionError
        If ``subreddit`` is not a str.
    Exception
        If the subreddit has fewer than DESIRED_POSTS_PER_SUBREDDIT posts.
    """
    # Only a single subreddit name is accepted
    assert type(subreddit) == str

    def _count_posts(**window):
        # Count-only query; `window` carries the optional 'after'/'before' bounds
        query = copy.deepcopy(METADATA_BASE_PARAMS)
        query['subreddit'] = subreddit
        query.update(window)
        return safe_request(PUSHSHIFT_BASE_URL, query).json()['metadata']['total_results']

    def _edge_timestamp(direction):
        # Creation epoch of the single post that comes first in the given sort direction
        query = copy.deepcopy(LIFETIME_PERIOD_BASE_PARAMS)
        query['subreddit'] = subreddit
        query['sort'] = direction
        return safe_request(PUSHSHIFT_BASE_URL, query).json()['data'][0]['created_utc']

    # Lifetime total, with a check that there is enough material for modeling
    lifetime_num_posts = _count_posts()
    print(f'Subreddit: {subreddit}, lifetime_num_posts: {lifetime_num_posts}')
    if not lifetime_num_posts >= DESIRED_POSTS_PER_SUBREDDIT:
        raise Exception(f'Insufficient number of posts in subreddit {subreddit} for analysis - needed {DESIRED_POSTS_PER_SUBREDDIT}, got {lifetime_num_posts}')

    # Both ends of the subreddit's life
    first_post_timestamp = _edge_timestamp('asc')
    print(f'Subreddit: {subreddit}, first_post_timestamp: {first_post_timestamp}')
    latest_post_timestamp = _edge_timestamp('desc')
    print(f'Subreddit: {subreddit}, latest_post_timestamp: {latest_post_timestamp}')

    # Evenly spaced boundaries; the final one is dropped because the latest
    # post's timestamp cannot serve as the start of a period
    boundaries = np.linspace(first_post_timestamp, latest_post_timestamp, NUM_EPOCH_PERIODS + 1)[:-1]
    # int() strips the fractional part linspace introduces
    starts = [int(boundary) for boundary in boundaries]
    # Each period ends where the following one starts; 0 marks the open-ended last period
    ends = starts[1:] + [0]

    collected = []
    for period_number, (start_epoch, next_epoch) in enumerate(zip(starts, ends), start=1):
        if next_epoch > 0:
            period_num_posts = _count_posts(after=start_epoch, before=next_epoch)
        else:
            period_num_posts = _count_posts(after=start_epoch)

        period_data = {
            'period': period_number,
            'start_epoch': start_epoch,
            'next_epoch': next_epoch,
            'num_posts': period_num_posts,
            'total_posts': lifetime_num_posts,
        }
        collected.append(period_data)
        print(f'Subreddit: {subreddit}, period_data: {period_data}')

    return collected


In [79]:
# The big retrieve function - takes a list of subreddits, ensures that each has sufficient posts to be analyzed,
# Gets an evenly spaced distribution of Unix epoch times across the lifetime of the subreddit
# Retrieves a batch of posts from each of those periods
# Combines all and returns a DataFrame

def retrieve_posts(subreddits):
    """Retrieve a time-stratified sample of submissions for each subreddit.

    For every subreddit the lifetime is split into periods (via
    retrieve_period_metadata) and each period contributes a share of
    DESIRED_POSTS_PER_SUBREDDIT proportional to its post count. Each share is
    fetched in chunks of at most MAX_SUBS_PER_REQUEST rows.

    Parameters
    ----------
    subreddits : list of str
        Names of the subreddits to pull from.

    Returns
    -------
    pandas.DataFrame
        All retrieved submissions with an added 'period' column holding the
        period number they were sampled from. Empty when nothing came back.

    Raises
    ------
    AssertionError
        If ``subreddits`` is not a list.
    """
    # Make sure we got a list of subreddits
    assert isinstance(subreddits, list)

    # One frame per chunk, combined once at the end instead of re-copying
    # the growing result on every request
    chunk_frames = []

    for subreddit in subreddits:
        subreddit_total = 0

        for period_data in retrieve_period_metadata(subreddit):
            # Retrieve a proportional number of submissions per period
            period_num_to_retrieve = int(np.ceil(DESIRED_POSTS_PER_SUBREDDIT * period_data['num_posts'] / period_data['total_posts']))

            epoch_start = int(period_data['start_epoch'])
            next_epoch = int(period_data['next_epoch'])

            retrieve_params = copy.deepcopy(RETRIEVE_SUBS_BASE_PARAMS)
            retrieve_params['subreddit'] = subreddit

            # The 'after' cursor below only walks forward through the period if rows
            # arrive oldest-first, so ask for that explicitly unless the base params
            # already choose an order.
            # NOTE(review): PushShift is understood to return newest-first by default - confirm
            retrieve_params.setdefault('sort_type', 'created_utc')
            retrieve_params.setdefault('sort', 'asc')

            # let's start at the very beginning....
            retrieve_params['after'] = epoch_start

            # Keep every chunk inside its own period; the last period has no
            # upper bound (next_epoch is 0 there)
            if next_epoch > 0:
                retrieve_params['before'] = next_epoch

            period_total = 0
            period_num_left_to_retrieve = period_num_to_retrieve
            chunk_counter = 1

            # time to get chunkin'
            while period_num_left_to_retrieve > 0:
                chunk_size_to_retrieve = min(period_num_left_to_retrieve, MAX_SUBS_PER_REQUEST)
                retrieve_params['size'] = chunk_size_to_retrieve

                print(f'Subreddit: {subreddit}, retrieving {chunk_size_to_retrieve} rows for epoch beginning {epoch_start} - chunk {chunk_counter}')
                response = safe_request(PUSHSHIFT_BASE_URL, retrieve_params)
                data = response.json()['data']
                rowcount = len(data)
                period_total += rowcount
                subreddit_total += rowcount
                print(f'Got {rowcount} rows for subreddit {subreddit}, total so far is {period_total}/{period_num_to_retrieve} for period, {subreddit_total}/{DESIRED_POSTS_PER_SUBREDDIT} for subreddit')

                # Nothing left in this window - stop rather than deriving a cursor
                # from rows that belong to an earlier period or subreddit
                if rowcount == 0:
                    break

                period_df = pd.DataFrame(data)
                period_df['period'] = period_data['period']
                chunk_frames.append(period_df)

                # Count what actually arrived, so a short response is followed
                # by another request instead of being silently accepted
                period_num_left_to_retrieve -= rowcount

                # Next chunk starts just past the latest post we have from this one
                retrieve_params['after'] = max(int(post['created_utc']) for post in data) + 1

                chunk_counter += 1

    if not chunk_frames:
        return pd.DataFrame()

    return pd.concat(chunk_frames)

In [82]:
# Run the full retrieval for every subreddit of interest, then write the
# raw submissions out as CSV (without the DataFrame index)
df = retrieve_posts(subreddits)
df.to_csv(path_or_buf='../data/raw_posts.csv', index=False)

Subreddit: jazz, lifetime_num_posts: 20998
Subreddit: jazz, first_post_timestamp: 1221829383
Subreddit: jazz, latest_post_timestamp: 1665033449
Subreddit: jazz, period_data: {'period': 1, 'start_epoch': 1221829383, 'next_epoch': 1243989586, 'num_posts': 11, 'total_posts': 20998}
Subreddit: jazz, period_data: {'period': 2, 'start_epoch': 1243989586, 'next_epoch': 1266149789, 'num_posts': 20, 'total_posts': 20998}
Subreddit: jazz, period_data: {'period': 3, 'start_epoch': 1266149789, 'next_epoch': 1288309992, 'num_posts': 137, 'total_posts': 20998}
Subreddit: jazz, period_data: {'period': 4, 'start_epoch': 1288309992, 'next_epoch': 1310470196, 'num_posts': 64, 'total_posts': 20998}
Subreddit: jazz, period_data: {'period': 5, 'start_epoch': 1310470196, 'next_epoch': 1332630399, 'num_posts': 0, 'total_posts': 20998}
Subreddit: jazz, period_data: {'period': 6, 'start_epoch': 1332630399, 'next_epoch': 1354790602, 'num_posts': 0, 'total_posts': 20998}
Subreddit: jazz, period_data: {'period': 