# API Function # 

In [4]:
import datetime
import math
import os
import time

import pandas as pd
import requests

## Function that pulls down posts from a specific subreddit ## 
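- `subreddit`: name of the subreddit to pull posts from, without the `r/` prefix (e.g. `'baseball'`)
- `num_posts`: the total number of posts to pull down (they are requested in batches of up to 100)
- `date`: a `datetime` marking the starting point; only posts created before it are pulled, working backwards in time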

In [5]:
url = 'https://api.pushshift.io/reddit/search/submission'
#comment_url = 'https://api.pushshift.io/reddit/search/comment'
def get_posts(subreddit, num_posts, date):
    """Download up to ``num_posts`` submissions from one subreddit.

    Posts are requested from the search endpoint stored in ``url`` in
    batches of at most 100. The first batch asks for posts created before
    ``date``; every later batch asks for posts created before the
    ``created_utc`` of the last post received so far.
    NOTE(review): this paging assumes the API returns each batch newest
    first, so the last post of a batch is its oldest -- confirm.

    Parameters
    ----------
    subreddit : str
        Subreddit name without the ``r/`` prefix.
    num_posts : int
        Total number of posts wanted.
    date : datetime.datetime
        Starting point; its ``timestamp()`` is floored to whole seconds
        and sent as the ``before`` filter of the first request.

    Returns
    -------
    list
        The post records taken from the ``'data'`` field of each response,
        in the order received. The list can be shorter than ``num_posts``
        when the API runs out of posts or keeps answering with bodies that
        cannot be parsed as JSON.

    Raises
    ------
    ValueError
        If the body of the very first response is not valid JSON.
    KeyError
        If a parsed response has no ``'data'`` field.
    """
    max_batch = 100    # largest batch size requested per call
    max_failures = 3   # consecutive unreadable responses tolerated before giving up

    params = {
        'subreddit': subreddit,
        'size': min(num_posts, max_batch),
        'before': math.floor(date.timestamp()),
    }
    print('Fetching first batch of posts from r/'+subreddit)
    results = requests.get(url, params)
    posts = results.json()['data']

    failures = 0
    # An empty first batch leaves nothing to page from, so the loop is skipped.
    while posts and len(posts) < num_posts:
        time.sleep(1) # 1 second sleep seems to be sufficient to avoid 429 responses
        size = min(num_posts - len(posts), max_batch)

        # Continue from the last post already collected.
        params['before'] = posts[-1]['created_utc']
        params['size'] = size

        print('Fetching posts '+str(len(posts)+1)+'-'+str(len(posts)+size)+' from r/'+subreddit)
        results = requests.get(url, params)
        try:
            loop_posts = results.json()['data']
        except ValueError:
            # The body was not JSON. Show the response and retry the same
            # request after the next sleep instead of re-appending the
            # previous batch.
            print(results)
            failures += 1
            if failures >= max_failures:
                print('Giving up on r/'+subreddit+' after '+str(failures)+' unreadable responses')
                break
            continue
        failures = 0

        if not loop_posts:
            # Nothing older is available; without this check the loop
            # would never terminate.
            break
        posts.extend(loop_posts)
    print('Done fetching posts from r/'+subreddit)
    return posts

## Pull Down Posts: ##
- You can specify which subreddit to pull from, how many posts to pull, and the point in time to start from
- I chose 1,500 posts per subreddit, starting from the current time and working backwards

In [6]:
# Number of posts requested from every subreddit; change it here rather than in each call.
POSTS_PER_SUBREDDIT = 1500

# Each call passes its own "now" as the starting point for get_posts.
baseball_posts = get_posts('baseball', POSTS_PER_SUBREDDIT, datetime.datetime.now())
hockey_posts = get_posts('hockey', POSTS_PER_SUBREDDIT, datetime.datetime.now())
nba_posts = get_posts('nba', POSTS_PER_SUBREDDIT, datetime.datetime.now())
nfl_posts = get_posts('nfl', POSTS_PER_SUBREDDIT, datetime.datetime.now())
soccer_posts = get_posts('soccer', POSTS_PER_SUBREDDIT, datetime.datetime.now())

Fetching first batch of posts from r/baseball
Fetching posts 101-200 from r/baseball
Fetching posts 201-300 from r/baseball
Fetching posts 301-400 from r/baseball
Fetching posts 401-500 from r/baseball
Fetching posts 501-600 from r/baseball
Fetching posts 601-700 from r/baseball
Fetching posts 701-800 from r/baseball
Fetching posts 801-900 from r/baseball
Fetching posts 901-1000 from r/baseball
Fetching posts 1001-1100 from r/baseball
Fetching posts 1101-1200 from r/baseball
Fetching posts 1201-1300 from r/baseball
Fetching posts 1301-1400 from r/baseball
Fetching posts 1401-1500 from r/baseball
Done fetching posts from r/baseball
Fetching first batch of posts from r/hockey
Fetching posts 101-200 from r/hockey
Fetching posts 201-300 from r/hockey
Fetching posts 301-400 from r/hockey
Fetching posts 401-500 from r/hockey
Fetching posts 501-600 from r/hockey
Fetching posts 601-700 from r/hockey
Fetching posts 701-800 from r/hockey
Fetching posts 801-900 from r/hockey
Fetching posts 901-10

## Create and Export DataFrames: ##

In [7]:
# Turn each list of downloaded post records into its own DataFrame.
baseball_df = pd.DataFrame(data=baseball_posts)
hockey_df = pd.DataFrame(data=hockey_posts)
nba_df = pd.DataFrame(data=nba_posts)
nfl_df = pd.DataFrame(data=nfl_posts)
soccer_df = pd.DataFrame(data=soccer_posts)

In [8]:
# Create the output folder if it is missing; to_csv does not create directories.
os.makedirs('./data', exist_ok=True)

# Write one CSV per subreddit (the DataFrame index is written as the first column).
nba_df.to_csv('./data/nba.csv')
baseball_df.to_csv('./data/baseball.csv')
nfl_df.to_csv('./data/nfl.csv')
soccer_df.to_csv('./data/soccer.csv')
hockey_df.to_csv('./data/hockey.csv')