# Reddit API Data Scraping
---

In this notebook, I use Reddit's built-in `.json` API endpoints to scrape post data from four subreddits. I then export this data to .csv files for use in my analysis notebook.

My chosen subreddits are as follows:

- r/nba
- r/nfl
- r/cfb
- r/CollegeBasketball

I have taken mostly new posts from the subreddits, but I have also supplemented each dataset with the top 500 posts from the past year.

In [None]:
# import libraries
import re
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns

In [None]:
# mount Google Drive so the project CSVs under /content/drive can be read and written
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# lift pandas' display limits so every column and every row is shown
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [None]:
# load the previously scraped r/nba posts from Google Drive
nba_df = pd.read_csv('/content/drive/MyDrive/IR_Project/nba_subreddit_data.csv')

In [None]:
# check shape (rows, columns) of the loaded dataframe
print(nba_df.shape)

(3011, 114)


In [None]:
# listing URL for r/nba; the .json suffix asks Reddit for the listing as JSON
nba_url = 'https://www.reddit.com/r/nba.json'
# establish our header; it is sent with every request made in this notebook
header = {'User-agent': 'subreddit get requests'}

In [None]:
# initial get request to test API
res = requests.get(nba_url, headers=header)
# parse the response body as JSON so its structure can be inspected
nba_res = res.json()

In [None]:
# check request status (200 means the posts were downloaded successfully)
res.status_code

200

In [None]:
# list the fields available on the first post of the test response
nba_res['data']['children'][0]['data'].keys()

dict_keys(['approved_at_utc', 'subreddit', 'selftext', 'author_fullname', 'saved', 'mod_reason_title', 'gilded', 'clicked', 'title', 'link_flair_richtext', 'subreddit_name_prefixed', 'hidden', 'pwls', 'link_flair_css_class', 'downs', 'thumbnail_height', 'top_awarded_type', 'hide_score', 'name', 'quarantine', 'link_flair_text_color', 'upvote_ratio', 'author_flair_background_color', 'subreddit_type', 'ups', 'total_awards_received', 'media_embed', 'thumbnail_width', 'author_flair_template_id', 'is_original_content', 'user_reports', 'secure_media', 'is_reddit_media_domain', 'is_meta', 'category', 'secure_media_embed', 'link_flair_text', 'can_mod_post', 'score', 'approved_by', 'is_created_from_ads_ui', 'author_premium', 'thumbnail', 'edited', 'author_flair_css_class', 'author_flair_richtext', 'gildings', 'content_categories', 'is_self', 'mod_note', 'created', 'link_flair_type', 'wls', 'removed_by_category', 'banned_by', 'author_flair_type', 'domain', 'allow_live_comments', 'selftext_html', 

In [None]:
# define function to get up to num pages of posts from a subreddit, optionally resuming after a given post
def reddit_scraper(url, num, after=None, delay=1, headers=None):
    """Download up to ``num`` listing pages from a subreddit ``.json`` URL.

    Parameters
    ----------
    url : str
        Listing endpoint, e.g. ``'https://www.reddit.com/r/nba.json'``.
    num : int
        Maximum number of pages to request.
    after : str, optional
        ``after`` value to resume from; ``None`` starts at the first page.
    delay : float, optional
        Seconds to wait between consecutive requests (default 1).
    headers : dict, optional
        Headers sent with each request; defaults to the notebook-level
        ``header`` dictionary.

    Returns
    -------
    pandas.DataFrame
        One row per downloaded post, built from each child's ``'data'``
        dictionary. Empty if nothing was downloaded.

    Notes
    -----
    Stops early and prints the status code on the first non-200 response.
    Prints the final ``after`` value so a later call can resume from it.
    """
    if headers is None:
        headers = header

    posts = []
    for page in range(num):
        # only send 'after' once we have one; no params requests the first page
        params = {} if after is None else {'after': after}
        res = requests.get(url, params=params, headers=headers)

        # anything other than 200 means the page was not downloaded
        if res.status_code != 200:
            print(res.status_code)
            break

        listing = res.json()['data']
        posts.extend(listing['children'])
        after = listing['after']

        # a null 'after' means there is no next page; requesting again without
        # it would start over at the first page and collect duplicate posts
        if after is None:
            break

        # pause between requests, but not after the final one
        if page < num - 1:
            time.sleep(delay)

    # create a new dataframe with the 'data' from each post
    new_df = pd.DataFrame([post['data'] for post in posts])

    # print final value of after
    print(f'Final value of after parameter: {after}')

    return new_df

## Data from r/nba
---

In [None]:
# scrape up to 10 pages from r/nba, starting at the first page (no `after` given)
new_nba_df = reddit_scraper(nba_url, 10)

Final value of after parameter: t3_r7q16a


In [None]:
# check shape (rows, columns) of the newly scraped dataframe
new_nba_df.shape

(252, 112)

In [None]:
# stack the existing posts and the newly scraped posts row-wise;
# sort=True orders the combined columns alphabetically
# NOTE(review): this rebinds new_nba_df, so re-running the cell appends nba_df again
new_nba_df = pd.concat([nba_df, new_nba_df], axis=0, sort=True)

In [None]:
# confirm concatenation: row count should be existing rows plus newly scraped rows
new_nba_df.shape

(3011, 114)

In [None]:
# reset index so row labels are unique again after concatenation (old labels are dropped)
new_nba_df.reset_index(drop=True, inplace=True)

In [None]:
# count distinct post IDs ('name'); a value below the row count means duplicate posts are present
new_nba_df['name'].nunique()

2314

In [None]:
# export CSV of original and new data
# write the backup of the previously loaded data first, so a failure while
# overwriting the main file cannot leave the project without a good copy
nba_df.to_csv("/content/drive/MyDrive/IR_Project/nba_subreddit_data - backup.csv", index=False)
new_nba_df.to_csv("/content/drive/MyDrive/IR_Project/nba_subreddit_data.csv", index=False)

In [None]:
# reload the combined dataset from the main CSV on Google Drive
nba_df = pd.read_csv('/content/drive/MyDrive/IR_Project/nba_subreddit_data.csv')

In [None]:
def combine_text(df, cols):
    """Replace missing values with empty strings in the selected columns.

    Despite its name, this does not concatenate any text. ``df`` is modified
    in place and also returned. Each entry of ``cols`` is used as a column
    selector for ``df``.
    """
    for selector in cols:
        filled = df[selector].fillna(value="")
        df[selector] = filled
    return df

In [None]:
# blank out missing titles and post bodies so the regex cleaning only sees strings
text_columns = ['title', 'selftext']
nba_df = combine_text(nba_df, text_columns)

In [None]:
# keep only the text columns and the post ID ('name'), then drop duplicate
# posts based on that ID; .copy() makes the result an independent frame so
# later in-place column assignments do not raise SettingWithCopyWarning
nba_df = (
    nba_df[['title', 'selftext', 'name']]
    .drop_duplicates(subset=['name'])
    .copy()
)

# define function to run same regex over a dataframe

# urls (scheme optional)
# NOTE(review): the '&amp;' inside the classes also admits ';' — presumably for
# HTML-escaped ampersands in the scraped text; confirm before simplifying
_URL_RE = re.compile(r"((http|ftp|https):\/\/)?[\w\-_]+(\.[\w\-_]+)+([\w\-\.,@?^=%&amp;:/~\+#]*[\w\-\@?^=%&amp;/~\+#])?")

# everything except letters, whitespace, '^' and '$'; underscores and the
# digits 0-9 count as word characters, so they are listed explicitly
# NOTE(review): confirm that dropping digits and keeping '^' is intended
_NON_TEXT_RE = re.compile(r"[^\w\s^$]|[_0-9]")

# common words, optionally pluralised; \b keeps the match to whole words so
# letters inside other words (e.g. the "s" in "yesterday") are left alone
_COMMON_WORDS_RE = re.compile(
    r"\b(?:nfl|nba|college|ncaa|team|game|season|year|player|thread|just|like|time|new|s)s?\b",
    flags=re.I,
)

# line breaks, tabs and runs of spaces
_WHITESPACE_RE = re.compile(r"\s+")


def _clean_selftext(text):
    """Return one post body with urls, punctuation and common words removed."""
    text = _URL_RE.sub(' ', text)
    text = _NON_TEXT_RE.sub(' ', text)
    text = _COMMON_WORDS_RE.sub(' ', text)
    # collapse whitespace last so the gaps left by the steps above are merged
    return _WHITESPACE_RE.sub(' ', text).strip()


def sub_preprocess(sub):
    """Clean the ``'selftext'`` column of ``sub`` in place.

    Each post body has urls removed, every character other than letters,
    whitespace, ``^`` and ``$`` replaced by a space, the listed common words
    (with optional plural "s", whole words only, case-insensitive) removed,
    and whitespace collapsed to single spaces with the ends trimmed.
    Missing values are treated as empty strings.

    Parameters
    ----------
    sub : pandas.DataFrame
        Frame with a ``'selftext'`` column of strings (or missing values).

    Returns
    -------
    None
        ``sub`` is modified in place.
    """
    sub['selftext'] = sub['selftext'].fillna('').map(_clean_selftext)

# clean the 'selftext' column of nba_df in place
sub_preprocess(nba_df)

In [None]:
# preview the first rows after cleaning
nba_df.head()

Unnamed: 0,title,selftext,name
0,Game Threads Index + Daily Discussion (July 04...,Today Tip off Away Home GDT PGT Ye terday ...,t3_c93mdd
1,2019 NBA Free Agent Tracker,After hock Day Contract ource Old Kawhi Leon...,t3_c6ffge
2,[Haynes] Free agent guard Quinn Cook has reach...,,t3_c9trsl
3,[Wojnarowski] Presti pursued a package of Russ...,,t3_c9tebd
4,"(Shelburne in ESPN piece) Still, Leonard's rec...",Another intere ting tidbit Then Durant got a ...,t3_c9tph1


In [None]:
# save the cleaned title / selftext / name columns to a separate CSV
nba_df.to_csv("/content/drive/MyDrive/IR_Project/nba_subreddit_data_prep.csv", index=False)