## Imports

In [103]:
import pandas as pd
import datetime as dt
import time
import requests

## Set ups

In [None]:
url = "https://api.pushshift.io/reddit/search/submission?subreddit=boardgames"

In [None]:
res = requests.get(url)

In [None]:
res.status_code

In [None]:
assert res.status_code == 200

In [None]:
json_data = res.json()

In [27]:
def query_pushshift(subreddit, kind = 'submission', day_window = 1, n = 400):
    SUBFIELDS = ['title', 'selftext', 'subreddit', 'created_utc', 'author', 'num_comments', 'score', 'is_self']
    
    # establish base url and stem
    BASE_URL = f"https://api.pushshift.io/reddit/search/{kind}" # also known as the "API endpoint" 
    stem = f"{BASE_URL}?subreddit={subreddit}&is_video=false&size=500" # always pulling max of 500
    
    # instantiate empty list for temp storage
    posts = []
    
    # implement for loop with `time.sleep(2)`
    for i in range(1, n + 1):
        URL = "{}&after={}d".format(stem, day_window * i)
        print("Querying from: " + URL)
        response = requests.get(URL)
        assert response.status_code == 200
        mine = response.json()['data']
        df = pd.DataFrame.from_dict(mine)
        posts.append(df)
        time.sleep(2)
    
    # pd.concat storage list
    full = pd.concat(posts, sort=False)
    
    # if submission
    if kind == "submission":
        # select desired columns
        full = full[SUBFIELDS]
        # drop duplicates
        full.drop_duplicates(inplace = True)
        # select `is_self` == True
        full = full.loc[full['is_self'] == True]

    # create `timestamp` column
    full['timestamp'] = full["created_utc"].map(dt.date.fromtimestamp)
    
    print("Query Complete!")    
    return full 

## Collecting data from r/wallstreetbets

In [28]:
df_wsb = query_pushshift(subreddit = 'wallstreetbets')

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&is_video=false&size=500&after=1d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&is_video=false&size=500&after=2d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&is_video=false&size=500&after=3d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&is_video=false&size=500&after=4d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&is_video=false&size=500&after=5d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&is_video=false&size=500&after=6d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&is_video=false&size=500&after=7d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&is_video=false&size=500&after=8d
Querying from: h

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&is_video=false&size=500&after=68d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&is_video=false&size=500&after=69d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&is_video=false&size=500&after=70d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&is_video=false&size=500&after=71d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&is_video=false&size=500&after=72d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&is_video=false&size=500&after=73d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&is_video=false&size=500&after=74d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&is_video=false&size=500&after=75d
Querying

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&is_video=false&size=500&after=134d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&is_video=false&size=500&after=135d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&is_video=false&size=500&after=136d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&is_video=false&size=500&after=137d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&is_video=false&size=500&after=138d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&is_video=false&size=500&after=139d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&is_video=false&size=500&after=140d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&is_video=false&size=500&after=141d


Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&is_video=false&size=500&after=200d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&is_video=false&size=500&after=201d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&is_video=false&size=500&after=202d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&is_video=false&size=500&after=203d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&is_video=false&size=500&after=204d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&is_video=false&size=500&after=205d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&is_video=false&size=500&after=206d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&is_video=false&size=500&after=207d


Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&is_video=false&size=500&after=266d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&is_video=false&size=500&after=267d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&is_video=false&size=500&after=268d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&is_video=false&size=500&after=269d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&is_video=false&size=500&after=270d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&is_video=false&size=500&after=271d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&is_video=false&size=500&after=272d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&is_video=false&size=500&after=273d


Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&is_video=false&size=500&after=332d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&is_video=false&size=500&after=333d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&is_video=false&size=500&after=334d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&is_video=false&size=500&after=335d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&is_video=false&size=500&after=336d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&is_video=false&size=500&after=337d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&is_video=false&size=500&after=338d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&is_video=false&size=500&after=339d


Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&is_video=false&size=500&after=398d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&is_video=false&size=500&after=399d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&is_video=false&size=500&after=400d
Query Complete!


In [29]:
df_wsb.shape

(22201, 9)

In [57]:
df_wsb.drop(columns = ['is_self'], inplace=True)

In [69]:
df_wsb.isna().sum()

title             0
selftext        411
subreddit         0
created_utc       0
author            0
num_comments      0
score             0
timestamp         0
dtype: int64

In [70]:
df_wsb.dropna(inplace=True)

### Cleaning up data some

I dropped the null values because there were not very many in comparison to the number of total documents.

In [30]:
df_wsb.shape[0]*.6

13320.6

In [75]:
df_wsb = df_wsb[df_wsb['selftext'] != '[removed]']

### Cleaning up data some

About 40% or so of all 'selftext' values were ['removed']. I believe this is either a text that was removed or a reddit user that deleted the account. To make up for it I downloaded extra documents and eliminated the 'removed' values.

In [76]:
df_wsb.to_csv('./data/wsb.csv')

## Collecting data from r/investing

In [41]:
df_investing = query_pushshift(subreddit = 'investing', day_window = 1, n = 400)

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=investing&is_video=false&size=500&after=1d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=investing&is_video=false&size=500&after=2d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=investing&is_video=false&size=500&after=3d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=investing&is_video=false&size=500&after=4d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=investing&is_video=false&size=500&after=5d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=investing&is_video=false&size=500&after=6d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=investing&is_video=false&size=500&after=7d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=investing&is_video=false&size=500&after=8d
Querying from: https://api.pushshift.io/reddit/search/su

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=investing&is_video=false&size=500&after=70d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=investing&is_video=false&size=500&after=71d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=investing&is_video=false&size=500&after=72d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=investing&is_video=false&size=500&after=73d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=investing&is_video=false&size=500&after=74d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=investing&is_video=false&size=500&after=75d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=investing&is_video=false&size=500&after=76d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=investing&is_video=false&size=500&after=77d
Querying from: https://api.pushshift.io/reddit/s

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=investing&is_video=false&size=500&after=139d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=investing&is_video=false&size=500&after=140d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=investing&is_video=false&size=500&after=141d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=investing&is_video=false&size=500&after=142d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=investing&is_video=false&size=500&after=143d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=investing&is_video=false&size=500&after=144d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=investing&is_video=false&size=500&after=145d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=investing&is_video=false&size=500&after=146d
Querying from: https://api.pushshift.io/

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=investing&is_video=false&size=500&after=208d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=investing&is_video=false&size=500&after=209d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=investing&is_video=false&size=500&after=210d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=investing&is_video=false&size=500&after=211d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=investing&is_video=false&size=500&after=212d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=investing&is_video=false&size=500&after=213d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=investing&is_video=false&size=500&after=214d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=investing&is_video=false&size=500&after=215d
Querying from: https://api.pushshift.io/

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=investing&is_video=false&size=500&after=277d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=investing&is_video=false&size=500&after=278d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=investing&is_video=false&size=500&after=279d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=investing&is_video=false&size=500&after=280d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=investing&is_video=false&size=500&after=281d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=investing&is_video=false&size=500&after=282d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=investing&is_video=false&size=500&after=283d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=investing&is_video=false&size=500&after=284d
Querying from: https://api.pushshift.io/

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=investing&is_video=false&size=500&after=346d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=investing&is_video=false&size=500&after=347d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=investing&is_video=false&size=500&after=348d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=investing&is_video=false&size=500&after=349d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=investing&is_video=false&size=500&after=350d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=investing&is_video=false&size=500&after=351d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=investing&is_video=false&size=500&after=352d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=investing&is_video=false&size=500&after=353d
Querying from: https://api.pushshift.io/

In [42]:
df_investing.shape

(38931, 9)

In [51]:
df_investing.isna().sum()

title               0
selftext        10863
subreddit           0
created_utc         0
author              0
num_comments        0
score               0
is_self             0
timestamp           0
dtype: int64

In [58]:
df_investing.drop(columns = ['is_self'], inplace=True)

In [59]:
df_investing.isna().sum()

title             0
selftext        464
subreddit         0
created_utc       0
author            0
num_comments      0
score             0
timestamp         0
dtype: int64

In [60]:
df_investing.dropna(inplace=True)

### Cleaning up data nulls

I dropped the null values because there were not very many in comparison to the number of total documents.

In [67]:
df_investing = df_investing[df_investing['selftext'] != '[removed]']

### Cleaning up data with removed text

About 40% or so of all 'selftext' values were ['removed']. I believe this is either a text that was removed or a reddit user that deleted the account. To make up for it I downloaded extra documents and eliminated the 'removed' values.

In [68]:
df_investing.to_csv('./data/investing.csv')

In [87]:
df_wsb.shape[0]

12644

In [82]:
df_wsb.shape[0]/(df_investing.shape[0] + df_wsb.shape[0])

0.31057182157594815

In [83]:
df_investing['post_word_count'] = df_investing['selftext'].str.count(' ') + 1
df_wsb['post_word_count'] = df_wsb['selftext'].str.count(' ') + 1

In [85]:
df_wsb['post_word_count'].describe()

count    12644.000000
mean       114.833676
std        225.848288
min          1.000000
25%         29.000000
50%         57.000000
75%        119.000000
max       6748.000000
Name: post_word_count, dtype: float64

In [86]:
df_investing['post_word_count'].describe()

count    28068.000000
mean       117.369389
std        188.721193
min          1.000000
25%         43.000000
50%         73.000000
75%        128.000000
max       4894.000000
Name: post_word_count, dtype: float64

I am not sure how much strictly analyzing based on word count in a post will help in classification... The mean word counts are pretty much the same; however the wallstreetbets subreddit does have more variation in size.

### There are about double the number of documents in r/investing. After cleaning the data more and before any modelling, I will remove rows as necessary to make the ratio closer to 50-50 and save the combined data set as a csv.

In [95]:
df_investing_trunc = df_investing.iloc[0:12644]

In [96]:
df_combined = pd.concat([df_investing_trunc, df_wsb], axis=0)

In [100]:
df_combined.reset_index(inplace=True, drop=True)

In [105]:
df_combined.to_csv('./data/combined.csv')