In [37]:
import requests
import pandas as pd
import numpy as np

from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import re
from bs4 import BeautifulSoup
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import time

In [38]:
url = 'https://api.pushshift.io/reddit/search/submission'

In [39]:
# Query-string parameters for a single Pushshift submission search request.
# NOTE(review): the response displayed in this notebook contained 100 posts even
# though 500 are requested here — presumably the API caps the page size; confirm.
params = {
    'subreddit':'wallstreetbets',  # subreddit to search
    'size' : 500,  # number of posts requested
    'before':  1616724935,    # epoch-seconds cutoff — presumably only older posts are returned; confirm
}

In [40]:
res = requests.get(url, params)

In [41]:
res.status_code

200

In [42]:
data = res.json()

In [43]:
posts = data['data']

In [44]:
len(posts)

100

In [45]:
df = pd.DataFrame(posts)

In [46]:
df.columns

Index(['all_awardings', 'allow_live_comments', 'author',
       'author_flair_css_class', 'author_flair_richtext', 'author_flair_text',
       'author_flair_type', 'author_fullname', 'author_patreon_flair',
       'author_premium', 'awarders', 'can_mod_post', 'contest_mode',
       'created_utc', 'domain', 'full_link', 'gildings', 'id',
       'is_crosspostable', 'is_meta', 'is_original_content',
       'is_reddit_media_domain', 'is_robot_indexable', 'is_self', 'is_video',
       'link_flair_background_color', 'link_flair_css_class',
       'link_flair_richtext', 'link_flair_template_id', 'link_flair_text',
       'link_flair_text_color', 'link_flair_type', 'locked', 'media_only',
       'no_follow', 'num_comments', 'num_crossposts', 'over_18',
       'parent_whitelist_status', 'permalink', 'pinned', 'post_hint',
       'preview', 'pwls', 'retrieved_on', 'score', 'selftext', 'send_replies',
       'spoiler', 'stickied', 'subreddit', 'subreddit_id',
       'subreddit_subscribers', 'subr

In [47]:
df.head(1)

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_patreon_flair,author_premium,...,media,media_embed,secure_media,secure_media_embed,author_flair_background_color,author_flair_text_color,removed_by_category,media_metadata,gallery_data,is_gallery
0,[],False,parkjihyun619,,[],,text,t2_9yt3qwhs,False,False,...,,,,,,,,,,


In [48]:
# Creation timestamps of the five rows with the smallest 'retrieved_on' value.
df.sort_values('retrieved_on')[['created_utc']].head()


Unnamed: 0,created_utc
99,1615999835
98,1615999842
97,1615999853
96,1615999856
95,1615999858


In [49]:
df[['subreddit','selftext','title','created_utc']].head()

Unnamed: 0,subreddit,selftext,title,created_utc
0,wallstreetbets,,"added 10 shares of GME, let’s go!!! 🚀🚀🚀",1616001381
1,wallstreetbets,,"AMC tweet - keep your friends close, but your ...",1616001376
2,wallstreetbets,,Dfv tweet - if we wanna make money we need to ...,1616001366
3,wallstreetbets,,Rocket Boosters Initiated 🔥🔥🚀🚀,1616001350
4,wallstreetbets,,Did I stutter?,1616001346


In [50]:
# from Breakfast Hour with help from Erik
def get_posts(subreddit, n_iter, epoch_right_now, size=500):
    """Page backwards through a subreddit's submissions via the Pushshift API.

    Issues up to ``n_iter`` GET requests. Each request asks for submissions
    created before a cursor timestamp; after every page the cursor is moved to
    the smallest ``created_utc`` of that page, so the next request continues
    further back in time.

    Parameters
    ----------
    subreddit : str
        Name of the subreddit to query.
    n_iter : int
        Maximum number of requests to make.
    epoch_right_now : int
        Epoch timestamp used as the ``before`` cursor of the first request.
    size : int, default 500
        Value sent as the ``size`` request parameter (posts requested per page).

    Returns
    -------
    pandas.DataFrame
        All retrieved pages concatenated row-wise, restricted to the columns
        title, author, selftext, subreddit, score, created_utc and id. Each
        page keeps its own row labels, so the index is not unique across
        pages. An empty frame with those columns is returned when no posts
        were retrieved.

    Raises
    ------
    requests.HTTPError
        If a request returns an error status code.
    """
    # Bare endpoint only: the whole query string is built from `params`, so the
    # subreddit is not sent twice (once empty, once filled).
    base_url = 'https://api.pushshift.io/reddit/search/submission/'
    keep_cols = ['title', 'author', 'selftext', 'subreddit',
                 'score', 'created_utc', 'id']

    df_list = []
    # Cursor that walks backwards in time, one page per request.
    current_time = epoch_right_now

    for i in range(n_iter):
        res = requests.get(
            base_url,
            params={
                'subreddit': subreddit,
                'size': size,
                # NOTE(review): kept from the original request; confirm the API honours it.
                'lang': True,
                'before': current_time,
            },
        )
        # Fail loudly on an error response instead of on a confusing KeyError below.
        res.raise_for_status()

        df = pd.DataFrame(res.json()['data'])
        if df.empty:
            # Nothing older is available; further requests would repeat this one.
            break

        # reindex (rather than .loc) tolerates a page lacking one of the columns:
        # the column is still present, filled with NaN.
        df = df.reindex(columns=keep_cols)
        df_list.append(df)

        oldest = df['created_utc'].min()
        if pd.isna(oldest):
            # Without a timestamp the cursor cannot be advanced.
            break
        current_time = int(oldest)

        # Pause 1-2 seconds between requests so the API is not hammered;
        # no pause is needed after the final request.
        if i < n_iter - 1:
            time.sleep(np.random.randint(1, 3))

    if not df_list:
        # pd.concat raises on an empty list; return a well-formed empty frame.
        return pd.DataFrame(columns=keep_cols)
    return pd.concat(df_list, axis=0)

In [51]:
# Make 20 paged requests for r/wallstreetbets submissions, starting from epoch 1616724341.
wsb1 = get_posts('wallstreetbets', 20, 1616724341)

In [52]:
wsb1.head()

Unnamed: 0,title,author,selftext,subreddit,score,created_utc,id
0,"added 10 shares of GME, let’s go!!! 🚀🚀🚀",parkjihyun619,,wallstreetbets,1,1616001381,m75e9z
1,"AMC tweet - keep your friends close, but your ...",TurdsforNipples,,wallstreetbets,1,1616001376,m75e6z
2,Dfv tweet - if we wanna make money we need to ...,Xientziunjo,,wallstreetbets,1,1616001366,m75e1r
3,Rocket Boosters Initiated 🔥🔥🚀🚀,Aaronlvlia,,wallstreetbets,0,1616001350,m75dtv
4,Did I stutter?,chiefoogabooga,,wallstreetbets,1,1616001346,m75drr


In [53]:
wsb1.shape

(2000, 7)

In [54]:
# Drop every row that has a missing value in any column, then summarise the result.
wsb1 = wsb1.dropna() 
wsb1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2000 entries, 0 to 99
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        2000 non-null   object
 1   author       2000 non-null   object
 2   selftext     2000 non-null   object
 3   subreddit    2000 non-null   object
 4   score        2000 non-null   int64 
 5   created_utc  2000 non-null   int64 
 6   id           2000 non-null   object
dtypes: int64(2), object(5)
memory usage: 125.0+ KB


In [55]:
# Clean up noise in the post body: replace every URL and every newline with a space.
# The URL pattern is a raw string, so "\S" reaches the regex engine intact rather
# than being treated as an (invalid) Python string escape.
wsb1['selftext'] = (
    wsb1['selftext']
    .str.replace(r'https?://\S*', ' ', regex=True)
    .str.replace('\n', ' ', regex=False)
)

In [56]:
wsb1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2000 entries, 0 to 99
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        2000 non-null   object
 1   author       2000 non-null   object
 2   selftext     2000 non-null   object
 3   subreddit    2000 non-null   object
 4   score        2000 non-null   int64 
 5   created_utc  2000 non-null   int64 
 6   id           2000 non-null   object
dtypes: int64(2), object(5)
memory usage: 125.0+ KB


In [57]:
wsb1.sort_values(by = ['selftext'])

Unnamed: 0,title,author,selftext,subreddit,score,created_utc,id
0,"added 10 shares of GME, let’s go!!! 🚀🚀🚀",parkjihyun619,,wallstreetbets,1,1616001381,m75e9z
50,I'm waiting for the gain when the market is in...,TrumXReddit,,wallstreetbets,1,1615983616,m6z5ht
48,"I got lost at ""Good Robinhood Trader""",thenribrat,,wallstreetbets,1,1615983633,m6z5n6
45,This old fart is part of the reason why we sho...,Omegaque713,,wallstreetbets,1,1615983701,m6z6c9
44,5 Irish Stocks To Buy For St. Patrick's Day,PhilHallUSA,,wallstreetbets,1,1615983716,m6z6gz
...,...,...,...,...,...,...,...
28,WSB Rules - Please Read Before Posting (Smooth...,OPINION_IS_UNPOPULAR,post gooder please remove tinfoil hat make t...,wallstreetbets,1,1615999520,m74o97
56,next target ISR,Awkward_Dig4434,so our next target is - ISR ... true ?,wallstreetbets,1,1615976615,m6xbce
55,next target - ISR,Awkward_Dig4434,so our next target is ISR ? its true ?,wallstreetbets,1,1615976672,m6xbvw
81,decrypting dfv's latest tweet,nin0miku,"🔍Latest DFV tweet decoded 🔎 So, [DFV tweeded]...",wallstreetbets,1,1615969630,m6vo63


In [58]:
wsb1.loc[wsb1['selftext']=='']

Unnamed: 0,title,author,selftext,subreddit,score,created_utc,id
0,"added 10 shares of GME, let’s go!!! 🚀🚀🚀",parkjihyun619,,wallstreetbets,1,1616001381,m75e9z
1,"AMC tweet - keep your friends close, but your ...",TurdsforNipples,,wallstreetbets,1,1616001376,m75e6z
2,Dfv tweet - if we wanna make money we need to ...,Xientziunjo,,wallstreetbets,1,1616001366,m75e1r
3,Rocket Boosters Initiated 🔥🔥🚀🚀,Aaronlvlia,,wallstreetbets,0,1616001350,m75dtv
4,Did I stutter?,chiefoogabooga,,wallstreetbets,1,1616001346,m75drr
...,...,...,...,...,...,...,...
92,I've been out of bubble gum for a while now. T...,SeSuSo,,wallstreetbets,1,1615949598,m6qfq9
93,Funko NFT,rockettilt,,wallstreetbets,1,1615949597,m6qfq0
94,Which one of you smooth-brains did this?? Spot...,sd_aero,,wallstreetbets,1,1615949592,m6qfnw
97,Link in first comment,ElonMuks-earth2,,wallstreetbets,1,1615949586,m6qfjf


In [59]:
wsb1.loc[wsb1['selftext']=='[removed]'].count()

title          424
author         424
selftext       424
subreddit      424
score          424
created_utc    424
id             424
dtype: int64

In [60]:
# Mark empty bodies and the '[removed]' placeholder as missing so that a
# following dropna() discards those rows.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on the wsb1['selftext'] selection is not guaranteed to modify wsb1 itself
# (it does not under pandas Copy-on-Write).
wsb1['selftext'] = wsb1['selftext'].replace(['', '[removed]'], np.nan)

In [61]:
# Drop the rows whose selftext was set to NaN in the previous cell, then summarise.
wsb1 = wsb1.dropna()
wsb1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 410 entries, 15 to 95
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        410 non-null    object
 1   author       410 non-null    object
 2   selftext     410 non-null    object
 3   subreddit    410 non-null    object
 4   score        410 non-null    int64 
 5   created_utc  410 non-null    int64 
 6   id           410 non-null    object
dtypes: int64(2), object(5)
memory usage: 25.6+ KB


In [62]:
wsb1.shape

(410, 7)

In [63]:
# will use as cleaned main data source
# Writes the frame, including its index, to wsb_scrape.csv in the working directory.
# NOTE(review): the index here still holds the per-request row labels from get_posts
# (not unique across requests); consider index=False if no reader depends on them.
wsb1.to_csv('wsb_scrape.csv')