In [1]:
pwd!

'/Users/ch/Desktop/GA/Projects/project_6_capstone'

# IMPORTS

In [26]:
import pandas as pd
import datetime as dt
import time
import requests
pd.set_option('display.max_columns', None) # Displays all columns
import numpy as np

import os

In [23]:
%%time
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import re # Python's package for RegEx (Regular Expressions)
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import text

CPU times: user 701 ms, sys: 293 ms, total: 994 ms
Wall time: 1.78 s


In [3]:
start=time.time()
time.sleep(1)
current=time.time()
current - start

1.0020661354064941

# FUNCTIONS

In [178]:
def query_pushshift_submissions(subreddit, kind, day_window, n):
    
    SUBFIELDS = ['title', 'selftext', 'subreddit', 'created_utc', 'author', 'num_comments', 'score', 'is_self']
    
    # establish base url and stem
    # could also use request.getparams
    stem = f"https://api.pushshift.io/reddit/search/submission?subreddit={subreddit}&size=100" # also known as the "API endpoint" 
    #stem = f"{BASE_URL}?subreddit={subreddit}&size=100" # always pulling max of 100
    
    # instantiate empty list for temp storage
    posts = []
    
    # implement for loop with `time.sleep(1)`
    for i in range(1, n + 1):
        URL = "{}&after={}d".format(stem, day_window * i)
        print("Querying from: " + URL)
        response = requests.get(URL)
        assert response.status_code == 200
        mine = response.json()['data']
        df = pd.DataFrame.from_dict(mine)
        posts.append(df)
        #print(df.shape)
        time.sleep(1.5)
    # pd.concat storage list
    full = pd.concat(posts, sort=False)
    
    # if submission
    if kind == "submission":
        # select desired columns
        full = full[SUBFIELDS]
        # drop duplicates
        full.drop_duplicates(inplace = True)
        # select `is_self` == True
        full = full.loc[full['is_self'] == True]
        
    # create `timestamp` column
    full['timestamp'] = full["created_utc"].map(dt.date.fromtimestamp)
    
    print("Query Complete!")    
    return full.reset_index(drop=True)

In [27]:
def substituteincolumn(data, column): # Handles strange observations
        data[column].replace('[deleted]', np.nan, inplace = True)
        data[column].replace('[removed]', np.nan, inplace = True)

In [18]:
def clean_strings(input_list, stopwords = []):
    import re # we'll use regex to strip urls
    output_list = [] # create output list
    stopwords = [word.lower() for word in stopwords] # ensure case insensitivity for stopwords
    for sentence in input_list:
        sentence = re.sub(r'^https?:\/\/.*[\r\n]*', '', sentence, flags=re.MULTILINE) # remove URLS per stackoverflow
        sentence = sentence.replace('https', '')
        sentence = sentence.replace('\n',' ') # replace \n and \t with spaces as they represent breaks between words
        sentence = sentence.replace('\t',' ')
        sentence = ''.join([letter for letter in sentence if letter.isalpha() or letter == ' ']) #remove numbers and punctuation
        sentence = ' '.join([word.lower() for word in sentence.split() if word.lower() not in stopwords]) #coerce to lowercase while removing stopwords
        output_list.append(sentence) # add to the output list
    return output_list

In [24]:
custom = ['just','like','com','do','https','really']
combined_words = text.ENGLISH_STOP_WORDS.union(custom)

# MINING FROM:
- https://www.reddit.com/r/investing/ Created Mar 15, 2008
- https://www.reddit.com/r/GME/ Created May 30, 2012
- https://www.reddit.com/r/wallstreetbets/ Created Jan 31, 2012
- https://www.reddit.com/r/SecurityAnalysis/ Created Dec 8, 2010

Shouldn't mine before 2012

# BEGIN MINING

# Investing Submissions

# ```https://www.reddit.com/r/investing/```

In [None]:
# investinggamestopsubmissions
investingsubsmissions=query_pushshift_submissions(subreddit='investing', kind='submission', day_window=1, n=3000)

### Check quality of data before exporting to csv

In [15]:
investingsubsmissions.shape

(186768, 9)

In [16]:
investingsubsmissions.isnull().sum()

title              0
selftext        1195
subreddit          0
created_utc        0
author             0
num_comments       0
score              0
is_self            0
timestamp          0
dtype: int64

In [28]:
substituteincolumn(investingsubsmissions, 'selftext')
substituteincolumn(investingsubsmissions, 'author')

In [29]:
investingsubsmissions.isnull().sum()

title               0
selftext        31880
subreddit           0
created_utc         0
author          17067
num_comments        0
score               0
is_self             0
timestamp           0
dtype: int64

In [30]:
investingsubsmissions.dropna(inplace=True)

In [31]:
investingsubsmissions.isnull().sum()

title           0
selftext        0
subreddit       0
created_utc     0
author          0
num_comments    0
score           0
is_self         0
timestamp       0
dtype: int64

In [32]:
investingsubsmissions.shape

(146053, 9)

In [34]:
investingsubsmissions.tail(1)

Unnamed: 0,title,selftext,subreddit,created_utc,author,num_comments,score,is_self,timestamp
186767,ULIP,ULIP Plans – Get best ULIP plans in India. Com...,investing,1356679375,policybazaar9,0,0,True,2012-12-27


In [35]:
# Create new column called clean text with clean_strings function applied to selftext
investingsubsmissions['cleantext'] = clean_strings(investingsubsmissions['selftext'], stopwords=combined_words)

In [36]:
# Add column with length of selftext
investingsubsmissions['selftextwordcount'] = investingsubsmissions['selftext'].str.split().str.len()

In [41]:
# Save data set to csv in data folder
investingsubsmissions.to_csv('./data/investingsubmissions.csv', index = False)

# GME Submissions

# ```https://www.reddit.com/r/GME/```

In [77]:
%%time
# investinggamestopsubmissions
GMEsubmissions=query_pushshift_submissions(subreddit='GME', kind='submission', day_window=2, n=1500)
# should take 35 minutes
# took 1.75 hours
# day window 2 / n = 1500 iterations took 1.5 hours / each iteration took 6 seconds


Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=4d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=6d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=8d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=10d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=12d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=14d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=16d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=18d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=20d
Querying from: https://a

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=170d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=172d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=174d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=176d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=178d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=180d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=182d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=184d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=186d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=188d
Querying f

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=336d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=338d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=340d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=342d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=344d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=346d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=348d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=350d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=352d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=354d
Querying f

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=502d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=504d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=506d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=508d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=510d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=512d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=514d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=516d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=518d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=520d
Querying f

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=668d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=670d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=672d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=674d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=676d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=678d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=680d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=682d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=684d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=686d
Querying f

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=834d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=836d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=838d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=840d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=842d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=844d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=846d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=848d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=850d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=852d
Querying f

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1000d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1002d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1004d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1006d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1008d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1010d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1012d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1014d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1016d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1018d


Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1164d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1166d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1168d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1170d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1172d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1174d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1176d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1178d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1180d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1182d


Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1328d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1330d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1332d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1334d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1336d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1338d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1340d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1342d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1344d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1346d


Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1492d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1494d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1496d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1498d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1500d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1502d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1504d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1506d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1508d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1510d


Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1656d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1658d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1660d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1662d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1664d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1666d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1668d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1670d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1672d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1674d


Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1820d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1822d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1824d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1826d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1828d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1830d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1832d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1834d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1836d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1838d


Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1984d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1986d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1988d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1990d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1992d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1994d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1996d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=1998d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2000d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2002d


Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2148d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2150d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2152d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2154d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2156d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2158d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2160d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2162d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2164d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2166d


Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2312d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2314d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2316d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2318d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2320d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2322d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2324d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2326d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2328d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2330d


Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2476d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2478d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2480d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2482d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2484d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2486d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2488d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2490d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2492d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2494d


Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2640d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2642d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2644d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2646d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2648d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2650d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2652d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2654d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2656d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2658d


Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2804d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2806d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2808d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2810d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2812d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2814d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2816d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2818d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2820d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2822d


Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2968d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2970d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2972d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2974d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2976d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2978d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2980d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2982d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2984d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=GME&size=100&after=2986d


In [78]:
GMEsubmissions.shape

(1659, 9)

In [79]:
# Save data set to csv in data folder
GMEsubmissions.to_csv('./data/GMEsubmissions.csv', index = False)

### Check quality of data before exporting to csv

In [81]:
GMEsubmissions = pd.read_csv('./Data/GMEsubmissions.csv')

In [82]:
GMEsubmissions.shape

(1659, 9)

In [83]:
GMEsubmissions.isnull().sum()

title             0
selftext        227
subreddit         0
created_utc       0
author            0
num_comments      0
score             0
is_self           0
timestamp         0
dtype: int64

In [84]:
substituteincolumn(GMEsubmissions, 'selftext')
substituteincolumn(GMEsubmissions, 'author')

In [85]:
GMEsubmissions.isnull().sum()

title             0
selftext        527
subreddit         0
created_utc       0
author           68
num_comments      0
score             0
is_self           0
timestamp         0
dtype: int64

In [86]:
GMEsubmissions.dropna(inplace=True)

In [87]:
# Create new column called clean text with clean_strings function applied to selftext
GMEsubmissions['cleantext'] = clean_strings(GMEsubmissions['selftext'], stopwords=combined_words)

In [88]:
# Add column with length of selftext
GMEsubmissions['selftextwordcount'] = GMEsubmissions['selftext'].str.split().str.len()

In [89]:
GMEsubmissions.isnull().sum()

title                0
selftext             0
subreddit            0
created_utc          0
author               0
num_comments         0
score                0
is_self              0
timestamp            0
cleantext            0
selftextwordcount    0
dtype: int64

In [92]:
# Save data set to csv in data folder
GMEsubmissions.to_csv('./data/GMEsubmissions.csv', index = False)

In [93]:
GMEsubmissions.shape

(1130, 11)

# WallStreetBets Submissions

# ```https://www.reddit.com/r/wallstreetbets/```

In [106]:
# investinggamestopsubmissions
wsbsubmissions=query_pushshift_submissions(subreddit='wallstreetbets', kind='submission', day_window=3, n=800)

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=3d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=6d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=9d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=12d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=15d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=18d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=21d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=24d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=27d
Querying from: https:/

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=228d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=231d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=234d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=237d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=240d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=243d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=246d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=249d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=252d
Querying f

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=453d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=456d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=459d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=462d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=465d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=468d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=471d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=474d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=477d
Querying f

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=678d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=681d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=684d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=687d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=690d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=693d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=696d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=699d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=702d
Querying f

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=903d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=906d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=909d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=912d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=915d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=918d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=921d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=924d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=927d
Querying f

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=1128d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=1131d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=1134d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=1137d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=1140d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=1143d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=1146d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=1149d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=1152d
Q

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=1350d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=1353d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=1356d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=1359d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=1362d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=1365d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=1368d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=1371d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=1374d
Q

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=1572d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=1575d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=1578d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=1581d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=1584d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=1587d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=1590d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=1593d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=1596d
Q

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=1794d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=1797d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=1800d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=1803d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=1806d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=1809d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=1812d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=1815d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=1818d
Q

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=2016d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=2019d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=2022d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=2025d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=2028d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=2031d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=2034d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=2037d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=2040d
Q

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=2238d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=2241d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=2244d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=2247d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=2250d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=2253d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=2256d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=2259d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=wallstreetbets&size=100&after=2262d
Q

### Check quality of data before exporting to csv

In [107]:
wsbsubmissions.shape

(43125, 9)

In [108]:
wsbsubmissions.isnull().sum()

title             0
selftext        367
subreddit         0
created_utc       0
author            0
num_comments      0
score             0
is_self           0
timestamp         0
dtype: int64

In [113]:
wsbsubmissions.isnull().sum()

title           0
selftext        0
subreddit       0
created_utc     0
author          0
num_comments    0
score           0
is_self         0
timestamp       0
dtype: int64

In [115]:
wsbsubmissions.shape

(42758, 9)

In [116]:
substituteincolumn(wsbsubmissions, 'selftext')
substituteincolumn(wsbsubmissions, 'author')

In [117]:
wsbsubmissions.isnull().sum()

title               0
selftext        13116
subreddit           0
created_utc         0
author           4297
num_comments        0
score               0
is_self             0
timestamp           0
dtype: int64

In [123]:
wsbsubmissions=wsbsubmissions.dropna(axis=0,subset=['selftext','author'])

In [124]:
wsbsubmissions.isnull().sum()

title           0
selftext        0
subreddit       0
created_utc     0
author          0
num_comments    0
score           0
is_self         0
timestamp       0
dtype: int64

In [125]:
wsbsubmissions.shape

(28814, 9)

In [126]:
wsbsubmissions.isnull().sum()

title           0
selftext        0
subreddit       0
created_utc     0
author          0
num_comments    0
score           0
is_self         0
timestamp       0
dtype: int64

In [127]:
# Create new column called clean text with clean_strings function applied to selftext
wsbsubmissions['cleantext'] = clean_strings(wsbsubmissions['selftext'], stopwords=combined_words)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wsbsubmissions['cleantext'] = clean_strings(wsbsubmissions['selftext'], stopwords=combined_words)


In [128]:
# Add column with length of selftext
wsbsubmissions['selftextwordcount'] = wsbsubmissions['selftext'].str.split().str.len()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wsbsubmissions['selftextwordcount'] = wsbsubmissions['selftext'].str.split().str.len()


In [411]:
wsbsubmissions.head(1)

Unnamed: 0,title,selftext,subreddit,author,num_comments,score,timestamp,cleantext,selftextwordcount
1,Gme📈📈📈📈📈📈📈📈📈📈,,wallstreetbets,Greedy-Obligation-36,2,1,2021-03-12,,0


In [None]:
# Consider Dropping Selftext before exporting to avoid issues

In [262]:
wsbsubmissions.isnull().sum()

title                0
selftext             0
subreddit            0
created_utc          0
author               0
num_comments         0
score                0
is_self              0
timestamp            0
cleantext            0
selftextwordcount    0
dtype: int64

In [133]:
# Save data set to csv in data folder
wsbsubmissions.to_csv('./data/wsbsubmissions.csv', index = False)

# Security Analysis Submissions

# ```https://www.reddit.com/r/SecurityAnalysis/```

In [179]:
%%time
securityanalysissubmissions = query_pushshift_submissions(subreddit='SecurityAnalysis', kind='submission', day_window=4, n=700)

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=4d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=8d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=12d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=16d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=20d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=24d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=28d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=32d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=36d
Que

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=300d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=304d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=308d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=312d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=316d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=320d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=324d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=328d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&aft

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=596d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=600d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=604d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=608d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=612d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=616d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=620d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=624d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&aft

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=892d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=896d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=900d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=904d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=908d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=912d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=916d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=920d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&aft

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=1184d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=1188d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=1192d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=1196d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=1200d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=1204d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=1208d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=1212d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=1476d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=1480d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=1484d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=1488d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=1492d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=1496d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=1500d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=1504d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=1768d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=1772d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=1776d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=1780d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=1784d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=1788d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=1792d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=1796d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=2060d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=2064d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=2068d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=2072d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=2076d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=2080d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=2084d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=2088d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=2352d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=2356d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=2360d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=2364d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=2368d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=2372d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=2376d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=2380d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=2644d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=2648d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=2652d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=2656d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=2660d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=2664d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=2668d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size=100&after=2672d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=SecurityAnalysis&size

### Check quality of data before exporting to csv

In [181]:
securityanalysissubmissions.shape

(8472, 9)

In [182]:
securityanalysissubmissions.head()

Unnamed: 0,title,selftext,subreddit,created_utc,author,num_comments,score,is_self,timestamp
0,"Any success stories of ordinary, retail value ...",Wondering how effective value investing can be...,SecurityAnalysis,1615539130,Past_Sir,0,1,True,2021-03-12
1,Home Depot Write-Up,2-Page write up about Home Depot\n\n* If you a...,SecurityAnalysis,1615664231,requantify,10,9,True,2021-03-13
2,Please help interpret this passage:,It seems to me that Leggate prefers oil compan...,SecurityAnalysis,1615830873,rhetorical_twix,4,1,True,2021-03-15
3,DTCC Amendments Regarding Supplemental Liquidi...,Thought this was interesting enough to share. ...,SecurityAnalysis,1615248072,ilikepancakez,0,1,True,2021-03-08
4,300 Year Anniversary of the South Sea Bubble -...,http://renmac.com/Podcast-AndrewOdlyzko-012820...,SecurityAnalysis,1615254874,dect60,1,1,True,2021-03-08


In [183]:
securityanalysissubmissions.isnull().sum()

title            0
selftext        51
subreddit        0
created_utc      0
author           0
num_comments     0
score            0
is_self          0
timestamp        0
dtype: int64

In [184]:
substituteincolumn(securityanalysissubsmissions, 'selftext')
substituteincolumn(securityanalysissubmissions, 'author')

In [185]:
securityanalysissubmissions.isnull().sum()

title             0
selftext         51
subreddit         0
created_utc       0
author          461
num_comments      0
score             0
is_self           0
timestamp         0
dtype: int64

In [188]:
securityanalysissubmissions.dropna(inplace=True)

In [189]:
# Create new column called clean text with clean_strings function applied to selftext
securityanalysissubmissions['cleantext'] = clean_strings(securityanalysissubmissions['selftext'], stopwords=combined_words)

In [190]:
# Add column with length of selftext
securityanalysissubmissions['selftextwordcount'] = securityanalysissubmissions['selftext'].str.split().str.len()

In [None]:
# Consider Dropping Selftext before exporting to avoid issues

In [257]:
securityanalysissubmissions.isnull().sum()

title                0
selftext             0
subreddit            0
author               0
num_comments         0
score                0
timestamp            0
cleantext            0
selftextwordcount    0
dtype: int64

In [258]:
securityanalysissubmissions.dtypes

title                object
selftext             object
subreddit            object
author               object
num_comments          int64
score                 int64
timestamp            object
cleantext            object
selftextwordcount     int64
dtype: object

In [288]:
securityanalysissubsmissions.isnull().sum()

title              0
selftext        1480
subreddit          0
created_utc        0
author             0
num_comments       0
score              0
is_self            0
timestamp          0
dtype: int64

In [287]:
securityanalysissubmissions.isnull().sum()

title                0
selftext             0
subreddit            0
author               0
num_comments         0
score                0
timestamp            0
cleantext            0
selftextwordcount    0
dtype: int64

In [None]:
# securityanalysissubmissions.drop(columns=['created_utc','is_self'], inplace=True)

In [211]:
# Save data set to csv in data folder
securityanalysissubmissions.to_csv('./data/securityanalysissubmissions.csv', index = False)

# Merging 4 Datasets
- investing submissions
- GME submissions
- wallstreetbets submissions
- SecurityAnalysis submissions

In [264]:
investing=pd.read_csv('./data/Submissions/investingallsubmissions.csv', lineterminator='\n') #dtype='object')
investing.shape

(146053, 11)

In [238]:
gmesubmissions=pd.read_csv('./data/Submissions/GMEallsubmissions.csv')
gmesubmissions.shape

(1130, 11)

In [247]:
wallstreetbets=pd.read_csv('./data/Submissions/wsballsubmissions.csv', lineterminator='\n')
wallstreetbets.shape

(28814, 11)

In [267]:
securityanalysis=pd.read_csv('./data/Submissions/securityanalysissubmissions.csv', lineterminator='\n')
securityanalysis.shape

(8011, 9)

In [263]:
wsbsubmissions.isnull().sum()

title                0
selftext             0
subreddit            0
created_utc          0
author               0
num_comments         0
score                0
is_self              0
timestamp            0
cleantext            0
selftextwordcount    0
dtype: int64

In [261]:
securityanalysissubmissions.isnull().sum()

title                0
selftext             0
subreddit            0
author               0
num_comments         0
score                0
timestamp            0
cleantext            0
selftextwordcount    0
dtype: int64

In [290]:
investingsubsmissions.isnull().sum()

title                0
selftext             0
subreddit            0
author               0
num_comments         0
score                0
timestamp            0
cleantext            0
selftextwordcount    0
dtype: int64

In [272]:
GMEsubmissions.isnull().sum()

title                0
selftext             0
subreddit            0
created_utc          0
author               0
num_comments         0
score                0
is_self              0
timestamp            0
cleantext            0
selftextwordcount    0
dtype: int64

In [292]:
# Check for compatibility in dimensions of data sets
investingsubsmissions.shape, GMEsubmissions.shape, wsbsubmissions.shape, securityanalysissubmissions.shape,

((146053, 9), (1130, 9), (28814, 9), (8011, 9))

In [277]:
investingsubsmissions.drop(columns=['created_utc','is_self'], inplace=True)
GMEsubmissions.drop(columns=['created_utc','is_self'], inplace=True)
wsbsubmissions.drop(columns=['created_utc','is_self'], inplace=True)

In [284]:
# Check for compatibility in dimensions of data sets
investingsubsmissions.shape, GMEsubmissions.shape, wsbsubmissions.shape, securityanalysissubsmissions.shape

((146053, 9), (1130, 9), (28814, 9), (7546, 9))

In [280]:
investingsubsmissions.head(0)

Unnamed: 0,title,selftext,subreddit,author,num_comments,score,timestamp,cleantext,selftextwordcount


In [281]:
GMEsubmissions.head(0)

Unnamed: 0,title,selftext,subreddit,author,num_comments,score,timestamp,cleantext,selftextwordcount


In [282]:
wsbsubmissions.head(0)

Unnamed: 0,title,selftext,subreddit,author,num_comments,score,timestamp,cleantext,selftextwordcount


In [294]:
securityanalysissubmissions.head(0)

Unnamed: 0,title,selftext,subreddit,author,num_comments,score,timestamp,cleantext,selftextwordcount


In [422]:
sub = pd.concat([investingsubsmissions,
                         GMEsubmissions,
                         wsbsubmissions,
                         securityanalysissubmissions], axis=0)

In [433]:
sub.reset_index(inplace=True, drop=True)

In [424]:
investingsubsmissions.shape, GMEsubmissions.shape, wsbsubmissions.shape, securityanalysissubmissions.shape

((146053, 9), (1130, 9), (28814, 9), (8011, 9))

In [434]:
sub.shape

(184008, 9)

In [435]:
sub.head(1)

Unnamed: 0,title,selftext,subreddit,author,num_comments,score,timestamp,cleantext,selftextwordcount
0,Vanguard's poor ESG track record,I know there's a huge consensus for vanguard f...,investing,Dazen91,15,1,2021-03-14,know theres huge consensus vanguard funds low ...,129


In [436]:
type(sub)

pandas.core.frame.DataFrame

In [437]:
sub.loc[1,'cleantext']

'question advice personal situation questions include relevant information following old country live employedmaking income objectives money buy house retirement savings time horizon need money month yrs risk tolerance mind risking blackjack need know safe current holdings exposure specific funds sectors assets big debts include rate expenses relevant financial information useful proper answer consider consulting faq wwwredditcomrinvestingwikifaq barwwwredditcomrinvestingaboutsidebar useful resources aware answers opinions redditors used starting point research strongly consider seeing registered financial rep making financial decisions'

In [438]:
sub.isnull().sum()

title                0
selftext             0
subreddit            0
author               0
num_comments         0
score                0
timestamp            0
cleantext            0
selftextwordcount    0
dtype: int64

In [439]:
sub.dtypes

title                object
selftext             object
subreddit            object
author               object
num_comments          int64
score                 int64
timestamp            object
cleantext            object
selftextwordcount     int64
dtype: object

In [440]:
sub['title'].astype(object)               
sub['selftext'].astype(object)           
sub['subreddit'].astype(object)        
sub['author'].astype(object)
sub['num_comments'].astype(int)
sub['score'].astype(int)
sub['timestamp'] = pd.to_datetime(sub['timestamp'])
sub['cleantext'].astype(object)
sub['selftextwordcount'].astype(int)

sub.dtypes

title                        object
selftext                     object
subreddit                    object
author                       object
num_comments                  int64
score                         int64
timestamp            datetime64[ns]
cleantext                    object
selftextwordcount             int64
dtype: object

In [441]:
sub['subreddit'].value_counts()

investing           146053
wallstreetbets       28814
SecurityAnalysis      8011
GME                   1130
Name: subreddit, dtype: int64

In [None]:
submissions.drop(columns=['selftext'],inplace=True)

In [394]:
submissions.isnull().sum()

title                0
subreddit            0
author               0
num_comments         0
score                0
timestamp            0
cleantext            0
selftextwordcount    0
dtype: int64

In [443]:
# Save data set to csv in data folder
sub.to_csv('./data/submitwithselftext.csv', index = False)

In [398]:
# Save data set to csv in data folder
submissions.to_csv('./data/submissions.csv', index = False)

In [399]:
submits = pd.read_csv('./data/submissions.csv', lineterminator='\n')

In [400]:
submits.isnull().sum()

title                    0
subreddit                0
author                   0
num_comments             0
score                    0
timestamp                0
cleantext            18336
selftextwordcount        0
dtype: int64

In [401]:
submits.dropna(inplace=True)

In [402]:
submits.isnull().sum()

title                0
subreddit            0
author               0
num_comments         0
score                0
timestamp            0
cleantext            0
selftextwordcount    0
dtype: int64

In [403]:
submits.to_csv('./data/submits.csv', index = False)

In [404]:
submit=pd.read_csv('./data/submits.csv')

In [405]:
submit.isnull().sum()

title                0
subreddit            0
author               0
num_comments         0
score                0
timestamp            0
cleantext            0
selftextwordcount    0
dtype: int64

In [406]:
submit.shape

(165672, 8)

In [407]:
submissions['subreddit'].value_counts()

investing           146053
wallstreetbets       28814
SecurityAnalysis      8011
GME                   1130
Name: subreddit, dtype: int64

In [408]:
submit['subreddit'].value_counts()

investing           131046
wallstreetbets       25906
SecurityAnalysis      7621
GME                   1099
Name: subreddit, dtype: int64

## Issues to Be Addressed: Notated Below

# FURTHER DATA CLEANING + CHECKS

In [409]:
submit.dtypes

title                object
subreddit            object
author               object
num_comments          int64
score                 int64
timestamp            object
cleantext            object
selftextwordcount     int64
dtype: object

In [410]:
submit.describe()

Unnamed: 0,num_comments,score,selftextwordcount
count,165672.0,165672.0,165672.0
mean,24.722584,14.917789,102.246487
std,183.808128,143.135035,183.250636
min,0.0,0.0,1.0
25%,3.0,1.0,30.0
50%,7.0,1.0,60.0
75%,17.0,5.0,113.0
max,35566.0,27161.0,8403.0


In [412]:
submit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165672 entries, 0 to 165671
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   title              165672 non-null  object
 1   subreddit          165672 non-null  object
 2   author             165672 non-null  object
 3   num_comments       165672 non-null  int64 
 4   score              165672 non-null  int64 
 5   timestamp          165672 non-null  object
 6   cleantext          165672 non-null  object
 7   selftextwordcount  165672 non-null  int64 
dtypes: int64(3), object(5)
memory usage: 10.1+ MB


In [417]:
# Select documents where Title or Selftext contain GME or gme or GameStop 
(submit['cleantext'].str.contains('GME') | submit['cleantext'].str.contains('gme')).mean()

0.010653580568834807

In [421]:
# Consider using tesla / I don't see enough data here
(submit['cleantext'].str.contains('Tesla') | submit['cleantext'].str.contains('tsla')).mean()

0.010152590661065237