In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import datetime as dt
import time
import requests

import gensim.downloader as api
from gensim.models.word2vec import Word2Vec
from transformers import pipeline

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, plot_confusion_matrix



In [3]:
# cleaning strings
def clean_strings(sentences, stopwords = None):
    """Normalise raw text into lowercase, letters-only, stopword-free strings.

    Each sentence is lowercased, stripped of URLs, has newlines/tabs turned
    into spaces, loses every character that is neither a letter nor a space,
    and finally has its stopwords removed. Runs of whitespace collapse to a
    single space because the words are re-joined after splitting.

    Parameters
    ----------
    sentences : iterable of str
        Raw texts to clean.
    stopwords : iterable of str, optional
        Words to drop; compared case-insensitively. Defaults to no stopwords.

    Returns
    -------
    list of str
        One cleaned string per input sentence, in the same order.

    Notes
    -----
    Stopwords are compared *after* punctuation has been stripped from the
    text, so an entry that itself contains punctuation (e.g. "don't") can
    never match a cleaned word ("dont").
    """
    import re  # kept local: `re` is not part of the notebook's import cell

    # A set gives O(1) membership tests instead of scanning a list per word.
    stopword_set = {word.lower() for word in (stopwords or ())}

    # Match a URL wherever it appears and stop at the first whitespace.
    # (The previous pattern was anchored with `^` and ended in `.*`, so it
    # only fired when a line *started* with a URL and then deleted the rest
    # of that line as well.)
    url_pattern = re.compile(r'https?://\S+')

    output = []
    for sentence in sentences:
        text = url_pattern.sub('', sentence.lower())
        # newlines and tabs become word separators
        text = text.replace('\n', ' ').replace('\t', ' ')
        # digits, punctuation and any other non-letter characters are dropped
        text = ''.join(char for char in text if char.isalpha() or char == ' ')
        # stopwords are removed and whitespace is normalised
        output.append(' '.join(word for word in text.split() if word not in stopword_set))
    return output
    

In [112]:
# helper function for query_pushshift
def post_grabber(stem, current_day_window):
    """Run one Pushshift query and return its usable posts as a DataFrame.

    The request URL is `stem` with `&after=<current_day_window>d` appended.
    Rows are discarded when their `selftext` is '[removed]', '[deleted]' or
    missing, or when their `author` is 'AutoModerator'.

    Parameters
    ----------
    stem : str
        Query URL up to (not including) the `after` parameter.
    current_day_window : int
        Number of days placed in the `after=<N>d` parameter.

    Returns
    -------
    pandas.DataFrame
        The filtered rows of the response's 'data' list. An empty frame is
        returned when the response contains no rows.

    Raises
    ------
    AssertionError
        If the HTTP status code is not 200.
    """
    URL = "{}&after={}d".format(stem, current_day_window)
    print("Querying from: " + URL)
    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(URL, timeout = 60)
    if response.status_code != 200:
        print(response)
        # Raised explicitly (rather than via `assert`) so the check still
        # runs under `python -O`; the exception type is unchanged.
        raise AssertionError(
            "Unexpected status code {} for {}".format(response.status_code, URL)
        )
    mine = response.json()['data']
    df = pd.DataFrame(mine)

    # An empty 'data' list yields a frame with no columns; indexing
    # df['selftext'] on it would raise KeyError, so return it untouched.
    if df.empty:
        return df

    if 'selftext' in df.columns:
        # removes removed/deleted posts
        df = df[~df['selftext'].isin(['[removed]', '[deleted]'])]
        # drops rows without any selftext
        df = df[df['selftext'].notna()]
    if 'author' in df.columns:
        # removes AutoMod posts
        df = df[df['author'] != 'AutoModerator']

    return df

In [113]:
# scrapes until desired conditions
def query_pushshift(subreddit, kind = 'submission', day_window = 30, n = 5, grab_size= 100, target = 100,
                    sleep_seconds = 10, max_extra_queries = None):
    """Scrape a subreddit through Pushshift until at least `target` rows are collected.

    First `n` queries are issued, each one moving the `after=<N>d` window
    `day_window` days further than the previous one. If the combined result
    is still shorter than `target`, more queries are issued (continuing to
    widen the window) until it is long enough.

    For `kind == "submission"` the result is restricted to a fixed set of
    columns, de-duplicated, and filtered to rows whose `is_self` is True.
    Other kinds are concatenated as returned by `post_grabber`.

    Parameters
    ----------
    subreddit : str
        Subreddit name placed in the query string.
    kind : str
        Endpoint suffix; only "submission" triggers the clean-up above.
    day_window : int
        Days added to the `after` window before every query.
    n : int
        Number of initial queries.
    grab_size : int
        Value of the `size` query parameter.
    target : int
        Minimum number of rows wanted in the result.
    sleep_seconds : float
        Pause after every query (default 10, the previously hard-coded value).
    max_extra_queries : int or None
        Cap on the number of top-up queries issued after the first `n`.
        `None` (default) keeps the original unbounded behaviour, which never
        returns if `target` cannot be reached.

    Returns
    -------
    pandas.DataFrame
        The collected rows plus a `timestamp` column derived from
        `created_utc`. The row index is not reset.
    """
    SUBFIELDS = ['title', 'selftext', 'subreddit', 'created_utc', 'author', 'num_comments', 'score', 'is_self']

    def _keep_unique_self_posts(frame):
        # Single home for the submission clean-up (it used to be duplicated).
        # Non-inplace calls avoid mutating a slice of another frame.
        subset = frame[SUBFIELDS].drop_duplicates()
        return subset.loc[subset['is_self'] == True]

    # establish base url and stem
    BASE_URL = f"https://api.pushshift.io/reddit/search/{kind}" # also known as the "API endpoint"
    stem = f"{BASE_URL}?subreddit={subreddit}&size={grab_size}" # page size is whatever `grab_size` says

    # temp storage for the initial batch of frames
    posts = []

    # daywindow tracker
    current_day_window = 0

    for _ in range(n):
        current_day_window += day_window
        posts.append(post_grabber(stem, current_day_window))
        time.sleep(sleep_seconds)

    full = pd.concat(posts, sort=False)
    if kind == "submission":
        full = _keep_unique_self_posts(full)

    # keep adding posts until the length reaches the target
    extra_queries = 0
    while full.shape[0] < target:
        if max_extra_queries is not None and extra_queries >= max_extra_queries:
            print("Stopping early: {} of {} rows after {} extra queries".format(
                full.shape[0], target, extra_queries))
            break

        current_day_window += day_window
        df = post_grabber(stem, current_day_window)
        extra_queries += 1

        full = pd.concat([full, df], sort=False)
        if kind == "submission":
            full = _keep_unique_self_posts(full)

        print(full.shape[0])
        time.sleep(sleep_seconds)

    # create `timestamp` column; `assign` returns a new frame instead of
    # writing into a slice. NOTE: date.fromtimestamp converts using the
    # machine's local timezone.
    full = full.assign(timestamp = full["created_utc"].map(dt.date.fromtimestamp))

    print(full.shape[0])

    print("Query Complete!")
    return full

In [115]:
# Scrape r/nosleep self-posts: 5 initial queries, widening the `after`
# window by 3 days each time, then keep querying until at least 10,000
# rows have been collected.
nosleep = query_pushshift(
    'nosleep',
    kind = 'submission',
    day_window = 3,
    n = 5,
    grab_size = 100,
    target = 10000,
)

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=nosleep&size=100&after=3d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=nosleep&size=100&after=6d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=nosleep&size=100&after=9d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=nosleep&size=100&after=12d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=nosleep&size=100&after=15d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=nosleep&size=100&after=18d
483
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=nosleep&size=100&after=21d
565
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=nosleep&size=100&after=24d
642
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=nosleep&size=100&after=27d
719
Querying from: https://api.pushshift.io/reddit/search/submission?subr

5089
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=nosleep&size=100&after=234d
5157
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=nosleep&size=100&after=237d
5234
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=nosleep&size=100&after=240d
5299
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=nosleep&size=100&after=243d
5376
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=nosleep&size=100&after=246d
5449
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=nosleep&size=100&after=249d
5526
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=nosleep&size=100&after=252d
5597
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=nosleep&size=100&after=255d
5667
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=nosleep&size=100&after=258d
5712
Querying from: https://

In [123]:
# Persist the scraped r/nosleep posts; `index = False` leaves the DataFrame's
# row index out of the CSV file.
nosleep.to_csv('./data/nosleep.csv', index = False)

In [119]:
# Preview the first rows to sanity-check the scraped columns.
nosleep.head()

Unnamed: 0,title,selftext,subreddit,created_utc,author,num_comments,score,is_self,timestamp
1,The Leatherman,The Leatherman was local folklore; virtually e...,nosleep,1627608317,TheCrookedBoy,12,1,True,2021-07-29
2,That Thing,"\n\nAs a forensic cleaner, there are many thi...",nosleep,1627609186,Defiant_Initial_6866,1,1,True,2021-07-29
3,Smile,I had just woke up in the middle of the night ...,nosleep,1627610252,SnooDoubts6715,1,1,True,2021-07-29
4,I made a list ranking my neighbours. Kids thes...,"Hello, it's Renee again! I'm really sorry abou...",nosleep,1627610326,may-june-sweet-july,1,1,True,2021-07-29
5,What I did in my past could cost me everything...,Part I linked here:\n\n[https://www.reddit.com...,nosleep,1627610606,thescarlettpriest,1,1,True,2021-07-29


In [133]:
# (rows, columns) of the scraped frame; the row count should be at least the
# `target` passed to query_pushshift.
nosleep.shape

(10027, 9)

In [129]:
%%time
tifu = query_pushshift('idontworkherelady', 'submission', 5, 5, 100, 10000)

Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=idontworkherelady&size=100&after=5d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=idontworkherelady&size=100&after=10d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=idontworkherelady&size=100&after=15d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=idontworkherelady&size=100&after=20d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=idontworkherelady&size=100&after=25d
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=idontworkherelady&size=100&after=30d
134
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=idontworkherelady&size=100&after=35d
158
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=idontworkherelady&size=100&after=40d
181
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=idontworkherelady&

2108
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=idontworkherelady&size=100&after=360d
2150
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=idontworkherelady&size=100&after=365d
2205
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=idontworkherelady&size=100&after=370d
2256
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=idontworkherelady&size=100&after=375d
2302
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=idontworkherelady&size=100&after=380d
2348
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=idontworkherelady&size=100&after=385d
2392
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=idontworkherelady&size=100&after=390d
2440
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=idontworkherelady&size=100&after=395d
2491
Querying from: https://api.pushshift.io/reddit/sear

5279
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=idontworkherelady&size=100&after=710d
5340
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=idontworkherelady&size=100&after=715d
5377
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=idontworkherelady&size=100&after=720d
5438
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=idontworkherelady&size=100&after=725d
5488
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=idontworkherelady&size=100&after=730d
5537
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=idontworkherelady&size=100&after=735d
5606
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=idontworkherelady&size=100&after=740d
5673
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=idontworkherelady&size=100&after=745d
5734
Querying from: https://api.pushshift.io/reddit/sear

9412
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=idontworkherelady&size=100&after=1060d
9432
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=idontworkherelady&size=100&after=1065d
9468
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=idontworkherelady&size=100&after=1070d
9501
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=idontworkherelady&size=100&after=1075d
9540
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=idontworkherelady&size=100&after=1080d
9563
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=idontworkherelady&size=100&after=1085d
9587
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=idontworkherelady&size=100&after=1090d
9615
Querying from: https://api.pushshift.io/reddit/search/submission?subreddit=idontworkherelady&size=100&after=1095d
9644
Querying from: https://api.pushshift.io/red

In [130]:
# Preview the first rows; despite its name, `tifu` holds the posts scraped
# from 'idontworkherelady'.
tifu.head()

Unnamed: 0,title,selftext,subreddit,created_utc,author,num_comments,score,is_self,timestamp
0,That’s not my job!,Tonight I visited my old stomping grounds. I u...,IDontWorkHereLady,1627450212,No-Temperature-3698,51,1,True,2021-07-27
1,Just because I put stuff back into the shelf d...,The other day I was at our local drugstore. I ...,IDontWorkHereLady,1627460348,IamasimpforObi-Wan,50,1,True,2021-07-28
4,I'm not sure you should be shopping unsupervis...,This happened to me yesterday and is a little ...,IDontWorkHereLady,1627491179,_Eru_Illuvatar_,142,1,True,2021-07-28
5,"You don’t work here, but would you like to?","Many years ago, I worked for Home Depot as a l...",IDontWorkHereLady,1627498780,jmowreader,10,1,True,2021-07-28
6,You must be new in town,I was in the grocery store and had a basket wi...,IDontWorkHereLady,1627519283,betheliquor,73,1,True,2021-07-28


In [132]:
# (rows, columns) of the scraped frame; the row count should be at least the
# `target` passed to query_pushshift.
tifu.shape

(10010, 9)

In [131]:
# Persist the scraped 'idontworkherelady' posts (held in `tifu`);
# `index = False` leaves the DataFrame's row index out of the CSV file.
tifu.to_csv('./data/idontworkherelady.csv', index = False)