In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

import datetime as dt
import time
import requests

from IPython.core.display import display, HTML

import emoji

# this setting widens how many characters pandas will display in a column:
pd.options.display.max_colwidth = 800


In [17]:
def query_pushshift(subreddit, kind = 'submission', no_of_posts = 100):
    """Collect records from one subreddit through the Pushshift search API.

    Pages backwards in time: every request asks for records created before the
    oldest `created_utc` seen so far, until at least `no_of_posts` records have
    been gathered or the API returns an empty page.

    Parameters
    ----------
    subreddit : str
        Subreddit name, interpolated into the query string.
    kind : str, default 'submission'
        Last path segment of the search endpoint. When it is 'submission' the
        result is restricted to the columns listed in `SUBFIELDS`.
    no_of_posts : int, default 100
        Minimum number of records to collect. Whole pages are kept, so the
        result can hold more rows than requested.

    Returns
    -------
    pandas.DataFrame
        One row per record with a fresh 0..n-1 index. `permalink` (when
        present) is prefixed with "https://reddit.com", and `timestamp` holds
        the UTC calendar date of `created_utc`. To show the links as clickable
        HTML, format `permalink` through `DataFrame.style` at display time.
        An empty frame is returned when nothing could be collected.

    Raises
    ------
    requests.HTTPError, RuntimeError
        If a request does not come back with HTTP status 200.
    """
    SUBFIELDS = ['title', 'selftext', 'subreddit', 'created_utc', 'author', 'num_comments', 'score', 'is_self',
                 'is_crosspostable', 'num_crossposts', 'stickied', 'permalink', 'link_flair_text']

    # establish base url and stem
    BASE_URL = f"https://api.pushshift.io/reddit/search/{kind}" # also known as the "API endpoint"
    # asks for 500 records per page; the API may return fewer than that
    stem = f"{BASE_URL}?subreddit={subreddit}&size=500&author=![deleted]"

    # one DataFrame per page, oldest timestamp seen so far, running row count
    posts = []
    last = ''
    N = 0

    while N < no_of_posts:
        URL = "{}&before={}".format(stem, last)
        print(f'Querying from {subreddit}: {URL}')
        # a timeout keeps a stalled connection from hanging the notebook forever
        response = requests.get(URL, timeout=30)
        if response.status_code != 200:
            # explicit check instead of `assert`, which disappears under `python -O`
            response.raise_for_status()
            raise RuntimeError(f'Unexpected HTTP status {response.status_code} for {URL}')
        mine = response.json()['data']
        if not mine:
            # the subreddit has no older records: stop instead of failing on
            # a frame that has no 'created_utc' column
            print(f'No more posts returned for {subreddit}; stopping at {N}')
            break
        df = pd.DataFrame(mine)
        last = df['created_utc'].iloc[-1]
        N += df.shape[0]
        print (f'Total Posts Collected = {N}')
        posts.append(df)
        if N < no_of_posts:
            # be polite to the API, but only when another request will follow
            time.sleep(4)

    if not posts:
        print(f'Query Complete for {subreddit}!')
        columns = SUBFIELDS + ['timestamp'] if kind == "submission" else None
        return pd.DataFrame(columns=columns)

    full = pd.concat(posts, sort=False)
    print (f'After concat df shape = {full.shape}')
    if kind == "submission":
        # reindex (not full[SUBFIELDS]) so a field missing from every record
        # becomes an all-NaN column instead of a KeyError
        full = full.reindex(columns=SUBFIELDS)
    full = full.reset_index(drop=True)

    # Prefix the relative permalink so each row carries a full link to the post
    if 'permalink' in full.columns:
        full['permalink'] = "https://reddit.com" + full['permalink'].astype(str)

    # create `timestamp` column: `created_utc` is epoch seconds, so convert in
    # UTC rather than in whatever timezone the machine happens to use
    full['timestamp'] = pd.to_datetime(full['created_utc'], unit='s', utc=True).dt.date

    print(f'Query Complete for {subreddit}!')
    return full

In [2]:
def clean_df(full):
    """Filter scraped submissions down to usable text posts and scrub their text.

    Steps, in order (the frame's shape is printed after each stage):
      1. drop exact duplicate rows;
      2. keep rows with `is_self == True`;
      3. keep rows with `is_crosspostable == True`;
      4. keep rows with `stickied == False`;
      5. keep rows with `num_comments > 0`;
      6. strip web links, emoji and any remaining non-ASCII characters from
         `title` and `selftext` (missing values are treated as empty text);
      7. drop rows whose `selftext` is empty after scrubbing.

    Parameters
    ----------
    full : pandas.DataFrame
        Frame holding at least the columns named above. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new, filtered frame that keeps the surviving rows' index labels.
    """
    # drop duplicates (not in place, so the caller's frame stays untouched)
    full = full.drop_duplicates()
    print (f'After drop duplicates df shape = {full.shape}')

    # select `is_self` == True
    full = full.loc[full['is_self'] == True]
    print (f'After is_self=True df shape = {full.shape}')

    # Per the original author's note, is_crosspostable is False for posts
    # whose selftext is "[removed]"; keeping only True discards those.
    full = full.loc[full['is_crosspostable'] == True]
    print (f'After is_crosspostable=True df shape = {full.shape}')

    full = full.loc[full['stickied'] == False]
    print (f'After stickied=False df shape = {full.shape}')

    # explicit copy: the text columns are reassigned below
    full = full.loc[full['num_comments'] > 0].copy()
    print (f'After num_comments > 0 df shape = {full.shape}')

    text_columns = ('title', 'selftext')

    # remove http/https links: match only the URL itself (up to the next
    # whitespace) so the words that follow a link on the same line survive
    http_rule = re.compile(r'https?://\S+')
    for column in text_columns:
        full[column] = full[column].fillna('').astype(str).map(lambda x: http_rule.sub('', x))
    print (f'After removing weblinks df shape = {full.shape}')

    # removing emojis: emoji >= 2.0 offers replace_emoji(); older releases
    # only expose get_emoji_regexp(), which is fetched once, not per row
    if hasattr(emoji, 'replace_emoji'):
        def strip_emoji(text):
            return emoji.replace_emoji(text, replace='')
    else:
        emoji_rule = emoji.get_emoji_regexp()

        def strip_emoji(text):
            return emoji_rule.sub(u'', text)

    # removing ellipsis and every other character outside the ASCII range
    non_ascii_rule = re.compile(r'[^\x00-\x7f]')
    for column in text_columns:
        full[column] = full[column].map(lambda x: non_ascii_rule.sub('', strip_emoji(x)))
    print (f'After removing emojis and other unicode df shape = {full.shape}')

    # removing selftext which is empty
    full = full.loc[full['selftext'].str.len() > 0]
    print (f'After removing blank selftext entry rows = {full.shape}')

    return full

In [21]:
# Pull submissions from each subreddit, stack the per-subreddit frames into
# one renumbered frame and save the raw result to disk.
subreddit = ['beyondthebump', 'BabyBumps']
posts = []
for sub in subreddit:
    results = query_pushshift(subreddit=sub, kind='submission', no_of_posts=7500)
    posts += [results]

df = pd.concat(posts, sort=False).reset_index(drop=True)
df.to_csv('../predictions/df_raw.csv')
print(f'Full dataframe shape after querying both subreddits is {df.shape}')

Querying from beyondthebump: https://api.pushshift.io/reddit/search/submission?subreddit=beyondthebump&size=500&author=![deleted]&before=
Total Posts Collected = 100
Querying from beyondthebump: https://api.pushshift.io/reddit/search/submission?subreddit=beyondthebump&size=500&author=![deleted]&before=1631868926
Total Posts Collected = 200
Querying from beyondthebump: https://api.pushshift.io/reddit/search/submission?subreddit=beyondthebump&size=500&author=![deleted]&before=1631751152
Total Posts Collected = 300
Querying from beyondthebump: https://api.pushshift.io/reddit/search/submission?subreddit=beyondthebump&size=500&author=![deleted]&before=1631648849
Total Posts Collected = 400
Querying from beyondthebump: https://api.pushshift.io/reddit/search/submission?subreddit=beyondthebump&size=500&author=![deleted]&before=1631550911
Total Posts Collected = 500
Querying from beyondthebump: https://api.pushshift.io/reddit/search/submission?subreddit=beyondthebump&size=500&author=![deleted]&

In [116]:
# Clean the combined frame, renumber its rows from zero and persist it.
df = clean_df(df).reset_index(drop=True)
df.to_csv('../data/df.csv')


After drop duplicates df shape = (10603, 15)
After is_self=True df shape = (10603, 15)
After is_crosspostable=True df shape = (10603, 15)
After stickied=True df shape = (10603, 15)
After num_comments > 0 df shape = (10603, 15)
After removing weblinks df shape = (10603, 15)
After removing emojis and other unicode df shape = (10603, 15)
After removing blank selftext entry rows = (10603, 15)


In [48]:
# Count the missing values in every column.
df.isna().sum()

title                  0
selftext               0
subreddit              0
created_utc            0
author                 0
num_comments           0
score                  0
is_self                0
is_crosspostable       0
num_crossposts         0
stickied               0
permalink              0
link_flair_text     1605
timestamp              0
dtype: int64

In [2]:
# Reload the saved frame; the first CSV column is the stored index, which is
# then replaced by a fresh 0..n-1 index.
df = pd.read_csv('../data/df.csv', index_col=0).reset_index(drop=True)


In [3]:
# Count the missing values in every column.
df.isna().sum()

title                  0
selftext               0
subreddit              0
created_utc            0
author                 0
num_comments           0
score                  0
is_self                0
is_crosspostable       0
num_crossposts         0
stickied               0
permalink              0
link_flair_text     1604
timestamp              0
titleselftext          0
dtype: int64

In [5]:
# Sanity check: (rows, columns) of the current frame.
df.shape

(10603, 15)