<img src="http://imgur.com/1ZcRyrc.png" style="float: left; margin: 20px; height: 55px">  
# Project 3: Reddit NLP Classification

Project notebook organisation:<br>
**1 - Data Collection & Gathering** (current notebook)<br>
[2 - Data Cleaning & EDA](./Project_3.2_Data_Cleaning_and_EDA.ipynb)<br>
[3 - Modelling and Evaluation](./Project_3.3_Modelling_and_evaluation.ipynb)<br>

## Part 1: Data Collection & Gathering

In [2]:
# Import libraries
import numpy as np
import pandas as pd
import requests
import datetime

# Show up to 100 columns when a DataFrame is displayed
pd.options.display.max_columns = 100

In [2]:
# Pushshift endpoint that every submission search request is sent to
baseurl = "https://api.pushshift.io/reddit/search/submission"

## Create functions to collect the data

In [3]:
# Build the query parameters for the next page of a subreddit's posts
def get_params(base_df, subreddit, size=100):
    """Return the query parameters used to request the next page of posts.

    Parameters
    ----------
    base_df : pd.DataFrame
        Posts collected so far; must contain a 'created_utc' column.
    subreddit : str
        Name of the subreddit to query.
    size : int, default 100
        Value sent as the 'size' request parameter.

    Returns
    -------
    dict
        Dictionary with the keys 'subreddit', 'size' and 'before', where
        'before' is the 'created_utc' value of the last row of ``base_df``.

    Raises
    ------
    ValueError
        If ``base_df`` has no rows, so there is no last post to page from.
    """
    if len(base_df) == 0:
        raise ValueError('base_df has no rows; cannot determine the "before" timestamp')

    # Positional lookup: works even when the index is not the default 0..n-1
    last_created_utc = base_df['created_utc'].iloc[-1]

    return {
        'subreddit': subreddit,
        'size': size,
        'before': last_created_utc,
    }


In [4]:
# Fetch one page of submissions and return them as a list of dictionaries
def get_posts(params, baseurl = 'https://api.pushshift.io/reddit/search/submission', timeout = 30):
    """Request submissions from ``baseurl`` and return the posts it sends back.

    Parameters
    ----------
    params : dict
        Query parameters sent with the GET request.
    baseurl : str
        URL the request is sent to.
    timeout : float, default 30
        Value passed as ``timeout`` to ``requests.get`` so a stalled
        connection cannot block the notebook forever.

    Returns
    -------
    list
        The value stored under the 'data' key of the JSON response, or an
        empty list when the response status code is not 200.
    """
    res = requests.get(baseurl, params=params, timeout=timeout)

    if res.status_code != 200:
        # Report the failure and hand back an empty page instead of falling
        # through to a return of a variable that was never assigned.
        print('Error:', res.status_code)
        return []

    return res.json()['data']

In [5]:
# Turn a list of posts into a DataFrame
def create_df(posts):
    """Return a DataFrame constructed from ``posts`` with ``pd.DataFrame``."""
    frame = pd.DataFrame(data=posts)
    return frame

In [6]:
# Fetch the next page of posts for a subreddit and append it to the base DataFrame
def update_df(base_df, subreddit):
    """Return ``base_df`` with one more page of ``subreddit`` posts appended.

    The page is requested with the parameters built by ``get_params``,
    downloaded by ``get_posts`` from the module-level ``baseurl`` and turned
    into a DataFrame by ``create_df``. The result is a new DataFrame with a
    fresh 0..n-1 index; ``base_df`` itself is not modified.
    """
    next_page = create_df(get_posts(get_params(base_df, subreddit), baseurl))
    return pd.concat(
        [base_df, next_page],
        axis=0,
        ignore_index=True,
        sort=True,
    )

## r/wallstreetbets: 100 submissions

In [7]:
# Query parameters for the initial r/wallstreetbets pull (first 100 posts)
params_wsb = dict(subreddit='wallstreetbets', size=100)

In [8]:
# Download the first page of r/wallstreetbets posts as a list
posts_wsb = get_posts(params=params_wsb)

In [9]:
# Build the r/wallstreetbets DataFrame from the downloaded posts
wsb = create_df(posts=posts_wsb)

In [10]:
wsb.shape

(100, 76)

In [11]:
wsb['title']

0     Let’s pick a stock and take it to the moon … w...
1     If you can pick one stock (large cap) to inves...
2     I fixed u/CollegeThrowaway___'s meme. It's mor...
3          Anyone up for a four leaf ($CLOV) ER play? 🍀
4     Anyone else holding AMC LEAP calls and what do...
                            ...                        
95                                                Hyzon
96    How I honestly feel about all you Gay Ass BBBY...
97                                                   💎🙌
98                                                Huh??
99                       Breaking Down The Ape Dividend
Name: title, Length: 100, dtype: object

In [12]:
wsb.columns

Index(['all_awardings', 'allow_live_comments', 'author',
       'author_flair_css_class', 'author_flair_richtext', 'author_flair_text',
       'author_flair_type', 'author_fullname', 'author_is_blocked',
       'author_patreon_flair', 'author_premium', 'awarders', 'can_mod_post',
       'contest_mode', 'created_utc', 'domain', 'full_link', 'gildings', 'id',
       'is_created_from_ads_ui', 'is_crosspostable', 'is_meta',
       'is_original_content', 'is_reddit_media_domain', 'is_robot_indexable',
       'is_self', 'is_video', 'link_flair_background_color',
       'link_flair_css_class', 'link_flair_richtext', 'link_flair_template_id',
       'link_flair_text', 'link_flair_text_color', 'link_flair_type', 'locked',
       'media_only', 'no_follow', 'num_comments', 'num_crossposts', 'over_18',
       'parent_whitelist_status', 'permalink', 'pinned', 'pwls',
       'removed_by_category', 'retrieved_on', 'score', 'selftext',
       'send_replies', 'spoiler', 'stickied', 'subreddit', 'subred

In [13]:
wsb.head()

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_is_blocked,author_patreon_flair,author_premium,awarders,can_mod_post,contest_mode,created_utc,domain,full_link,gildings,id,is_created_from_ads_ui,is_crosspostable,is_meta,is_original_content,is_reddit_media_domain,is_robot_indexable,is_self,is_video,link_flair_background_color,link_flair_css_class,link_flair_richtext,link_flair_template_id,link_flair_text,link_flair_text_color,link_flair_type,locked,media_only,no_follow,num_comments,num_crossposts,over_18,parent_whitelist_status,permalink,pinned,pwls,removed_by_category,retrieved_on,score,selftext,send_replies,spoiler,stickied,subreddit,subreddit_id,subreddit_subscribers,subreddit_type,suggested_sort,thumbnail,title,total_awards_received,treatment_tags,upvote_ratio,url,whitelist_status,wls,post_hint,preview,thumbnail_height,thumbnail_width,url_overridden_by_dest,author_flair_background_color,author_flair_text_color,gallery_data,is_gallery,media_metadata,author_cakeday,author_flair_template_id
0,[],False,Sea-Love238,,[],,text,t2_932192jx,False,False,False,[],False,False,1659851243,self.wallstreetbets,https://www.reddit.com/r/wallstreetbets/commen...,{},wi8bb1,False,False,False,False,False,False,True,False,#0392cf,yolo,"[{'e': 'text', 't': 'YOLO'}]",da18a43a-83c5-11e8-9b6c-0e287561ddb8,YOLO,light,richtext,False,False,True,1,0,True,some_ads,/r/wallstreetbets/comments/wi8bb1/lets_pick_a_...,False,7,moderator,1659851254,1,[removed],True,True,False,wallstreetbets,t5_2th52,12392343,public,confidence,nsfw,Let’s pick a stock and take it to the moon … w...,0,[],1.0,https://www.reddit.com/r/wallstreetbets/commen...,promo_adult_nsfw,3,,,,,,,,,,,,
1,[],False,creepilincolnbot,,[],,text,t2_1xd07zma,False,False,False,[],False,False,1659851241,self.wallstreetbets,https://www.reddit.com/r/wallstreetbets/commen...,{},wi8ba8,False,False,False,False,False,False,True,False,#800080,question,"[{'e': 'text', 't': 'Discussion'}]",96f6c79e-b853-11e5-a4cb-0ebdf030e05d,Discussion,light,richtext,False,False,True,1,0,False,some_ads,/r/wallstreetbets/comments/wi8ba8/if_you_can_p...,False,7,moderator,1659851251,1,[removed],True,False,False,wallstreetbets,t5_2th52,12392343,public,confidence,self,If you can pick one stock (large cap) to inves...,0,[],1.0,https://www.reddit.com/r/wallstreetbets/commen...,some_ads,7,,,,,,,,,,,,
2,[],False,visc0siity,,[],,text,t2_wqh23,False,False,False,[],False,False,1659851146,i.redd.it,https://www.reddit.com/r/wallstreetbets/commen...,{},wi8adn,False,True,False,True,True,True,False,False,#014980,meme,"[{'e': 'text', 't': 'Meme'}]",0513bea8-4f64-11e9-886d-0e2b4fe7300c,Meme,light,richtext,False,False,True,0,0,False,some_ads,/r/wallstreetbets/comments/wi8adn/i_fixed_ucol...,False,7,,1659851157,1,,True,False,False,wallstreetbets,t5_2th52,12392339,public,confidence,https://b.thumbs.redditmedia.com/Vpg9C5wgzm7Vu...,I fixed u/CollegeThrowaway___'s meme. It's mor...,0,[],1.0,https://i.redd.it/en0twti0c8g91.png,some_ads,7,image,"{'enabled': True, 'images': [{'id': 'H0agleacY...",85.0,140.0,https://i.redd.it/en0twti0c8g91.png,,,,,,,
3,[],False,HomelessZillionaire,,[],,text,t2_cq4lklxv,False,False,True,[],False,False,1659850990,reddit.com,https://www.reddit.com/r/wallstreetbets/commen...,{},wi88vl,False,True,False,False,False,True,False,False,#ff4500,chart,"[{'e': 'text', 't': 'Chart'}]",e7bce9ea-4b49-11eb-ad96-0e210774c543,Chart,light,richtext,False,False,True,0,0,False,some_ads,/r/wallstreetbets/comments/wi88vl/anyone_up_fo...,False,7,,1659851001,1,,True,False,False,wallstreetbets,t5_2th52,12392339,public,confidence,https://a.thumbs.redditmedia.com/VQoPKmKO2mkWg...,Anyone up for a four leaf ($CLOV) ER play? 🍀,0,[],1.0,https://event.on24.com/wcc/r/3849203/1384B185E...,some_ads,7,,,64.0,140.0,https://www.reddit.com/gallery/wi88vl,,,{'items': [{'caption': 'Broke out of various d...,True,"{'e5kv4r3ob8g91': {'e': 'Image', 'id': 'e5kv4r...",,
4,[],False,mattpho,,[],,text,t2_4syvimp,False,False,False,[],False,False,1659850701,self.wallstreetbets,https://www.reddit.com/r/wallstreetbets/commen...,{},wi85yc,False,True,False,False,False,True,True,False,#800080,question,"[{'e': 'text', 't': 'Discussion'}]",96f6c79e-b853-11e5-a4cb-0ebdf030e05d,Discussion,light,richtext,False,False,True,0,0,False,some_ads,/r/wallstreetbets/comments/wi85yc/anyone_else_...,False,7,,1659850712,1,"Hi WSB guru,\n\nI tried asking this question i...",True,False,False,wallstreetbets,t5_2th52,12392329,public,confidence,self,Anyone else holding AMC LEAP calls and what do...,0,[],1.0,https://www.reddit.com/r/wallstreetbets/commen...,some_ads,7,,,,,,,,,,,,


## r/stocks: 100 submissions

In [14]:
# Query parameters for the initial r/stocks pull (first 100 posts)
params_stocks = dict(subreddit='stocks', size=100)

In [15]:
# Download the first page of r/stocks posts as a list
posts_stocks = get_posts(params=params_stocks)

In [16]:
# Build the r/stocks DataFrame from the downloaded posts
stocks = create_df(posts=posts_stocks)

In [17]:
stocks['title']

0                    I have 40k, where should I put it?
1                               Sellout strategy. Help!
2     What stocks are best to invest in short term a...
3                                  P/E Misunderstanding
4     Do you think Robinhood becoming available in o...
                            ...                        
95    Jamie "Hurricane" Dimon of Chase reversed posi...
96               Betting against Chinese housing market
97                     $INTZ follow the bread crumbs DD
98                             How Diversified Are You?
99                                CHRS is a rocket ship
Name: title, Length: 100, dtype: object

## Collect 1,900 more posts per subreddit

In [18]:
1900/100

19.0

In [19]:
# Extend the WSB DataFrame with 19 further pulls (1900 more posts)
for _ in range(19):
    wsb = update_df(base_df=wsb, subreddit='wallstreetbets')

In [20]:
wsb.shape

(2000, 82)

In [21]:
wsb.columns

Index(['all_awardings', 'allow_live_comments', 'author', 'author_cakeday',
       'author_flair_background_color', 'author_flair_css_class',
       'author_flair_richtext', 'author_flair_template_id',
       'author_flair_text', 'author_flair_text_color', 'author_flair_type',
       'author_fullname', 'author_is_blocked', 'author_patreon_flair',
       'author_premium', 'awarders', 'can_mod_post', 'contest_mode',
       'created_utc', 'domain', 'full_link', 'gallery_data', 'gildings', 'id',
       'is_created_from_ads_ui', 'is_crosspostable', 'is_gallery', 'is_meta',
       'is_original_content', 'is_reddit_media_domain', 'is_robot_indexable',
       'is_self', 'is_video', 'link_flair_background_color',
       'link_flair_css_class', 'link_flair_richtext', 'link_flair_template_id',
       'link_flair_text', 'link_flair_text_color', 'link_flair_type',
       'live_audio', 'locked', 'media', 'media_embed', 'media_metadata',
       'media_only', 'no_follow', 'num_comments', 'num_crosspost

In [22]:
# Extend the Stocks DataFrame with 19 further pulls (1900 more posts)
for _ in range(19):
    stocks = update_df(base_df=stocks, subreddit='stocks')

In [23]:
stocks.shape

(2000, 70)

In [24]:
stocks.columns

Index(['all_awardings', 'allow_live_comments', 'author', 'author_cakeday',
       'author_flair_background_color', 'author_flair_css_class',
       'author_flair_richtext', 'author_flair_text', 'author_flair_text_color',
       'author_flair_type', 'author_fullname', 'author_is_blocked',
       'author_patreon_flair', 'author_premium', 'awarders', 'banned_by',
       'can_mod_post', 'contest_mode', 'created_utc', 'domain', 'full_link',
       'gildings', 'id', 'is_created_from_ads_ui', 'is_crosspostable',
       'is_meta', 'is_original_content', 'is_reddit_media_domain',
       'is_robot_indexable', 'is_self', 'is_video',
       'link_flair_background_color', 'link_flair_css_class',
       'link_flair_richtext', 'link_flair_template_id', 'link_flair_text',
       'link_flair_text_color', 'link_flair_type', 'locked', 'media_only',
       'no_follow', 'num_comments', 'num_crossposts', 'over_18',
       'parent_whitelist_status', 'permalink', 'pinned', 'post_hint',
       'preview', 'pwls

In [25]:
# Convert the epoch seconds in 'created_utc' to a 'YYYY-MM-DD' date string.
# The conversion is done explicitly in UTC so the resulting date does not
# depend on the time zone of the machine running the notebook.
wsb['created_date'] = pd.to_datetime(wsb['created_utc'], unit='s', utc=True).dt.strftime('%Y-%m-%d')
# Drop the created_utc column now that created_date has been derived from it
wsb = wsb.drop(columns=['created_utc'])

In [26]:
# Convert the epoch seconds in 'created_utc' to a 'YYYY-MM-DD' date string.
# The conversion is done explicitly in UTC so the resulting date does not
# depend on the time zone of the machine running the notebook.
stocks['created_date'] = pd.to_datetime(stocks['created_utc'], unit='s', utc=True).dt.strftime('%Y-%m-%d')
# Drop the created_utc column now that created_date has been derived from it
stocks = stocks.drop(columns=['created_utc'])

In [27]:
# Restrict both DataFrames to the columns listed here, in this order
cols = ['subreddit', 'selftext', 'title', 'created_date']

wsb = wsb.loc[:, cols]
stocks = stocks.loc[:, cols]

In [28]:
# Write the Wallstreetbets submissions DataFrame to CSV without the index
wsb.to_csv(path_or_buf='./data/wallstreetbets.csv', index=False)

In [29]:
# Write the Stocks submissions DataFrame to CSV without the index
stocks.to_csv(path_or_buf='./data/stocks.csv', index=False)