### Intro

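This notebook is where I learned to pull in posts from the boardgames subreddit! The cells below apply the same approach, via the Pushshift API, to other subreddits: DMAcademy, truezelda, politics, and PoliticalDiscussion.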

In [2]:
import re
import requests
import pandas as pd
import matplotlib.pyplot as plt
import time

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [6]:
# Pushshift search endpoint for submissions (posts). This is a notebook-level
# global: get_posts() reads `url` at call time, so reassigning it later changes
# which endpoint subsequent calls hit.
url = 'https://api.pushshift.io/reddit/search/submission'

In [7]:
# Single test request: ask for up to 100 DMAcademy items created before "now".
params = {
'subreddit': 'DMAcademy',
'size': 100,
# current Unix time in whole seconds
'before':int(time.time())}

# `res` is kept around so the next cell can inspect res.status_code.
res = requests.get(url, params)
data = res.json()
# The returned records live under the 'data' key of the JSON payload.
posts = data['data']
df = pd.DataFrame(posts)

In [8]:
res.status_code

200

In [11]:
def get_posts(iters, subreddit, current_time, delay=60, endpoint=None, timeout=30):
    """Page backwards in time through a subreddit's Pushshift search results.

    Each iteration requests up to 100 items created before ``current_time``,
    then moves ``current_time`` to the smallest ``created_utc`` in that page
    so the next request continues with older items.

    Parameters
    ----------
    iters : int
        Maximum number of requests to make.
    subreddit : str
        Subreddit name passed to the API as the ``subreddit`` parameter.
    current_time : int
        Unix timestamp; only items created before it are requested first.
    delay : float, default 60
        Seconds to sleep between consecutive requests. No sleep is performed
        after the final request.
    endpoint : str, optional
        Search endpoint to query. When omitted, the notebook-level ``url``
        variable is used, which keeps existing calls working unchanged.
    timeout : float, default 30
        Per-request timeout in seconds handed to ``requests.get``.

    Returns
    -------
    pandas.DataFrame
        All fetched pages stacked with a fresh integer index. The most
        recently fetched (oldest) page comes first. An empty frame is
        returned when nothing was fetched.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    """
    # Fall back to the notebook global so callers that reassign `url`
    # between cells still control the endpoint.
    if endpoint is None:
        endpoint = url

    # Collect pages and concatenate once at the end instead of rebuilding
    # the full frame on every iteration.
    batches = []

    for i in range(iters):
        params = {
            'subreddit': subreddit,
            'size': 100,
            'before': current_time}

        res = requests.get(endpoint, params, timeout=timeout)
        # Fail loudly on an error status instead of on a confusing
        # JSON/KeyError further down.
        res.raise_for_status()
        posts = res.json().get('data') or []

        # An empty page means there is nothing older left to fetch; without
        # this guard the 'created_utc' lookup below raises KeyError.
        if not posts:
            print(f'No more posts returned after {i} iterations, stopping early')
            break

        batch = pd.DataFrame(posts)
        batches.append(batch)

        # Plain int keeps the query parameter a simple integer.
        current_time = int(batch['created_utc'].min())

        print(f'Completed {i+1} iterations, {iters-i-1} iterations remaining')

        # Only pause when another request will follow.
        if i < iters - 1:
            time.sleep(delay)

    if not batches:
        return pd.DataFrame()

    # Reverse so the last page fetched comes first, matching the row and
    # column order produced by prepending each page to the running frame.
    return pd.concat(batches[::-1], axis=0, ignore_index=True)

In [4]:
int(time.time())

1617139501

In [12]:
dnd_df = get_posts(50, 'DMAcademy', int(time.time()))

Completed 1 iterations, 49 iterations remaining
Completed 2 iterations, 48 iterations remaining
Completed 3 iterations, 47 iterations remaining
Completed 4 iterations, 46 iterations remaining
Completed 5 iterations, 45 iterations remaining
Completed 6 iterations, 44 iterations remaining
Completed 7 iterations, 43 iterations remaining
Completed 8 iterations, 42 iterations remaining
Completed 9 iterations, 41 iterations remaining
Completed 10 iterations, 40 iterations remaining
Completed 11 iterations, 39 iterations remaining
Completed 12 iterations, 38 iterations remaining
Completed 13 iterations, 37 iterations remaining
Completed 14 iterations, 36 iterations remaining
Completed 15 iterations, 35 iterations remaining
Completed 16 iterations, 34 iterations remaining
Completed 17 iterations, 33 iterations remaining
Completed 18 iterations, 32 iterations remaining
Completed 19 iterations, 31 iterations remaining
Completed 20 iterations, 30 iterations remaining
Completed 21 iterations, 29 i

In [13]:
dnd_df

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_patreon_flair,author_premium,...,link_flair_css_class,post_hint,preview,author_flair_background_color,author_flair_text_color,removed_by_category,gilded,author_cakeday,suggested_sort,banned_by
0,[],False,Vashael,,[],,text,t2_7p5eovfe,False,False,...,,,,,,,,,,
1,[],False,Atarihero76,,[],,text,t2_48oeurxu,False,False,...,Guide,self,"{'enabled': False, 'images': [{'id': '9XSNOgfA...",,,,,,,
2,[],False,JethroBuldean,,[],,text,t2_1plx2r4z,False,False,...,,,,,,,,,,
3,[],False,Mechaaniac,,[],,text,t2_a04c8tpe,False,False,...,,,,,,,,,,
4,[],False,Hungerforhuman,,[],,text,t2_a2ouksrx,False,False,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,[],False,AngelsJos,,[],,text,t2_9571c28u,False,True,...,,,,,,,,,,
4996,[],False,Light_of_Avalon,,[],,text,t2_871qw,False,False,...,,,,,,,,,,
4997,[],False,Randoff-Runemaker,,[],,text,t2_ab1yxy0e,False,False,...,,,,,,,,,,
4998,[],False,Shatyel,,[],,text,t2_1v4sccw1,False,False,...,,,,,,,,,,


In [14]:
dnd_df['full_link'].nunique()

5000

In [15]:
dnd_df.to_csv('./data/dmacademy.csv', index=False)

In [16]:
zelda_df = get_posts(50, 'truezelda', int(time.time()))

Completed 1 iterations, 49 iterations remaining
Completed 2 iterations, 48 iterations remaining
Completed 3 iterations, 47 iterations remaining
Completed 4 iterations, 46 iterations remaining
Completed 5 iterations, 45 iterations remaining
Completed 6 iterations, 44 iterations remaining
Completed 7 iterations, 43 iterations remaining
Completed 8 iterations, 42 iterations remaining
Completed 9 iterations, 41 iterations remaining
Completed 10 iterations, 40 iterations remaining
Completed 11 iterations, 39 iterations remaining
Completed 12 iterations, 38 iterations remaining
Completed 13 iterations, 37 iterations remaining
Completed 14 iterations, 36 iterations remaining
Completed 15 iterations, 35 iterations remaining
Completed 16 iterations, 34 iterations remaining
Completed 17 iterations, 33 iterations remaining
Completed 18 iterations, 32 iterations remaining
Completed 19 iterations, 31 iterations remaining
Completed 20 iterations, 30 iterations remaining
Completed 21 iterations, 29 i

In [17]:
zelda_df

Unnamed: 0,author,author_flair_css_class,author_flair_text,brand_safe,can_mod_post,contest_mode,created_utc,domain,full_link,id,...,updated_utc,steward_reports,og_description,og_title,removed_by_category,removed_by,url_overridden_by_dest,link_flair_template_id,link_flair_text,poll_data
0,xxx_mlgnoscope_xxx,,,True,False,False,1510504803,self.truezelda,https://www.reddit.com/r/truezelda/comments/7c...,7cge06,...,,,,,,,,,,
1,Ender_Skywalker,,,True,False,False,1510444647,self.truezelda,https://www.reddit.com/r/truezelda/comments/7c...,7cbzex,...,,,,,,,,,,
2,[deleted],,,True,False,False,1510444560,self.truezelda,https://www.reddit.com/r/truezelda/comments/7c...,7cbz3u,...,,,,,,,,,,
3,bluestreakxp,,,True,False,False,1510275857,self.truezelda,https://www.reddit.com/r/truezelda/comments/7b...,7bxsop,...,,,,,,,,,,
4,Kholdstare93,,,True,False,False,1510270811,self.truezelda,https://www.reddit.com/r/truezelda/comments/7b...,7bxbp1,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4991,lutyrannus,,,,False,False,1614809046,self.truezelda,https://www.reddit.com/r/truezelda/comments/lx...,lx5kjs,...,,,,,,,,64e0e3ac-d29d-11ea-a357-0e5060aaa7f3,Open Discussion,
4992,novacav,,,,False,False,1614794221,self.truezelda,https://www.reddit.com/r/truezelda/comments/lx...,lx01je,...,,,,,,,,64e0e3ac-d29d-11ea-a357-0e5060aaa7f3,Open Discussion,
4993,[deleted],,,,False,False,1614789322,self.truezelda,https://www.reddit.com/r/truezelda/comments/lw...,lwy2zq,...,,,,,deleted,,,7cfab9de-d5b0-11ea-86b5-0e8cdba62949,Question,
4994,yuckygross,,,,False,False,1614737483,self.truezelda,https://www.reddit.com/r/truezelda/comments/lw...,lwinf0,...,,,,,,,,7cfab9de-d5b0-11ea-86b5-0e8cdba62949,Question,


In [None]:
# for col in zelda_df.columns:
#     if col not in dnd_df.columns:
#         print(col)

In [20]:
zelda_df['full_link'].nunique()

4996

In [19]:
zelda_df.to_csv('./data/truezelda.csv', index=False)

### Pick up here!!!

So far we've accomplished:
* getting data - 200 subreddit posts so far (not comments)
* count vectorizing the subreddit posts
* passing data to TWO MODELS
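    * Bernoulli Naive Bayes model: for binary (0/1) features, i.e. whether a word appears in a post at all.
    * Multinomial Naive Bayes model with TF-IDF features: designed for non-negative count-like features such as word counts; fractional TF-IDF weights also work in practice.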

To do next:
* get MORE DATA - source for pulling data on time delay: https://gist.github.com/tecoholic/1242694
* get different types of data - try comments! try using titles alongside selftext! 

In [4]:
# Switch to the comment search endpoint. get_posts() reads this global `url`,
# so every get_posts() call after this cell hits the comment endpoint until
# `url` is reassigned.
url = 'https://api.pushshift.io/reddit/search/comment'

In [27]:
poli_2010_df = get_posts(10, 'politics', 1269814721)

In [28]:
poli_2020_df = get_posts(10, 'politics', 1585433921)

In [31]:
# Tag every row of each pull with a year label. The label is stored as a
# string, not an integer.
poli_2010_df['year'] = '2010'
poli_2020_df['year'] = '2020'

In [34]:
poli_2010_df.to_csv('./data/poli_2010.csv', index=False)
poli_2020_df.to_csv('./data/poli_2020.csv', index=False)

### Political discussion subreddit

In [35]:
# Point the global `url` back at the submission search endpoint; get_posts()
# calls below this cell use it.
url = 'https://api.pushshift.io/reddit/search/submission'

In [38]:
poli_dis_2012_df = get_posts(49, 'PoliticalDiscussion', 1333157554)

Completed 1 iterations, 48 iterations remaining
Completed 2 iterations, 47 iterations remaining
Completed 3 iterations, 46 iterations remaining
Completed 4 iterations, 45 iterations remaining
Completed 5 iterations, 44 iterations remaining
Completed 6 iterations, 43 iterations remaining
Completed 7 iterations, 42 iterations remaining
Completed 8 iterations, 41 iterations remaining
Completed 9 iterations, 40 iterations remaining
Completed 10 iterations, 39 iterations remaining
Completed 11 iterations, 38 iterations remaining
Completed 12 iterations, 37 iterations remaining
Completed 13 iterations, 36 iterations remaining
Completed 14 iterations, 35 iterations remaining
Completed 15 iterations, 34 iterations remaining
Completed 16 iterations, 33 iterations remaining
Completed 17 iterations, 32 iterations remaining
Completed 18 iterations, 31 iterations remaining
Completed 19 iterations, 30 iterations remaining
Completed 20 iterations, 29 iterations remaining
Completed 21 iterations, 28 i

In [31]:
poli_dis_2020_df = get_posts(50, 'PoliticalDiscussion', 1585433921)

Completed 1 iterations, 49 iterations remaining
Completed 2 iterations, 48 iterations remaining
Completed 3 iterations, 47 iterations remaining
Completed 4 iterations, 46 iterations remaining
Completed 5 iterations, 45 iterations remaining
Completed 6 iterations, 44 iterations remaining
Completed 7 iterations, 43 iterations remaining
Completed 8 iterations, 42 iterations remaining
Completed 9 iterations, 41 iterations remaining
Completed 10 iterations, 40 iterations remaining
Completed 11 iterations, 39 iterations remaining
Completed 12 iterations, 38 iterations remaining
Completed 13 iterations, 37 iterations remaining
Completed 14 iterations, 36 iterations remaining
Completed 15 iterations, 35 iterations remaining
Completed 16 iterations, 34 iterations remaining
Completed 17 iterations, 33 iterations remaining
Completed 18 iterations, 32 iterations remaining
Completed 19 iterations, 31 iterations remaining
Completed 20 iterations, 30 iterations remaining
Completed 21 iterations, 29 i

In [39]:
poli_dis_2012_df['year'] = '2012'
#poli_dis_2020_df['year'] = '2020'

In [40]:
poli_dis_2012_df.to_csv('./data/poli_dis_2012.csv', index=False)
#poli_dis_2020_df.to_csv('./data/poli_dis_2020.csv', index=False)

In [42]:
poli_dis_2012_df['full_link'].nunique()

4882