### Intro

This notebook is where I learned to pull Reddit posts (and, later, comments) through the Pushshift API, using subreddits such as boardgames and oceanography!

In [1]:
import re
import requests
import pandas as pd
import matplotlib.pyplot as plt
import time

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [37]:
# Pushshift submission-search endpoint. get_posts() reads this module-level
# name at call time, so whichever `url` assignment ran last decides what a
# pull returns.
# NOTE(review): the saved execution counts are out of order (this cell is
# In [37], the comment-endpoint assignment is In [4], the first pulls are
# In [16]/In [19]), so a top-to-bottom run may hit a different endpoint than
# the saved session did -- confirm which endpoint each pull should use.
url = 'https://api.pushshift.io/reddit/search/submission'

In [26]:
def get_posts(iters, subreddit, current_time, base_url=None, pause=60, session=None):
    """Page backwards through a Pushshift search endpoint and collect the results.

    Every request asks for up to 100 items created before the current cutoff;
    the smallest ``created_utc`` of the returned page becomes the cutoff for
    the next request. Paging stops early when a page comes back empty.

    Parameters
    ----------
    iters : int
        Maximum number of requests (pages) to make.
    subreddit : str
        Value sent as the ``subreddit`` query parameter.
    current_time : int
        Epoch timestamp sent as ``before`` on the first request.
    base_url : str, optional
        Endpoint to query. When omitted, the module-level ``url`` is used as
        it is bound at call time (the original behaviour).
    pause : float, optional
        Seconds to sleep between consecutive requests. Defaults to 60.
    session : object, optional
        HTTP client exposing ``get(url, params=...)`` whose return value has
        ``raise_for_status()`` and ``json()``. Defaults to the ``requests``
        module.

    Returns
    -------
    pandas.DataFrame
        All fetched pages stacked under a fresh integer index, with the page
        fetched last placed first (same row order as before). Empty when no
        page was fetched.

    Raises
    ------
    Exception
        Whatever ``raise_for_status()`` raises for an error response
        (``requests.HTTPError`` with the default client).
    KeyError
        If the JSON payload has no ``'data'`` key or a non-empty page has no
        ``'created_utc'`` field.
    """
    endpoint = url if base_url is None else base_url
    http = requests if session is None else session

    pages = []
    for i in range(iters):
        params = {
            'subreddit': subreddit,
            'size': 100,
            'before': current_time,
        }

        res = http.get(endpoint, params=params)
        # Fail loudly on rate limiting / server errors instead of tripping
        # over an unexpected body further down.
        res.raise_for_status()
        posts = res.json()['data']

        # Nothing older is available: stop rather than indexing a frame that
        # has no 'created_utc' column.
        if not posts:
            break

        page = pd.DataFrame(posts)
        pages.append(page)

        # Oldest item on this page is the cutoff for the next request.
        current_time = int(page['created_utc'].min())

        # Only wait when another request will actually follow.
        if pause and i < iters - 1:
            time.sleep(pause)

    if not pages:
        return pd.DataFrame()

    # A single concat at the end; reversed so the last page fetched comes
    # first, matching the row order the function has always produced.
    return pd.concat(pages[::-1], axis=0, ignore_index=True)

In [16]:
# Two requests of up to 100 items each from r/oceanography, paging back from
# the current time. get_posts sleeps between requests, so this cell is slow.
ocean_df = get_posts(2, 'oceanography', int(time.time()))

In [17]:
# Show the pulled frame; pandas abbreviates the rows and columns it renders.
ocean_df

Unnamed: 0,all_awardings,associated_award,author,author_flair_background_color,author_flair_css_class,author_flair_richtext,author_flair_template_id,author_flair_text,author_flair_text_color,author_flair_type,...,permalink,retrieved_on,score,send_replies,stickied,subreddit,subreddit_id,top_awarded_type,total_awards_received,treatment_tags
0,[],,Lost_Piece_1911,,,[],,,,text,...,/r/oceanography/comments/m239f6/is_there_no_mi...,1615402125,2,True,False,oceanography,t5_2rez7,,0,[]
1,[],,JoaSnick,,,[],,,,text,...,/r/oceanography/comments/m1zq80/can_anyone_exp...,1615395344,1,True,False,oceanography,t5_2rez7,,0,[]
2,[],,RoundAlternative1106,,,[],,,,text,...,/r/oceanography/comments/m1zq80/can_anyone_exp...,1615394345,1,True,False,oceanography,t5_2rez7,,0,[]
3,[],,JoaSnick,,,[],,,,text,...,/r/oceanography/comments/m1zq80/can_anyone_exp...,1615393093,1,True,False,oceanography,t5_2rez7,,0,[]
4,[],,chuckles11,,,[],,,,text,...,/r/oceanography/comments/m0gkp0/ocean_upwellin...,1615300285,1,True,False,oceanography,t5_2rez7,,0,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,[],,learningeverythings,,,[],,,,text,...,/r/oceanography/comments/m239f6/is_there_no_mi...,1615440525,1,True,False,oceanography,t5_2rez7,,0,[]
96,[],,doughnuts58008,,,[],,,,text,...,/r/oceanography/comments/m2eqxo/two_hightech_d...,1615439310,1,True,False,oceanography,t5_2rez7,,0,[]
97,[],,Lost_Piece_1911,,,[],,,,text,...,/r/oceanography/comments/m239f6/is_there_no_mi...,1615438537,1,True,False,oceanography,t5_2rez7,,0,[]
98,[],,BlankVerse,,,[],,,,text,...,/r/oceanography/comments/m2eqxo/two_hightech_d...,1615433057,1,True,False,oceanography,t5_2rez7,,0,[]


In [None]:
# Number of distinct `full_link` values -- presumably a check for repeated rows.
# NOTE(review): this cell has no execution count, so it was not run in the
# saved session; confirm the pulled data actually has a `full_link` column.
ocean_df['full_link'].nunique()

In [24]:
# Persist the pull without the index column. to_csv does not create missing
# directories, so ./data must already exist.
ocean_df.to_csv('./data/oceanography.csv', index=False)

In [19]:
# Two requests of up to 100 items each from r/boardgames, paging back from
# the current time. get_posts sleeps between requests, so this cell is slow.
game_df = get_posts(2, 'boardgames', int(time.time()))

In [20]:
# Show the pulled frame; pandas abbreviates the rows and columns it renders.
game_df

Unnamed: 0,all_awardings,associated_award,author,author_flair_background_color,author_flair_css_class,author_flair_richtext,author_flair_template_id,author_flair_text,author_flair_text_color,author_flair_type,...,retrieved_on,score,send_replies,stickied,subreddit,subreddit_id,top_awarded_type,total_awards_received,treatment_tags,distinguished
0,[],,TurboCooler,,,[],,,,text,...,1616977825,1,True,False,boardgames,t5_2qmjp,,0,[],
1,[],,zehlewe,,,[],,,,text,...,1616977764,1,True,False,boardgames,t5_2qmjp,,0,[],
2,[],,Breakfast-Surreal,,,[],,,,text,...,1616977741,1,True,False,boardgames,t5_2qmjp,,0,[],
3,[],,svanxx,,descent,"[{'e': 'text', 't': 'Descent'}]",,Descent,dark,richtext,...,1616977543,1,True,False,boardgames,t5_2qmjp,,0,[],
4,[],,Spirit_Horseman,,,[],,,,text,...,1616977448,1,True,False,boardgames,t5_2qmjp,,0,[],
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,[],,RedMonsterPowerBang,,,[],,,,text,...,1616978401,1,True,False,boardgames,t5_2qmjp,,0,[],
96,[],,Neembaf,,,[],,,,text,...,1616978253,1,True,False,boardgames,t5_2qmjp,,0,[],
97,[],,TurboCooler,,,[],,,,text,...,1616977980,1,True,False,boardgames,t5_2qmjp,,0,[],
98,[],,svanxx,,descent,"[{'e': 'text', 't': 'Descent'}]",,Descent,dark,richtext,...,1616977889,1,True,False,boardgames,t5_2qmjp,,0,[],


In [None]:
# List, one per line and in game_df's column order, every column that the
# boardgames pull has but the oceanography pull lacks.
for col in game_df.columns:
    if col in ocean_df.columns:
        continue
    print(col)

In [None]:
# Number of distinct `full_link` values -- presumably a check for repeated rows.
# NOTE(review): this cell has no execution count, so it was not run in the
# saved session; confirm the pulled data actually has a `full_link` column.
game_df['full_link'].nunique()

In [25]:
# Persist the pull without the index column. to_csv does not create missing
# directories, so ./data must already exist.
game_df.to_csv('./data/boardgames.csv', index=False)

### Pick up here!!!

So far we've accomplished:
* getting data - 200 subreddit posts so far (not comments)
* count vectorizing the subreddit posts
* passing data to TWO MODELS
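    * Bernoulli Naive Bayes model: for binary (0/1) features, i.e. whether a word appears at all.
    * Multinomial Naive Bayes on TF-IDF features: multinomial NB is designed for non-negative counts (e.g. CountVectorizer output); TF-IDF weights are non-negative but fractional, which still works in practice.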

To do next:
* get MORE DATA - source for pulling data on time delay: https://gist.github.com/tecoholic/1242694
* get different types of data - try comments! try using titles alongside selftext! 

In [4]:
# Rebind the module-level `url` to the Pushshift comment-search endpoint.
# get_posts() reads `url` at call time, so every pull executed after this
# assignment (and before the next rebinding) returns comments.
url = 'https://api.pushshift.io/reddit/search/comment'

In [27]:
# Ten requests of up to 100 items each from r/politics, created before
# 1269814721 (2010-03-28 UTC).
poli_2010_df = get_posts(10, 'politics', 1269814721)

In [28]:
# Ten requests of up to 100 items each from r/politics, created before
# 1585433921 (2020-03-28 UTC).
poli_2020_df = get_posts(10, 'politics', 1585433921)

In [29]:
# Label every row of this pull; the value is stored as the string '2010',
# not as an integer.
poli_2010_df['year'] = '2010'

In [30]:
# Show the labelled frame. The saved output lists comment fields such as
# `body` and `parent_id`.
poli_2010_df

Unnamed: 0,author,author_created_utc,author_flair_css_class,author_flair_text,author_fullname,body,controversiality,created_utc,distinguished,gilded,...,parent_id,reply_delay,retrieved_on,score,score_hidden,subreddit,subreddit_id,user_removed,edited,year
0,Powerfury,1.258768e+09,,,t2_3qers,It depends on how Fox News takes this story.,0,1269804789,,0,...,t1_c0n2fae,3787,1426257058,14,False,politics,t5_2cneq,,,2010
1,[deleted],,,,,"Can we just start calling it the ""White Tea"" p...",0,1269804786,,0,...,t3_bjamc,22624,1426257058,0,False,politics,t5_2cneq,,,2010
2,cinnamonandgravy,1.253603e+09,,,t2_3n496,"sarah palin.\n\ndonkey punch, donkey punch, do...",0,1269804780,,0,...,t3_bjdei,5130,1426257058,5,False,politics,t5_2cneq,,,2010
3,Grokkin_it,1.255903e+09,,,t2_3oklb,&gt; The extreme right is truly about feelings...,0,1269804748,,0,...,t1_c0n2j5k,68,1426257058,2,False,politics,t5_2cneq,,,2010
4,APeacefulWarrior,1.255231e+09,,,t2_3o5g9,"&gt;I'd much rather have a less radical, stron...",0,1269804718,,0,...,t1_c0n2h9o,1835,1426257056,2,False,politics,t5_2cneq,,,2010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,binary_search_tree,1.246097e+09,,,t2_3j076,"This ""Christian group""?\r\n\r\nhttp://www.huta...",0,1269813866,,0,...,t3_bjf9f,2427,1426257183,3,False,politics,t5_2cneq,,,2010
996,alllie,1.187988e+09,,,t2_2ifsl,Global Research does very good research.,0,1269813864,,0,...,t1_c0n2miy,6008,1426257183,1,False,politics,t5_2cneq,,,2010
997,recreational,1.242535e+09,,,t2_3he5n,The people that were bombed/assassinated in th...,0,1269813861,,0,...,t3_bjeg8,7763,1426257183,1,False,politics,t5_2cneq,,,2010
998,Alanna,1.201642e+09,,,t2_32us3,It's not just about race. It's about being on...,0,1269813858,,0,...,t1_c0n2qbi,2503,1426257183,3,False,politics,t5_2cneq,,,2010


In [31]:
# Label every row of this pull; the value is stored as the string '2020',
# not as an integer.
poli_2020_df['year'] = '2020'

In [32]:
# Show the labelled frame; pandas abbreviates the rows and columns it renders.
poli_2020_df

Unnamed: 0,all_awardings,associated_award,author,author_flair_background_color,author_flair_css_class,author_flair_richtext,author_flair_template_id,author_flair_text,author_flair_text_color,author_flair_type,...,retrieved_on,score,send_replies,stickied,subreddit,subreddit_id,total_awards_received,author_cakeday,distinguished,year
0,[],,bi-partisian-mitch,,,[],,,,text,...,1585433141,1,True,False,politics,t5_2cneq,0,,,2020
1,[],,TheRealboi77,,,[],,,,text,...,1585433140,1,True,False,politics,t5_2cneq,0,,,2020
2,[],,eatdeadjesus,,,[],,,,text,...,1585433138,1,True,False,politics,t5_2cneq,0,,,2020
3,[],,IRULETHISREDDIT,,,[],,,,text,...,1585433138,1,True,False,politics,t5_2cneq,0,,,2020
4,[],,KVFDFAC,,,[],,,,text,...,1585433137,1,True,False,politics,t5_2cneq,0,,,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,[],,LavenderTed,,,[],,,,text,...,1585433841,1,True,False,politics,t5_2cneq,0,,,2020
996,[],,SuzieQ4624,,ohio-flag,"[{'a': ':flag-oh:', 'e': 'emoji', 'u': 'https:...",cfb18876-8e72-11e6-86eb-0e41ffa1e583,:flag-oh: Ohio,dark,richtext,...,1585433841,1,True,False,politics,t5_2cneq,0,,,2020
997,[],,byte_alchemist,,,[],,,,text,...,1585433840,1,True,False,politics,t5_2cneq,0,,,2020
998,[],,MadaMadaDesu,,,[],,,,text,...,1585433840,1,True,False,politics,t5_2cneq,0,,,2020


In [33]:
# Persist the labelled 2010 pull without the index column; ./data must
# already exist.
poli_2010_df.to_csv('./data/poli_2010.csv', index=False)

In [34]:
# Persist the labelled 2020 pull without the index column; ./data must
# already exist.
poli_2020_df.to_csv('./data/poli_2020.csv', index=False)

### Political discussion subreddit

In [38]:
# Ten requests of up to 100 items each from r/PoliticalDiscussion, created
# before 1332973121.
# NOTE(review): 1332973121 is 2012-03-28 UTC, not 2010 -- confirm the cutoff
# and the `_2010` name are both intended.
poli_dis_2010_df = get_posts(10, 'PoliticalDiscussion', 1332973121)

In [39]:
# Ten requests of up to 100 items each from r/PoliticalDiscussion, created
# before 1585433921 (2020-03-28 UTC).
poli_dis_2020_df = get_posts(10, 'PoliticalDiscussion', 1585433921)

In [41]:
# Label each pull with a string-valued `year` column.
# NOTE(review): poli_dis_2010_df was requested with before=1332973121, which
# is 2012-03-28 UTC, so the '2010' label is nominal -- confirm it is intended.
poli_dis_2010_df['year'] = '2010'
poli_dis_2020_df['year'] = '2020'

In [42]:
# Persist both labelled pulls without the index column; ./data must already
# exist.
poli_dis_2010_df.to_csv('./data/poli_dis_2010.csv', index=False)
poli_dis_2020_df.to_csv('./data/poli_dis_2020.csv', index=False)