# Pushshift API

### Required imports:

* python == 3.6.4

* html5lib == 1.0.1
* jupyter == 1.0.0
* matplotlib == 2.1.2
* numpy == 1.14.0
* pandas == 0.22.0
* requests == 2.18.4
* scikit-learn == 0.19.1
* seaborn == 0.8.1

Video: https://youtu.be/AcrjEWsMi_E

In [1]:
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd 
import requests

import seaborn as sns
import time

# Timing demo for the project: pausing between iterations with time.sleep.
# NOTE(review): presumably a warm-up for spacing out repeated API pulls - confirm.
# Prints 0 through 9, waiting 3 seconds after each number.

for i in range(10):
    print(i)
    time.sleep(3) # the argument is a number of seconds

# Pushshift API endpoint used below to search Reddit submissions:

url = 'https://api.pushshift.io/reddit/search/submission'

List of `created_utc` timestamps (UTC epoch seconds) used as the `before` parameter:

'NewsOfTheStupid':
* 1617028769
* 1616964163
* 1616954844
* 1616954642
* 1616952393
* 1616947482
* 1616947296
* 1616943098
* 1616931067

'NewsOfTheWeird':  
* 1617033464 
* 1617031243 
* 1617004237 
* 1616994928
* 1616987479
* 1616957554
* 1616955162
* 1616953971
* 1616924270

In [3]:
# Query-string parameters that narrow the search to the subreddit we want.
# Each value is named first so it is easy to change between pulls.

SUBREDDIT = 'NewsOfTheWeird'   # subreddit to pull submissions from
PAGE_SIZE = 100                # number of posts requested per call
BEFORE_UTC = 1616953971        # only posts created before this UTC epoch time

params = {'subreddit': SUBREDDIT, 'size': PAGE_SIZE, 'before': BEFORE_UTC}

In [4]:
# Send a GET request to the endpoint in `url`; `params` is encoded into the
# URL query string. The response object is stored in `res`.
# A timeout is passed because requests waits indefinitely by default; with it,
# an unresponsive server raises requests.exceptions.Timeout instead of
# hanging the notebook.

res = requests.get(url, params=params, timeout=30)

In [5]:
# Check the HTTP status code of the response - it should be 200 when the request succeeded

res.status_code

200

In [6]:
# Uncomment this to display the response content parsed from JSON into a dictionary

#res.json()

In [7]:
# Parse the JSON body of the response and keep the result in `data`
data = res.json()

In [8]:
# Looking at data we can see that it's a dictionary whose 'data' key holds the returned posts (up to 'size' of them; 25 if 'size' is not set)

#data['data']

In [9]:
# data['data'] is a list of posts, so give it a shorter name:

posts = data['data']

In [10]:
# So we don't keep drawing the same posts, each new request needs a different
# 'before' value. These values are UTC epoch integers, which can be translated
# to and from readable dates using

# https://www.epochconverter.com/

# Look at the first post and grab its 'created_utc' to use as the next 'before' param:

posts[0]

{'all_awardings': [],
 'allow_live_comments': False,
 'author': 'slowslide69',
 'author_flair_css_class': None,
 'author_flair_richtext': [],
 'author_flair_text': None,
 'author_flair_type': 'text',
 'author_fullname': 't2_84i1tgik',
 'author_patreon_flair': False,
 'author_premium': False,
 'awarders': [],
 'can_mod_post': False,
 'contest_mode': False,
 'created_utc': 1616924270,
 'domain': 'dazeddigital.com',
 'full_link': 'https://www.reddit.com/r/NewsOfTheWeird/comments/meyav3/this_wild_election_manifesto_promises_free_trips/',
 'gildings': {},
 'id': 'meyav3',
 'is_crosspostable': True,
 'is_meta': False,
 'is_original_content': False,
 'is_reddit_media_domain': False,
 'is_robot_indexable': True,
 'is_self': False,
 'is_video': False,
 'link_flair_background_color': '',
 'link_flair_richtext': [],
 'link_flair_text_color': 'dark',
 'link_flair_type': 'text',
 'locked': False,
 'media_only': False,
 'no_follow': True,
 'num_comments': 0,
 'num_crossposts': 0,
 'over_18': False,


In [11]:
# Figure out the number of posts returned from the Pushshift API ('size' param default: 25)

#len(posts)

In [12]:
# Since we got a list of dictionaries back, it can be turned directly into a pandas DataFrame

df = pd.DataFrame(posts)

In [13]:
# All the keys from the post dictionaries become the column headers; each row holds
# the corresponding values from the JSON for one post

df.head()

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_patreon_flair,author_premium,...,url_overridden_by_dest,whitelist_status,wls,removed_by_category,media,media_embed,secure_media,secure_media_embed,author_flair_background_color,author_flair_text_color
0,[],False,slowslide69,,[],,text,t2_84i1tgik,False,False,...,https://www.dazeddigital.com/politics/article/...,all_ads,6,,,,,,,
1,[],False,lnhax_com,,[],,text,t2_1uzw3jfi,False,False,...,https://biggboss14.tellygupshup.com/after-rakh...,all_ads,6,reddit,,,,,,
2,[],False,story20teller,,[],,text,t2_52rzfsoq,False,False,...,https://www.albawaba.com/node/curse-pharaohs-i...,all_ads,6,,,,,,,
3,[],False,young_stock_trader,,[],,text,t2_84ugbss4,False,False,...,https://www.dailydot.com/unclick/ben-10-propos...,all_ads,6,,,,,,,
4,[],False,tyw7,,[],,text,t2_28tho63f,False,True,...,https://www.dorsetecho.co.uk/news/19187703.thi...,all_ads,6,,,,,,,


In [14]:
# Important columns to help build out our model: what subreddit the post came from,
# plus the post's 'selftext' and 'title'

df[['subreddit', 'selftext', 'title']].head()

Unnamed: 0,subreddit,selftext,title
0,NewsOfTheWeird,,This wild election manifesto promises free tri...
1,NewsOfTheWeird,,"After Rakhi Sawant, Arshi Khan to have a Swaya..."
2,NewsOfTheWeird,,Egypt: The Curse of the Pharaohs Is Coming Tru...
3,NewsOfTheWeird,,Brutal video shows man getting rejected after ...
4,NewsOfTheWeird,,Thieves sabotage Covid vaccination clinic for ...


# Save df as a csv under ./data/
# NOTE(review): the file name says 'stupid' while the params cell in this notebook
# requests 'NewsOfTheWeird' - confirm the name matches the subreddit that was pulled.
# NOTE(review): the row index is written as well (pandas default), so reading the
# file back yields an extra unnamed first column - confirm this is intended.

df.to_csv('./data/stupid_10.csv')