# Project 3: Web APIs & Classification
## 01. Reddit Scraper
Resources referenced to help create scraper:
- [pushshift.io API Documentation](https://github.com/pushshift/api)
- [Reddit thread: How would I get all user's comments?](https://www.reddit.com/r/pushshift/comments/9xh1b1/how_would_i_get_all_users_comments/)
- [How to scrape Reddit with Python](http://www.storybench.org/how-to-scrape-reddit-with-python/) (To set up timestamp)

In [1]:
# Import libraries
import requests
import time
import pandas as pd
import datetime as dt

In [2]:
# Select subreddit to scrape
subreddit = 'TheOnion'

In [3]:
# Set up dict for info to collect
posts_data = {'created_utc':[],
              'url':[],
              'id':[],
              'num_comments':[],
              'title':[],
              'subreddit':[]
              }

headers = {'User-agent': 'Reddit Post Collector'}

# Set up function to return submission data
def get_submissions(**kwargs):
    res = requests.get("https://api.pushshift.io/reddit/submission/search/",
                       params=kwargs,
                       headers=headers)
    if res.status_code == 200:
        data = res.json()
        return data['data']
    else:
        print(res.status_code)

before = None
count = 0

# Collect up to 10,000 posts as long as there are posts to collect
while True and len(set(posts_data['created_utc'])) < 9500:
    print(count)
    count += 1*500
    
    posts = get_submissions(subreddit=subreddit,
                            size=500,
                            before=before,
                            sort='desc',
                            sort_type='created_utc')
    if not posts:
        break

    for post in posts:
        # Keep track of position for the next call in while loop
        before = post['created_utc']

        # Append info to posts_data dict
        posts_data['created_utc'].append(post['created_utc'])
        posts_data['url'].append(post['full_link'])
        posts_data['id'].append(post['id'])
        posts_data['num_comments'].append(post['num_comments'])
        posts_data['title'].append(post['title'])
        posts_data['subreddit'].append(post['subreddit'])

    time.sleep(1)

# Save posts to dataframe
posts_data = pd.DataFrame(posts_data)

# Create `timestamp` column with `created_utc` translated into readable time
def get_date(created):
    return dt.datetime.fromtimestamp(created)

_timestamp = posts_data['created_utc'].apply(get_date)
posts_data = posts_data.assign(timestamp = _timestamp)

# Export to csv
filetime = time.strftime("%y%m%d_%H%M%S", time.localtime())
posts_data.to_csv('./data/{}_{}.csv'.format(subreddit, filetime), index=False)

0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500


In [4]:
posts_data.head()

Unnamed: 0,created_utc,url,id,num_comments,title,subreddit,timestamp
0,1545089481,https://www.reddit.com/r/TheOnion/comments/a75...,a75a2d,0,Maria Butina Pleads Guilty To Russian Scheme T...,TheOnion,2018-12-17 18:31:21
1,1545083658,https://www.reddit.com/r/TheOnion/comments/a74...,a74ecp,0,Drew Brees said WHAT?!,TheOnion,2018-12-17 16:54:18
2,1545082273,https://www.reddit.com/r/TheOnion/comments/a74...,a746b4,0,Ryan Zinke Apologizes For Misuse Of Government...,TheOnion,2018-12-17 16:31:13
3,1545075093,https://www.reddit.com/r/TheOnion/comments/a73...,a730yh,0,Report: Re-mixxxx!,TheOnion,2018-12-17 14:31:33
4,1545074578,https://www.reddit.com/r/TheOnion/comments/a72...,a72xu7,0,Trump Administration Launches Human Rights Inv...,TheOnion,2018-12-17 14:22:58


In [5]:
posts_data.shape

(10000, 7)