In [1]:
import pandas as pd 
import datetime
import csv
import os 
import requests 
import datetime as dt
import time

# Exercise

For your exercise do the following:

1. Choose a reddit page you want to crawl
2. The following fields should be present when you crawl **(10 points)**:
    - author
    - subreddit
    - date created 
    - number of comments 
    - score
    - submission title 
    - submission description
3. After crawling, save your results to a pandas dataframe **(3 points)**. 
4. Answer the following questions **(12 points)**:
    - How many submissions were you able to gather? 
    - Who has the most submissions? 
    - Which submission has the highest score? 
    - Which submission has the highest number of comments?
    - Which day of the week has the most submissions? 
    
**Tip:** _For item #4, recall how to use the aggregation functions in `pandas`, such as `count`, `value_counts`, and `sum`. To get the day of the week, look into how to get the `dayofweek` from a datetime object in `pandas`. (Hint: You may need to use `pd.to_datetime` to convert your date column first.)_

In [2]:
def utc_to_unix(date):
    '''Return the UNIX epoch (whole seconds) for `date`, reading its clock fields as UTC.

    Any tzinfo already attached to `date` is overwritten with UTC (not converted),
    so the wall-clock values are kept exactly as given. Fractional seconds are
    truncated by the final int() conversion.
    '''
    stamped_as_utc = date.replace(tzinfo=dt.timezone.utc)
    epoch_seconds = stamped_as_utc.timestamp()
    return int(epoch_seconds)

def unix_to_utc(unix):
    '''Converts a UNIX epoch to a UTC timestamp string.

    Parameters:
        unix: seconds since 1970-01-01 00:00:00 UTC (int or float).

    Returns:
        The corresponding UTC date and time formatted as 'YYYY-MM-DD HH:MM:SS'.
    '''
    # Build a timezone-aware datetime instead of calling the deprecated
    # datetime.utcfromtimestamp(); the formatted text is the same.
    return dt.datetime.fromtimestamp(unix, tz=dt.timezone.utc).strftime('%Y-%m-%d %H:%M:%S')
# Prepare the API call
url = "https://api.pushshift.io/reddit/submission/search/"  # submission search endpoint
subreddit = 'ProRevenge' # /r/ProRevenge subreddit
fields = ['author', 'subreddit','created_utc','num_comments','score','title','selftext'] # required fields as per exercise instructions
sort_type = 'created_utc'
sort = 'asc' # ascending, so the last item of each page is the newest one fetched so far
size = 500 # requested page size; the recorded run returned 100 results per request

# Declare start and end of reddit posts to extract
start_date = dt.datetime.strptime("2019-01-01", "%Y-%m-%d")
end_date = dt.datetime.strptime("2020-01-01", "%Y-%m-%d")

URL = url  # single source of truth for the endpoint; both names are kept available
PARAMS = {
    # one second before the start so a post created exactly at 2019-01-01 00:00:00 UTC
    # is not dropped (assumes 'after' is an exclusive bound -- TODO confirm)
    'after': utc_to_unix(start_date)-1,
    'before': utc_to_unix(end_date), # get dates before Jan 1, 2020 UTC
    'sort_type': sort_type, # sort by created_utc
    'sort': sort, # sort in ascending order (oldest first)
    'subreddit': subreddit, # do a search on ProRevenge subreddit
    'size': size, # ask for up to 500 results per request
    'fields': fields # return only the following fields
}

In [3]:
# Accumulates every submission dict returned by the API
results = []

# Page through a copy of the query so PARAMS keeps its original window;
# this cell can then be re-run on its own and still start from the beginning.
query = dict(PARAMS)

# Keep requesting pages until the 'after' cursor reaches the end of the date range
while query['after'] < query['before']:
    # timeout stops a stalled connection from hanging the notebook forever
    response = requests.get(url=URL, params=query, timeout=30)
    # fail loudly on HTTP errors (e.g. rate limiting) instead of on an obscure JSON/KeyError
    response.raise_for_status()

    # parse the body once and reuse it for every check below
    batch = response.json()['data']
    if not batch:
        break

    results.extend(batch)

    # results are sorted ascending, so the last item carries the newest timestamp;
    # use it as the cursor for the next page
    last_created = batch[-1]['created_utc']
    query['after'] = last_created
    print('Done until {} - Result Size {} - Total Results Size {}'.format(unix_to_utc(last_created), len(batch), len(results)))
    time.sleep(1)  # pause between requests to stay polite to the API

Done until 2019-01-15 10:14:02 - Result Size 100 - Total Results Size 100
Done until 2019-01-30 00:07:07 - Result Size 100 - Total Results Size 200
Done until 2019-02-13 15:05:33 - Result Size 100 - Total Results Size 300
Done until 2019-02-17 09:05:14 - Result Size 100 - Total Results Size 400
Done until 2019-02-18 19:31:47 - Result Size 100 - Total Results Size 500
Done until 2019-02-19 23:36:57 - Result Size 100 - Total Results Size 600
Done until 2019-02-21 02:00:10 - Result Size 100 - Total Results Size 700
Done until 2019-02-21 23:38:23 - Result Size 100 - Total Results Size 800
Done until 2019-02-23 04:46:35 - Result Size 100 - Total Results Size 900
Done until 2019-02-24 03:13:30 - Result Size 100 - Total Results Size 1000
Done until 2019-02-24 22:18:53 - Result Size 100 - Total Results Size 1100
Done until 2019-02-25 20:38:57 - Result Size 100 - Total Results Size 1200
Done until 2019-02-26 19:03:11 - Result Size 100 - Total Results Size 1300
Done until 2019-02-27 18:56:43 - R

In [4]:
# Flatten the list of submission dicts into a tabular DataFrame
df = pd.json_normalize(data=results)
# Keep a copy on disk so the crawl does not need to be repeated
df.to_csv(path_or_buf='reddit_scrape.csv', index=False)

In [5]:
# Reload the saved crawl from disk (the analysis was resumed in a later session)
df = pd.read_csv(filepath_or_buffer='reddit_scrape.csv')
# Preview the first five rows
df.head(n=5)

Unnamed: 0,author,created_utc,num_comments,score,selftext,subreddit,title
0,pijgerk,1546308387,22,1,I couldn't stand Justin. My father did not lik...,ProRevenge,Ruin most of my childhood by bullying me with ...
1,sharknadotornado,1546312716,11,1,"There is so much to this story, but I'll try t...",ProRevenge,Neighbor makes the mistake of believing he can...
2,Shablagoo-,1546353771,3,1,https://www.reddit.com/r/legaladvice/comments/...,ProRevenge,Redditor tries to file a complaint against a p...
3,snackerjack7331,1546354687,68,1,"This is an x-post from r/confession, a few peo...",ProRevenge,I scrubbed the toilet with my stepmom’s toothb...
4,TheMightyAddicted,1546361231,55,1,"So this history is not pro-revenge, but i thin...",ProRevenge,Trying to get high as a kite? Enjoy the trip


### Answer the following questions (12 points):
* How many submissions were you able to gather?
* Who has the most submissions?
* Which submission has the highest score?
* Which submission has the highest number of comments?
* Which day of the week has the most submissions?

In [6]:
# How many submissions were you able to gather?
# Each row of df is one crawled submission, so the row count is the answer.
total_submissions = df.shape[0]
summary_line = 'Total 2019 Submissions: {}'.format(total_submissions)
print(summary_line)

Total 2019 Submissions: 10875


In [7]:
# Who has the most submissions?
# value_counts() sorts by frequency (descending), so the first row is the top author.
# The columns are named explicitly because reset_index() on a value_counts() result
# yields different column names across pandas versions ('index' before 2.0, the
# original column name from 2.0 on), which made the old ['index'] lookup fragile.
most_active = (
    df['author']
    .value_counts()
    .head(2)
    .rename_axis('author')
    .reset_index(name='submissions')
)
print('Author with most submissions in 2019: {}'.format(most_active['author'].iloc[0]))

Author with most submissions in 2019: Ford456fgfd


In [8]:
# Which submission has the highest score?
# Rank every submission by score (highest first) and renumber the rows from 0.
ranked_by_score = df.sort_values(by=['score'], ascending=False, ignore_index=True)
top_submission = ranked_by_score.iloc[:1]  # one-row frame holding the best-scoring post
print('The top submission of 2019 in the ProRevenge subreddit is: \n\n{} by {}'.format(
    top_submission.loc[0, 'title'],
    top_submission.loc[0, 'author'],
))

The top submission of 2019 in the ProRevenge subreddit is: 

Coworker tried to get me fired over breast implants, so I pulled a reverse uno card. by 3240278189


In [9]:
# Which submission has the highest number of comments?
# Sort by comment count (highest first); ignore_index renumbers rows so label 0 is the winner.
top_commented = df.sort_values(by=['num_comments'],ascending=False,ignore_index=True).head(1)
# The label says "most commented" so this answer is not mistaken for the
# highest-score result, whose message this line was originally copied from.
print('The most commented submission of 2019 in the ProRevenge subreddit is: \n\n{} by {}'.format(top_commented['title'][0],top_commented['author'][0]))

The top submission of 2019 in the ProRevenge subreddit is: 

Boyfriend of 5 years cheated on me so I ruined his precious RuneScape account by osrsbitch19


In [10]:
# Which day of the week has the most submissions?
# created_utc holds UNIX epoch seconds; convert it and take the weekday name
# (the weekday is therefore in UTC, not in any poster's local time).
df['dayname'] = pd.to_datetime(df['created_utc'], unit='s').dt.day_name() # creates dayname column
# The columns are named explicitly because reset_index() on a value_counts() result
# yields different column names across pandas versions ('index' before 2.0, the
# original column name from 2.0 on), which made the old ['index'] lookup fragile.
most_active_day = (
    df['dayname']
    .value_counts()
    .head(2)
    .rename_axis('dayname')
    .reset_index(name='submissions')
)
print('Most active day in 2019: {}'.format(most_active_day['dayname'].iloc[0]))

Most active day in 2019: Monday
