# Hackathon Data Gathering

In [1]:
import pandas as pd
import datetime as dt
import time
import requests

In [16]:
# adapted from Mahdi Shadkam-Farrokhi's project 3 Intro Lesson

def query_pushshift(subreddit, query_list, kind = 'comment', day_window = 60): # 30 day
    SUBFIELDS = ['author', 'body', 'created_utc', 'subreddit', 'permalink']
    
    # establish base url and stem
    BASE_URL = f"https://api.pushshift.io/reddit/search/{kind}" # also known as the "API endpoint" 
    stem1 = f"{BASE_URL}?q="
    stem2 = f"&subreddit={subreddit}&size=500&after={day_window}d" # always pulling max of 500
    
    # instantiate empty list for temp storage
    posts = []
    
    # implement for loop with `time.sleep(2)`
    for query in query_list:
        URL = "{}{}{}".format(stem1, query, stem2)
        print("Querying from: " + URL)
        response = requests.get(URL)
        assert response.status_code == 200
        mine = response.json()['data']
        df = pd.DataFrame.from_dict(mine)
        posts.append(df)
        time.sleep(2)
    
    # pd.concat storage list
    full = pd.concat(posts, sort=False)
    
    # if submission
    if kind == "comment":
        # select desired columns
        full = full[SUBFIELDS]
        # drop duplicates
        full.drop_duplicates(inplace = True)
        

    # create `timestamp` column
    full['timestamp'] = full["created_utc"].map(dt.date.fromtimestamp)
    
    print("Query Complete!")    
    return full 

In [23]:
queries = {'event','sports','safe','safety','attend','game','concert','feel','fan','protocols','measures','concern','danger','dangerous','unsafe','social', 'unsafe'}

In [24]:
events_df = query_pushshift('coronavirus',queries)

Querying from: https://api.pushshift.io/reddit/search/comment?q=measures&subreddit=coronavirus&size=500&after=60d
Querying from: https://api.pushshift.io/reddit/search/comment?q=safe&subreddit=coronavirus&size=500&after=60d
Querying from: https://api.pushshift.io/reddit/search/comment?q=game&subreddit=coronavirus&size=500&after=60d
Querying from: https://api.pushshift.io/reddit/search/comment?q=event&subreddit=coronavirus&size=500&after=60d
Querying from: https://api.pushshift.io/reddit/search/comment?q=attend&subreddit=coronavirus&size=500&after=60d
Querying from: https://api.pushshift.io/reddit/search/comment?q=danger&subreddit=coronavirus&size=500&after=60d
Querying from: https://api.pushshift.io/reddit/search/comment?q=dangerous&subreddit=coronavirus&size=500&after=60d
Querying from: https://api.pushshift.io/reddit/search/comment?q=unsafe&subreddit=coronavirus&size=500&after=60d
Querying from: https://api.pushshift.io/reddit/search/comment?q=social&subreddit=coronavirus&size=500&af

In [22]:
events_df

Unnamed: 0,author,body,created_utc,subreddit,permalink,timestamp
0,Crossx1x,"Boi, many people who are in the school staff a...",1590177149,Coronavirus,/r/Coronavirus/comments/goh9o9/schools_row_war...,2020-05-22
1,DjMagicTouch,You're thinking in extremes man. Just like eve...,1590177465,Coronavirus,/r/Coronavirus/comments/gojqi1/daily_discussio...,2020-05-22
2,lisaseileise,"No, what is so hard to understand here? It‘s j...",1590177616,Coronavirus,/r/Coronavirus/comments/gonje6/the_coronavirus...,2020-05-22
3,jeopardy987987,"it's killed nearly 100,000 in the US in just a...",1590177796,Coronavirus,/r/Coronavirus/comments/gokm4t/a_majority_of_a...,2020-05-22
4,gp_dude,You are living in an alternate reality if you ...,1590177806,Coronavirus,/r/Coronavirus/comments/goql7x/sweden_escapes_...,2020-05-22
...,...,...,...,...,...,...
95,BigBrownBearCub,Wife and I both wear masks at ALL times outsid...,1590189592,Coronavirus,/r/Coronavirus/comments/gorhkq/majority_of_ame...,2020-05-22
96,denali12,That's what I do! That makes me feel less guilty.,1590189621,Coronavirus,/r/Coronavirus/comments/gojqi1/daily_discussio...,2020-05-22
97,charlami,I'm a truck driver and I cook what I can in th...,1590189751,Coronavirus,/r/Coronavirus/comments/gokm4t/a_majority_of_a...,2020-05-22
98,lpukas2,My YMCA opened up the cardio and free weight a...,1590189794,Coronavirus,/r/Coronavirus/comments/gokm4t/a_majority_of_a...,2020-05-22


In [25]:
# write to CSV 
events_df.to_csv('./data/events_reddit.csv', index=False)