# 1 - Project 3 Data Collection
This is the first of a series of notebooks for this project.

Note that subsequent project notebooks will refer to this notebook.  Because running this notebook collects data for a fixed point in time, and could potentially overwrite data collected previously, it is treated as a stand-alone notebook.  The pull timestamps are written to a text file alongside the data so that this code can be modified to duplicate the run shown in this notebook.

In [3]:
import requests
import pandas as pd

# Source for getting the current Unix timestamp for the pushshift API:
#  https://stackoverflow.com/questions/16755394/what-is-the-easiest-way-to-get-current-gmt-time-in-unix-timestamp-format
import time

In [33]:
def data_getter(subreddit, trials, name):
    """Download recent submissions from a subreddit via Pushshift and save them.

    Makes up to ``trials`` requests against the Pushshift submission search
    endpoint. The first request asks for the newest submissions; each later
    request asks for submissions created before the oldest one seen so far.
    All pages are combined into one dataframe with a clean 0..n-1 index.

    Parameters
    ----------
    subreddit : str
        Name of the subreddit to query (sent as the ``subreddit`` parameter).
    trials : int
        Maximum number of requests to make. At least one request is always
        made; paging stops early if a request returns no rows.
    name : str
        Prefix used for the two output files.

    Returns
    -------
    pandas.DataFrame
        The first five rows of the combined dataframe.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.

    Side effects
    ------------
    Writes ``../data/{name}_pulltimes_{timestamp}.txt`` (the list of cutoff
    times: the start time of the run followed by the ``before`` value sent
    with each follow-up request) and ``../data/{name}_{timestamp}.csv`` (the
    combined data), and prints the HTTP status code of every request.
    """
#=====  INITIAL LOCAL VARIABLES  =================================================================================

    # Establish base url:
    url_fnc = 'https://api.pushshift.io/reddit/search/submission'

    # Current time stamps the output file names and is the first logged pull time
    right_now = round(time.time())
    pull_times = [right_now]

    # Pages are collected in a list and concatenated once at the end
    pages = []

    # Cutoff for the next request; None means "newest submissions"
    before = None

#=====  Gather the data from reddit/pushshift, one page per trial  ================================================
    for _ in range(max(trials, 1)):
        params_getter = {
            'subreddit': subreddit,
            'size': 1000,
        }
        if before is not None:
            params_getter['before'] = before
            # Log the cutoff actually sent, so the run can be replayed request by request
            pull_times.append(before)

        # Get the data (a timeout keeps a stalled connection from hanging the notebook):
        res_fnc = requests.get(url_fnc, params_getter, timeout=60)
        print(res_fnc.status_code) # for debugging
        res_fnc.raise_for_status()

        # Pull the 'data' list out of the json and make a dataframe:
        page = pd.DataFrame(res_fnc.json()['data'])
        if page.empty:
            # Nothing older is available, so further requests would return nothing new
            break
        pages.append(page)

        # `before` filters on creation time, so page on created_utc; paging on the later
        # retrieved_utc would re-fetch the tail of the page we already have.
        before = int(page['created_utc'].min())

#=====  Combine the pages; ignore_index gives one continuous 0..n-1 index  ========================================
    master_df = pd.concat(pages, ignore_index=True) if pages else pd.DataFrame()

#=====  Create a text file with all the pull times for replicability  =============================================
    # Source inspiring this code:  https://www.guru99.com/reading-and-writing-files-in-python.html
    with open(f'../data/{name}_pulltimes_{right_now}.txt', "w") as f:
        f.write(f'{pull_times}')

#=====  Finally, store the combined dataframe to csv and return a preview  ========================================
    master_df.to_csv(f'../data/{name}_{right_now}.csv')
    return master_df.head()

In [34]:
# Run six pulls against r/theonion; output files are prefixed with "theonion".
data_getter(subreddit='theonion', trials=6, name='theonion')

200
200
200
200
200
200


Unnamed: 0,subreddit,selftext,author_fullname,gilded,title,link_flair_richtext,subreddit_name_prefixed,hidden,pwls,link_flair_css_class,...,preview,is_gallery,media_metadata,gallery_data,crosspost_parent_list,crosspost_parent,author_created_utc,retrieved_on,call_to_action,author_cakeday
0,TheOnion,,t2_3jamc,0,Coy Biden Appears Nude Behind Folding Fan To T...,[],r/TheOnion,False,6,,...,,,,,,,,,,
1,TheOnion,,t2_3jamc,0,Authorities Report Increase In Threats From An...,[],r/TheOnion,False,6,,...,{'images': [{'source': {'url': 'https://extern...,,,,,,,,,
2,TheOnion,,t2_3jamc,0,Midwestern Woman Can’t Believe It’s Snowing Wh...,[],r/TheOnion,False,6,,...,{'images': [{'source': {'url': 'https://extern...,,,,,,,,,
3,TheOnion,,t2_3jamc,0,Quiz: Could You Pass The SpaceX Aptitude Test?,[],r/TheOnion,False,6,,...,{'images': [{'source': {'url': 'https://extern...,,,,,,,,,
4,TheOnion,,t2_3jamc,0,Mark Cuban Rejects ‘Shark Tank’ Pitch To Make ...,[],r/TheOnion,False,6,,...,{'images': [{'source': {'url': 'https://extern...,,,,,,,,,


In [35]:
# Run six pulls against r/worldnews; output files are prefixed with "worldnews".
data_getter(subreddit='worldnews', trials=6, name='worldnews')

200
200
200
200
200
200


Unnamed: 0,subreddit,selftext,author_fullname,gilded,title,link_flair_richtext,subreddit_name_prefixed,hidden,pwls,link_flair_css_class,...,is_video,retrieved_utc,updated_utc,utc_datetime_str,post_hint,preview,crosspost_parent_list,crosspost_parent,author_cakeday,link_flair_template_id
0,worldnews,,t2_9q0nrrux9,0,See adorable cheetah cubs born in India for th...,[],r/worldnews,False,6,,...,False,1682118700,1682118700,2023-04-21 23:11:28,,,,,,
1,worldnews,,t2_vuvthues,0,Australia announces pathway to citizenship for...,[],r/worldnews,False,6,,...,False,1682118507,1682118507,2023-04-21 23:08:10,link,{'images': [{'source': {'url': 'https://extern...,,,,
2,worldnews,,t2_5axjiycc,0,Lachlan Murdoch drops defamation proceedings a...,"[{'e': 'text', 't': 'Not Appropriate Subreddit'}]",r/worldnews,False,6,normal,...,False,1682118429,1682118430,2023-04-21 23:06:53,link,{'images': [{'source': {'url': 'https://extern...,"[{'approved_at_utc': None, 'subreddit': 'frien...",t3_12tn6uu,,
3,worldnews,,t2_9q0nrrux9,0,SE Cupp says 'no one should care' about GOP ou...,"[{'e': 'text', 't': 'US internal politics'}]",r/worldnews,False,6,normal,...,False,1682118329,1682118329,2023-04-21 23:05:18,,,,,,
4,worldnews,,t2_dkucn,0,US Supreme Court preserves abortion drug access,"[{'e': 'text', 't': 'US internal news'}]",r/worldnews,False,6,normal,...,False,1682118257,1682118257,2023-04-21 23:04:01,link,{'images': [{'source': {'url': 'https://extern...,,,,
