In [4]:
import requests
import pandas as pd
# https://stackoverflow.com/questions/14742089/time-a-while-loop-python
# for timing loops
import time
import datetime

___
### Get Reddit Posts from Subreddits


In [3]:
def pull_reddit_posts(subreddits:set):
    """
    This function is designed to take in a set of subreddits, connect to the pushshift api,
    and pull all submissions by setting a lowerbound of choice as start_utc. Utilizes a while
    loop to get posts using requests and has checks inside for:
        1. If data is not in the response json
        2. If the last post of the most recent pull is less than the lower bounds
    
    Uses the last observation/row from the most recent get call to set the before filter parameter.
    
    Once all posts have been collected and/or the loop breaks, save all posts as a dataframe,
    and save to csv in project data folder.

    """
    posts = []

    for subreddit in subreddits:
        print("this is subreddit", subreddit)
        # save URL based on current subreddit
        url = f"https://api.pushshift.io/reddit/search/submission/?subreddit={subreddit}&size=500"
        # set date as lower bounds, see if we can limit calls by only pulling since 2015
        start_utc = datetime.datetime(2015, 1, 1).timestamp()
        print("start_utc", start_utc)
        while True:
            start_time = time.time()
            # call requests, save response
            response = requests.get(url)
            # save json as data
            data = response.json()
            # if data key not in response, break loop
            if len(data['data']) == 0:
                break
            # else, add data to posts list
            posts += data['data']
            print("posts length", len(posts))
            # pull last post in most recent data pull
            last_post = data['data'][-1]
            # save created_utc for gate and next url
            created_utc = last_post['created_utc']
            print("created_utc", created_utc)
            # current last post utc less than lower bounds, break loop
            if created_utc < start_utc:
                break
            # if loop hasn't broken, set url for next pull passing created_utc as new 'before' parameter
            url = f"https://api.pushshift.io/reddit/search/submission/?subreddit={subreddit}&size=500&before={created_utc}"
            end_time = time.time()
            print("total time taken this loop: ", end_time-start_time)

    df = pd.DataFrame(posts)
    df.to_csv(f'../data/{"_".join(set((subreddits)))}.csv')

In [4]:
pull_reddit_posts(set(("NVIDIA","AMD")))

this is subreddit NVIDIA
start_utc 1420088400.0
posts length 500
created_utc 1679305849
total time taken this loop:  1.5016908645629883
posts length 999
created_utc 1678763598
total time taken this loop:  2.027661085128784
posts length 1493
created_utc 1678141265
total time taken this loop:  2.4184648990631104
posts length 1993
created_utc 1677633429
total time taken this loop:  2.1692891120910645
posts length 2492
created_utc 1677025852
total time taken this loop:  2.0452969074249268
posts length 2991
created_utc 1676441057
total time taken this loop:  2.670185089111328
posts length 3491
created_utc 1675964493
total time taken this loop:  2.151883602142334
posts length 3989
created_utc 1675528359
total time taken this loop:  2.5647170543670654
posts length 4488
created_utc 1675054489
total time taken this loop:  2.6114439964294434
posts length 4986
created_utc 1674565845
total time taken this loop:  2.255218982696533
posts length 5486
created_utc 1674065944
total time taken this loop:

In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,subreddit,selftext,author_fullname,gilded,title,link_flair_richtext,subreddit_name_prefixed,hidden,pwls,...,saved,selftext_html,ups,user_reports,brand_safe,approved_at_utc,banned_at_utc,from_kind,from_id,from
0,0,nvidia,[removed],t2_7ryvc7f3e,0.0,Need advice about 4090,"[{'e': 'text', 't': 'Question'}]",r/nvidia,False,6.0,...,,,,,,,,,,
1,1,nvidia,Which should I use and are they both safe? [Pr...,t2_s6iy0sui,0.0,NvidiaProfileInspector or NvidiaInspector?,"[{'e': 'text', 't': 'Question'}]",r/nvidia,False,6.0,...,,,,,,,,,,
2,2,nvidia,before march fifth my vids looked like this \n...,t2_7yc3wcn9,0.0,shadowplay is wierdpease help me,"[{'e': 'text', 't': 'Question'}]",r/nvidia,False,6.0,...,,,,,,,,,,
3,3,nvidia,,t2_3gkjrjff,0.0,New CableMod cable came in! Does the 16pin ada...,"[{'e': 'text', 't': 'Discussion'}]",r/nvidia,False,6.0,...,,,,,,,,,,
4,4,nvidia,CPU: Intel Core i7-8700 3.20GHz\nRAM: 16GB\nGP...,t2_tjzwh4fi,0.0,Nvidia Gefore GTX 1060 6gb replacement options?,"[{'e': 'text', 't': 'Question'}]",r/nvidia,False,6.0,...,,,,,,,,,,


In [10]:
df.shape

(408942, 126)