# Imports

In [1]:
import requests
import pandas as pd
import json
import datetime
import time

# Scraping Function Prep

In [7]:
# Subreddits to scrape, given as plain name strings (no "/r/" prefix).

sr = [
    'nosleep',
    'LetsNotMeet',
]

In [3]:
# Dates fed to the scraping function as MM/DD/YYYY strings, newest first:
# the 30th of every month (the 28th for February) from July 2021 back to
# January 2017. The scraping function looks back 30.44 days (one month)
# from each of these dates.

dates = [
    f"{month:02d}/{28 if month == 2 else 30}/{year}"
    for year in range(2021, 2016, -1)
    for month in range(12, 0, -1)
    if (year, month) <= (2021, 7)  # 2021 only runs through July
]

# Time Conversion and Scraping Functions

In [4]:
# convert 'human time' to epoch time

def convert_date(date_string):
    """Convert a ``MM/DD/YYYY`` date string to whole seconds since the epoch.

    The string is parsed into a naive ``datetime`` at midnight, so the
    resulting timestamp is computed relative to the local timezone of the
    machine running the notebook.

    Parameters
    ----------
    date_string : str
        Date formatted as ``MM/DD/YYYY`` (e.g. ``'07/30/2021'``).

    Returns
    -------
    int
        Epoch timestamp, truncated to an integer number of seconds.

    Raises
    ------
    ValueError
        If ``date_string`` does not match the ``%m/%d/%Y`` format.
    """
    parsed = datetime.datetime.strptime(date_string, "%m/%d/%Y")
    return int(parsed.timestamp())

In [5]:
# create pushshift url using subreddit of choice and list of dates (function moves in monthly intervals)
# scrape subreddits, create count of scraped posts, create dataframe of subreddit data

def requesting(subreddit_list, dates_list):
    """Scrape Pushshift submissions for each subreddit/date pair and save them.

    For every subreddit in ``subreddit_list`` and every date in
    ``dates_list``, one request is made for submissions posted in the
    window that ends at that date and starts one month earlier. All
    successful responses are combined into a single DataFrame and written
    to ``./data/total_subreddit_data.csv``.

    Parameters
    ----------
    subreddit_list : iterable of str
        Subreddit names without the ``/r/`` prefix.
    dates_list : iterable of str
        Window end dates in ``MM/DD/YYYY`` format, as accepted by
        ``convert_date``.

    Returns
    -------
    None
        Progress and the total post count are printed; the data is written
        to disk. If no request succeeded, nothing is written.
    """
    base_url = 'https://api.pushshift.io/reddit/search/submission'
    month_seconds = 2629743  # length of one look-back window (~30.44 days)
    subs_data = []
    posts_count = 0

    # Iterate over the arguments themselves, not the notebook-level globals,
    # so the function scrapes what the caller actually asked for.
    for sub in subreddit_list:
        for date in dates_list:
            convert = convert_date(date)
            # NOTE(review): the API may cap `size` below 5000 -- confirm that
            # one request really returns every post in the window.
            url_adds = f'{base_url}?subreddit={sub}&before={convert}&after={convert - month_seconds}&size=5000'
            req = requests.get(url_adds)

            succeeded = req.status_code == 200
            if succeeded:
                print(f'STATUS CODE: {req.status_code}')
            else:
                print('ERROR')

            print(f'Currently working through /r/{sub}, {date}')

            # Only parse the body of a successful response; a failed request
            # is reported above and skipped instead of aborting the whole run.
            if succeeded:
                scrape = req.json()['data']
                subs_data.append(pd.DataFrame(scrape))
                posts_count += len(scrape)

            # Pause between requests to stay gentle on the API.
            time.sleep(2)

    print('Total number of posts collected:', posts_count)

    # pd.concat raises ValueError on an empty list, and overwriting the CSV
    # with nothing would be worse than leaving any earlier file alone.
    if not subs_data:
        return

    all_data = pd.concat(subs_data)
    all_data.to_csv('./data/total_subreddit_data.csv', index=False)

# Scraping

In [6]:
# Run the scrape over every subreddit in `sr` and every date in `dates`.
requesting(subreddit_list=sr, dates_list=dates)

STATUS CODE: 200
Currently working through /r/nosleep, 07/30/2021
STATUS CODE: 200
Currently working through /r/nosleep, 06/30/2021
STATUS CODE: 200
Currently working through /r/nosleep, 05/30/2021
STATUS CODE: 200
Currently working through /r/nosleep, 04/30/2021
STATUS CODE: 200
Currently working through /r/nosleep, 03/30/2021
STATUS CODE: 200
Currently working through /r/nosleep, 02/28/2021
STATUS CODE: 200
Currently working through /r/nosleep, 01/30/2021
STATUS CODE: 200
Currently working through /r/nosleep, 12/30/2020
STATUS CODE: 200
Currently working through /r/nosleep, 11/30/2020
STATUS CODE: 200
Currently working through /r/nosleep, 10/30/2020
STATUS CODE: 200
Currently working through /r/nosleep, 09/30/2020
STATUS CODE: 200
Currently working through /r/nosleep, 08/30/2020
STATUS CODE: 200
Currently working through /r/nosleep, 07/30/2020
STATUS CODE: 200
Currently working through /r/nosleep, 06/30/2020
STATUS CODE: 200
Currently working through /r/nosleep, 05/30/2020
STATUS COD