# 01: Web Scraping
---

## 1. Imports

In [2]:
import pandas as pd
import requests
import time

---
## 2. Web Scrape Function

In [3]:
def get_data(subreddit, n_iter, before=1652224086, pause=3):
    """Scrape submissions from a subreddit through the Pushshift API.

    Pushshift returns at most 100 submissions per request, so the function
    issues up to ``n_iter`` requests, each one asking for the submissions
    created before the oldest submission seen so far.

    Parameters
    ----------
    subreddit : str
        Name of the subreddit to pull submissions from.
    n_iter : int
        Maximum number of requests (pages of up to 100 submissions) to make.
    before : int, optional
        Unix timestamp to start paging backwards from. Defaults to the fixed
        timestamp originally used for this project so results stay comparable.
    pause : int or float, optional
        Seconds to wait between consecutive requests to stay under the API
        rate limit. Defaults to 3.

    Returns
    -------
    pandas.DataFrame
        Columns ``subreddit``, ``title``, ``selftext`` and ``created_utc``.
        The index is NOT reset: every page keeps its own 0-based index, so
        callers typically follow up with ``reset_index``. An empty frame with
        the same columns is returned when nothing was retrieved.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    """
    url = 'https://api.pushshift.io/reddit/search/submission'
    columns = ['subreddit', 'title', 'selftext', 'created_utc']

    # One DataFrame per successfully retrieved page; concatenated once at the end
    frames = []
    current_time = before

    for page in range(n_iter):
        # Throttle only *between* requests; no need to wait before the first
        # one or after the last one.
        if page and pause:
            time.sleep(pause)

        response = requests.get(
            url,
            params={'subreddit': subreddit, 'size': 100, 'before': current_time},
            timeout=30,
        )
        # Fail loudly on rate-limit / server errors instead of hitting an
        # unrelated JSON or KeyError further down.
        response.raise_for_status()

        submissions = response.json().get('data') or []
        if not submissions:
            # No older submissions are available: stop paging early
            break

        # reindex (rather than .loc) keeps the schema stable even when a
        # field is absent from every submission on this page.
        df = pd.DataFrame(submissions).reindex(columns=columns)
        frames.append(df)

        # Continue from the oldest submission of the current page
        oldest = df['created_utc'].min()
        if pd.isna(oldest):
            # Without a timestamp there is nothing to page from
            break
        current_time = int(oldest)

    if not frames:
        return pd.DataFrame(columns=columns)
    return pd.concat(frames, axis=0)

---
## 3. Creating Datasets (~ 5,000 submissions each)

In [3]:
# Up to 50 requests against r/PlantBasedDiet; the per-page index returned by
# get_data is discarded so the rows are numbered consecutively from 0.
plant_based = get_data('PlantBasedDiet', 50).reset_index(drop=True)

In [5]:
# Display the scraped r/PlantBasedDiet submissions to sanity-check the columns and row count
plant_based

Unnamed: 0,subreddit,title,selftext,created_utc
0,PlantBasedDiet,I'm having so many health problems from this diet,"**Warning, long post ahead** Okay so I had a ...",1652222200
1,PlantBasedDiet,Mercy For Animals encourages White House suppo...,,1652215236
2,PlantBasedDiet,Struggling with social &amp; familial stigma/b...,[removed],1652215221
3,PlantBasedDiet,Ideas for lunch to take to uni,[removed],1652213780
4,PlantBasedDiet,$10-25k grants to promote climate-friendly pla...,Not sure if this is the best sub but I saw tha...,1652210182
...,...,...,...,...
4994,PlantBasedDiet,Light easy meal,,1617475971
4995,PlantBasedDiet,"Following PBD with (for) PCOS, any advice",Hi Everyone!\n\nI've started following a vegan...,1617471936
4996,PlantBasedDiet,Cookbooks,[removed],1617469132
4997,PlantBasedDiet,"How to soak, cook dried Kidney Beans?",[removed],1617466938


In [5]:
# Up to 50 requests against r/Paleo; the per-page index returned by
# get_data is discarded so the rows are numbered consecutively from 0.
paleo = get_data('Paleo', 50).reset_index(drop=True)

In [6]:
# Display the scraped r/Paleo submissions to sanity-check the columns and row count
paleo

Unnamed: 0,subreddit,title,selftext,created_utc
0,Paleo,This is my meal for the day. Let me hope that ...,,1652208666
1,Paleo,Coconut/Cassava/Arrowroot flours in Shakes for...,I am working out like crazy and need to up my ...,1652205599
2,Paleo,I’ve been using monk fruit sweetener for every...,,1652204018
3,Paleo,Grain free UNSWEETENED granola I can buy? Reco...,,1652203919
4,Paleo,Bananas! They aren’t paleo bc of the sugar con...,,1652203879
...,...,...,...,...
4993,Paleo,How can I eat paleo when you’re a teen and you...,[removed],1540926854
4994,Paleo,Battle of the Proteins. Taking on the suppleme...,,1540908230
4995,Paleo,"[Food Pic] Roasted herb chicken, chicken gizza...",,1540874039
4996,Paleo,[Question] Gut bacteria / Paleo,"44 yrs old, male, 5’8, 147lbs\n\nLately I’ve b...",1540863894


---
## 4. Saving to CSVs

In [9]:
# The export below is intentionally commented out to prevent overwriting the
# original CSV files when the notebook is re-run. Uncomment both lines only
# when a fresh scrape should replace the saved data (index=False keeps the
# row index out of the files).

# plant_based.to_csv('../data/plant_based.csv', index=False)
# paleo.to_csv('../data/paleo.csv', index=False)