# Project 3: `Web API's` & `Classification`
---
_Gabriel Perez Prieto_  

### 01_Data Collection and API's - Reddit
---

#### Import Libraries

In [1]:
import requests
import pandas as pd
import time
from datetime import datetime

In [2]:
# Set max_columns display to 500
pd.set_option('display.max_columns', 500)

#### Request Posts from Subreddits

In [3]:
# Create function to scrape subreddits
def scrape_reddit(*args, posts=500):
    ''' 
    Scrapes Reddit's Subreddits Using PushShift's API
        
    Inputs = subreddits and number of posts to be requested (Multiple of 500!)
    Output = Pandas DataFrame
        
    Fields Returned = author, title, num_comments, link_flair_text, url
    '''
    
    # StartTime
    start  = time.time()
    
    # Create and empty dictionary to house data
    data_compiled = {}
    
    # Loop through function's inputs
    for subreddit in args:

        # Loop through _ times - 500/request limit 
        for i in range(0, int(posts / 500)):
            
            # Set condition to collect posts before previously collected posts
            if i == 0:
                last_post_time = ''
            else:
                before = last_post_time

            # Print Initializing
            print(f'Scraping {subreddit}...')

            # Set max requested posts at a time - API rules!
            rows_at_a_time = int(posts / (posts / 500))
            
            # Select reddit's parameters
            base_url = 'https://api.pushshift.io/reddit/search/comment/' # or submissions for titles
            search_term = '?q=' + '' + '&'
            sub_reddit = 'subreddit=' + subreddit + '&'
            fields = 'fields=created_utc,author,title,body,num_comments,url,score' + '&' #title for submissions
            sort_type = 'num_comments' + '&'
            sort = 'sort=desc' + '&'
            size = 'size=' + str(rows_at_a_time) + '&'
            before = 'before=' + str(last_post_time) + '&'               

            # Set url for scraping
            url = base_url + search_term + sub_reddit  + sort_type + sort + size + before + fields
         
            # Print JSON's url for each request
            print(f'JSON {subreddit}: {url}')

            # Create a request
            res = requests.get(url)

            # Request's status_code
            status_code = res.status_code

            # Print status code of subreddit being scraped
            print(f'Request Status: {status_code}')

            # Error if status code is different from 200
            if status_code != 200:
                print(f'Error Occurred, Request Status: {status_code}')
                break

            else:
                # Create data from JSON
                data = res.json()['data']

                # Append data from different subreddit scraped
                data_compiled[str(subreddit)+str(i)] = data

                # Print number of rows requested
                print(f'{rows_at_a_time} Row(s) Scraped from: {subreddit}\n')

                # Set last_post time to request posts before this one on second loop
                last_post_time = data_compiled[str(subreddit) + str(int(i))][rows_at_a_time - 1]['created_utc']
                
                # Wait 10 senconds for the next request
                time.sleep(10)
                
    # Create empty pandas DataFrame
    df = pd.DataFrame()

    # Loop through dictionary with list of dictionaries - Keys = subreddits, Values = data
    for key in data_compiled.keys():

        # Create pandas DataFrame for requested data
        df_subreddit = pd.DataFrame(data_compiled[key])
        df_subreddit['subreddit'] = key

        # Concatenate DataFrames
        df = pd.concat([df, df_subreddit], axis=0, ignore_index=True)
    
    # Convert utc to datetime and create a new column with dates
    df['created_date'] = df['created_utc'].map(lambda x: datetime.utcfromtimestamp(int(x)).strftime('%Y-%m-%d %H:%M:%S'))
    
    # Drop created_utc column - not necessary
    df.drop(columns=['created_utc'], axis=1, inplace=True)
    
    # Reset subreddit columns to subreddit name - included i valuess
    df['subreddit'] = df['subreddit'].map(lambda x: x[:len(x) - len(str(int(i)))])
    
    # Print total runtime including 10 sec buffer in between requests
    print(f'Total RunTime: {time.time() - start}')
    
    # Return final DataFrame
    return df

#### Run scrape_reddit Function and Save as DataFrame

In [4]:
df = scrape_reddit('vegan', 'vegetarian', posts=5000)

Scraping vegan...
JSON vegan: https://api.pushshift.io/reddit/search/comment/?q=&subreddit=vegan&num_comments&sort=desc&size=500&before=&fields=created_utc,author,title,body,num_comments,url,score&
Request Status: 200
500 Row(s) Scraped from: vegan

Scraping vegan...
JSON vegan: https://api.pushshift.io/reddit/search/comment/?q=&subreddit=vegan&num_comments&sort=desc&size=500&before=1571328366&fields=created_utc,author,title,body,num_comments,url,score&
Request Status: 200
500 Row(s) Scraped from: vegan

Scraping vegan...
JSON vegan: https://api.pushshift.io/reddit/search/comment/?q=&subreddit=vegan&num_comments&sort=desc&size=500&before=1571310032&fields=created_utc,author,title,body,num_comments,url,score&
Request Status: 200
500 Row(s) Scraped from: vegan

Scraping vegan...
JSON vegan: https://api.pushshift.io/reddit/search/comment/?q=&subreddit=vegan&num_comments&sort=desc&size=500&before=1571281225&fields=created_utc,author,title,body,num_comments,url,score&
Request Status: 200
50

#### Check First 5 Rows - Make Sure Data Was Pulled

In [5]:
df.head()

Unnamed: 0,author,body,score,subreddit,created_date
0,ThisIsMyRental,"My, my, my, you've outdone yourself friend! It...",1,vegan,2019-10-17 20:01:40
1,NorthernTurnip,Yeah that drives me crazy,1,vegan,2019-10-17 20:01:32
2,SkarKrow,London is amazing.\n\nThough if you visit the ...,1,vegan,2019-10-17 20:00:57
3,Breaking-finch,Because you took their land from them and disp...,0,vegan,2019-10-17 19:59:50
4,beariesad,"ohh no shame in it at all! it's good for you, ...",2,vegan,2019-10-17 19:59:21


#### Check Shape and Value Counts for each Subreddit

In [6]:
df.shape

(10000, 5)

In [7]:
df['subreddit'].value_counts()

vegetarian    5000
vegan         5000
Name: subreddit, dtype: int64

In [8]:
# Save data as .csv
df.to_csv('./data/reddit.csv', index_label=False)