# Identify and Collect Data:

### Gathering and Classifying r/conspiracytheories vs r/science:

The cells below import the necessary libraries, define the Pushshift API URL, gather Reddit posts as dataframes in batches of up to 1,000 documents, and write the combined dataframe to a CSV file.

In [14]:
# Import Libraries:

import requests
import pandas as pd
import time

## Pull Submissions:

In [19]:
# Set up API URL for submissions. No query string here: the subreddit and
# the other query parameters are supplied per request through `params`,
# so a trailing '?subreddit=' would only add an empty duplicate parameter.

base_url = 'https://api.pushshift.io/reddit/search/submission'

In [22]:
# Define function to gather submissions from the desired subreddit in 1000-post batches and write them to CSV:

# Pass parameters for subreddit thread name and number of 1k iterations to perform:
def get_submissions(subreddit, n_iter, before=1587081440, pause=5):
    """Page backwards through a subreddit's submissions and write them to CSV.

    Each iteration requests up to 1000 submissions created before the
    oldest timestamp seen so far, keeps the 'created_utc', 'subreddit',
    'selftext' and 'title' columns, and finally writes all pages to
    '../data/<subreddit>_pull_submissions.csv'.

    Parameters
    ----------
    subreddit : str
        Name of the subreddit to query; also used in the output file name.
    n_iter : int
        Maximum number of 1000-document requests to make.
    before : int, optional
        Unix timestamp to start paging back from. The default is a fixed
        reference point, not the actual current time.
    pause : float, optional
        Seconds to wait between consecutive requests.

    Returns
    -------
    None
        The result of ``DataFrame.to_csv`` when given a path.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    ValueError
        If no submissions were returned at all.
    """
    # The module-level base_url may carry a dangling '?subreddit=' query;
    # strip it so the subreddit is sent exactly once, via params.
    endpoint = base_url.split('?', 1)[0]
    columns = ['created_utc', 'subreddit', 'selftext', 'title']
    # Instantiate dataframe list:
    df_list = []
    current_time = before

    for i in range(n_iter):
        if i:
            # Space out requests, matching the delay used when pulling comments.
            time.sleep(pause)
        res = requests.get(
            endpoint,
            params={
                "subreddit": subreddit,
                "size": 1000,
                "lang": True,
                "before": current_time
            })
        # Fail loudly on an error response instead of parsing an error body.
        res.raise_for_status()
        posts = res.json().get('data', [])
        if not posts:
            # Nothing older is available: keep what has been gathered so far.
            break
        # reindex tolerates a column that is absent from this page (filled
        # with NaN) where .loc would raise KeyError.
        df = pd.DataFrame(posts).reindex(columns=columns)
        df_list.append(df)
        # The oldest post in this page becomes the upper bound of the next.
        current_time = int(df['created_utc'].min())

    if not df_list:
        raise ValueError(
            'No submissions returned for subreddit {!r}'.format(subreddit))
    # Write csv to data folder, matching the subreddit name in the file name:
    out_path = '../data/{}_pull_submissions.csv'.format(subreddit)
    return pd.concat(df_list, axis=0).to_csv(out_path)

# Code adapted from GA DSI Office Hours with Tim Book.

In [23]:
# Pull r/science submissions: up to twelve requests of 1000 documents each.

get_submissions(subreddit='science', n_iter=12)

## Pull Comments:

In [3]:
# Set up API URL for comments. No query string here: the subreddit and
# the other query parameters are supplied per request through `params`,
# so a trailing '?subreddit=' would only add an empty duplicate parameter.

base_url = 'https://api.pushshift.io/reddit/search/comment'

In [12]:
# Define function to gather comments from the desired subreddit in 1000-comment batches and write them to CSV:

# Pass parameters for subreddit thread name and number of 1k iterations to perform:
def get_comments(subreddit, n_iter, before=1587081440, pause=5):
    """Page backwards through a subreddit's comments and write them to CSV.

    Each iteration requests up to 1000 comments created before the oldest
    timestamp seen so far, keeps the 'created_utc', 'subreddit' and 'body'
    columns, and finally writes all pages to
    '../data/<subreddit>_pull_comments.csv'.

    Parameters
    ----------
    subreddit : str
        Name of the subreddit to query; also used in the output file name.
    n_iter : int
        Maximum number of 1000-document requests to make.
    before : int, optional
        Unix timestamp to start paging back from. The default is a fixed
        reference point, not the actual current time.
    pause : float, optional
        Seconds to wait between consecutive requests.

    Returns
    -------
    None
        The result of ``DataFrame.to_csv`` when given a path.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    ValueError
        If no comments were returned at all.
    """
    # The module-level base_url may carry a dangling '?subreddit=' query;
    # strip it so the subreddit is sent exactly once, via params.
    endpoint = base_url.split('?', 1)[0]
    columns = ['created_utc', 'subreddit', 'body']
    # Instantiate dataframe list:
    df_list = []
    current_time = before

    for i in range(n_iter):
        if i:
            time.sleep(pause)  # time delay in seconds between requests
        res = requests.get(
            endpoint,
            params={
                "subreddit": subreddit,
                "size": 1000,
                "lang": True,
                "before": current_time
            })
        # Fail loudly on an error response instead of parsing an error body.
        res.raise_for_status()
        comments = res.json().get('data', [])
        if not comments:
            # Nothing older is available: keep what has been gathered so far.
            break
        # reindex tolerates a column that is absent from this page (filled
        # with NaN) where .loc would raise KeyError.
        df = pd.DataFrame(comments).reindex(columns=columns)
        df_list.append(df)
        # The oldest comment in this page becomes the upper bound of the next.
        current_time = int(df['created_utc'].min())

    if not df_list:
        raise ValueError(
            'No comments returned for subreddit {!r}'.format(subreddit))
    # Write csv to data folder, matching the subreddit name in the file name:
    out_path = '../data/{}_pull_comments.csv'.format(subreddit)
    return pd.concat(df_list, axis=0).to_csv(out_path)

# Code adapted from GA DSI Office Hours with Tim Book.

In [13]:
# Pull r/science comments: up to twenty-five requests of 1000 documents each.

get_comments(subreddit='science', n_iter=25)