# Download all comments and posts made by authors of a subreddit

In [None]:
import praw 
import pandas as pd
import tqdm
import math
import json
import requests
import itertools
import numpy as np
import time
from datetime import datetime, timedelta

In [None]:
# PRAW turned out to be far too slow for this job, and the wrapper is not needed:
# it is better to query Pushshift.io directly to extract the authors' comments and posts.
# # Authentication 
# reddit = praw.Reddit(client_id='RLHw-HpxYJqUR7UJLrBQ1Q',
#                      client_secret='ytB8OnsUKW68-k6ydmmv-RyX3mq_Mw',
#                      user_agent='loa_crawler',
#                      username='',
#                      password='')

In [None]:
"""
Function to make request to Pushshift
"""
def make_request(uri, max_retries = 5):
    def fire_away(uri):
        response = requests.get(uri)
        assert response.status_code == 200
        return json.loads(response.content)
    current_tries = 1
    while current_tries < max_retries:
        try:
            time.sleep(1)
            response = fire_away(uri)
            return response
        except:
            time.sleep(1)
            current_tries += 1
    return fire_away(uri)


"""
Load and process posts and comments from a subreddit
"""
def get_processed_df(file_path):
    
    df_subreddit = pd.read_csv(f"{file_path}.csv")
    df_subreddit = df_subreddit[df_subreddit['author']!="[deleted]"] 
    df_subreddit = df_subreddit[df_subreddit['text']!="[deleted]"] 
    df_subreddit = df_subreddit[df_subreddit['text']!="[removed]"]
    df_subreddit["text"].fillna(" ", inplace = True)
    df_subreddit = df_subreddit.drop_duplicates()
    
    return df_subreddit

"""
Get list of unique authors posting at a given subreddit
"""
def get_unique_authors(df_subreddit):
    
    # Only take users who posted more than once
    repeating = df_subreddit[df_subreddit.duplicated(['author'], keep = False)] 
    # Get rid of deleted users
    repeating = repeating[repeating.author != 'None'] 
    u_authors = list(repeating.author.unique()) 
    
    print("Number of unique authors :", len(u_authors))
    return u_authors


"""
Download all comments made anywhere by a given list of authors
"""
def download_author_comments(u_authors, file_path):
    authors_list = []

    for u in tqdm.tqdm(range(len(u_authors))): 

        uri = f"https://api.pushshift.io/reddit/comment/search/?author={u_authors[u]}&filter=created_utc,author,subreddit,permalink,body"
        try:
            result = make_request(uri, 
                          max_retries= 2)
            authors_list.extend(result['data'])
        except Exception as e:
            print("ERROR :", e)
            pass

    authors_df = pd.DataFrame(authors_list)
    authors_df['isodate'] = authors_df['created_utc'].apply(lambda x: f"{datetime.fromtimestamp(x):%F %T}")
    authors_df.drop(columns = ['created_utc'], inplace = True)
    
    authors_df.to_csv(f'{file_path}_comments.csv', index = False)
    
    return authors_df
    
    
"""
Download all posts made anywhere by a given list of authors
"""   
def download_author_posts(u_authors, file_path):
    authors_posts = []

    for u in tqdm.tqdm(range(len(u_authors))):

        uri = f"https://api.pushshift.io/reddit/submission/search/?author={u_authors[u]}&filter=created_utc,author,subreddit,permalink,title,selftext"
        try:
            result = make_request(uri, 
                          max_retries= 2)
            authors_posts.extend(result['data'])
        except Exception as e:
            print("ERROR :", e)
            pass

    authors_df = pd.DataFrame(authors_posts)
    authors_df['isodate'] = authors_df['created_utc'].apply(lambda x: f"{datetime.fromtimestamp(x):%F %T}")
    authors_df.drop(columns = ['created_utc'], inplace = True)
    
    authors_df.to_csv(f'{file_path}_posts.csv', index = False)
    
    return authors_df

In [None]:
"""
Get all posts from a particular subreddit
Set subreddit here
"""

# Name of the subreddit to process; change this value to target another one.
subreddit = 'Nietzsche'
# Path prefix "<subreddit>/<subreddit>": get_processed_df appends ".csv" to it
# and the download functions append "_posts.csv" / "_comments.csv".
file_path = f"{subreddit}/{subreddit}"

# Load the cleaned subreddit data and collect the authors appearing more than once.
df_subreddit = get_processed_df(file_path)
u_authors = get_unique_authors(df_subreddit)

# Quick look at the loaded data (the last expression is the cell's output).
print(df_subreddit.shape)
df_subreddit.head()

In [None]:
# Fetch the posts of every collected author; this also writes "<file_path>_posts.csv".
authors_posts = download_author_posts(u_authors, file_path)
# Quick look at the result (the last expression is the cell's output).
print(authors_posts.shape)
authors_posts.head()

In [None]:
# Fetch the comments of every collected author; this also writes "<file_path>_comments.csv".
authors_comments = download_author_comments(u_authors, file_path)
# Quick look at the result (the last expression is the cell's output).
print(authors_comments.shape)
authors_comments.head()