
# This script downloads all the posts and comments from a specified subreddit





# In [1]:
from psaw import PushshiftAPI
import csv
from datetime import datetime
import os

# In [2]:
def _year_bounds(year):
    """Return ``(start_epoch, end_epoch)`` covering the whole of *year*.

    The end bound is midnight on 1 January of the following year, so that
    items created during 31 December fall inside the window. The naive
    ``datetime`` values are interpreted in the machine's local timezone.
    """
    start_epoch = int(datetime(year, 1, 1).timestamp())
    end_epoch = int(datetime(year + 1, 1, 1).timestamp())
    return start_epoch, end_epoch


def _build_row(data, text_key, include_title):
    """Convert one result dict into a ``[isodate, author, title, permalink, text]`` row.

    Args:
        data: Dict of fields for a single submission or comment. Must contain
            ``created_utc``; every other field is optional.
        text_key: Key holding the body text (``"selftext"`` for submissions,
            ``"body"`` for comments).
        include_title: When false the title column is left empty (comments
            have no title).
    """
    # Explicit directives instead of %F / %T, which are platform-specific
    # strftime extensions and are not available everywhere.
    isodate = datetime.fromtimestamp(data["created_utc"]).strftime("%Y-%m-%d %H:%M:%S")

    permalink = data.get("permalink", "")
    if permalink:
        permalink = "https://www.reddit.com" + permalink

    title = data.get("title", "") if include_title else ""
    return [isodate, data.get("author", ""), title, permalink, data.get(text_key, "")]


def scrape_subreddit(subreddit, file_path='/', api=None):
    """Scrape a subreddit's submissions and comments into a CSV file.

    Starting with the current year and walking backwards one year at a time,
    all submissions and then all comments of that year are fetched and written
    to ``<file_path>/<subreddit>.csv`` (columns: isodate, author, title,
    permalink, text). Scraping stops at the first year that yields neither a
    submission nor a comment.

    Args:
        subreddit: Name of the subreddit to scrape.
        file_path: Directory in which the CSV file is created. Missing
            directories are created. An empty string means the current
            working directory.
        api: Optional client object exposing ``search_submissions`` and
            ``search_comments``. A new ``PushshiftAPI()`` is created when
            omitted.

    Returns:
        None. The result is the CSV file on disk.
    """
    if api is None:
        api = PushshiftAPI()

    # os.path.join copes with a trailing slash and with an empty file_path,
    # where indexing file_path[-1] would raise IndexError.
    filename = os.path.join(file_path, f"{subreddit}.csv")
    directory = os.path.dirname(filename)
    if directory:
        os.makedirs(directory, exist_ok=True)

    # Rows are streamed to one open file instead of being accumulated in
    # memory and rewritten in full after every stage.
    with open(filename, "w", newline="", encoding="utf-8") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["isodate", "author", "title", "permalink", "text"])

        year = datetime.now().year
        while year > 0:
            start_epoch, end_epoch = _year_bounds(year)

            # Stays False only if the year produced no submission and no
            # comment, which is the signal to stop going further back.
            found_any = False

            submissions = api.search_submissions(
                subreddit=subreddit,
                after=start_epoch,
                before=end_epoch,
                filter=["author", "title", "permalink", "selftext"],
            )
            for item in submissions:
                found_any = True
                # The last element of each result is its dict of fields.
                writer.writerow(_build_row(item[-1], "selftext", True))
            # Flush after each stage so an interrupted run keeps what it has.
            csv_file.flush()

            print("Done with subreddit posts. Starting on comments...")

            comments = api.search_comments(
                subreddit=subreddit,
                after=start_epoch,
                before=end_epoch,
                filter=["author", "permalink", "body"],
            )
            for comment in comments:
                found_any = True
                writer.writerow(_build_row(comment[-1], "body", False))
            csv_file.flush()

            if not found_any:
                break

            print(f'YEAR {year} DONE!!!')
            year -= 1

# In [4]:
# Enter a list of subreddits here.
# NOTE(review): this list is not read by the code below; only `sub` is scraped.
subreddit_list = ["nihilism", "stoicism", "Absurdism", "Existentialism", "Pessimism"]

# Subreddit that is actually scraped; the CSV goes into a folder of the same name.
sub = "Nietzsche"
file_path = sub + "/"

scrape_subreddit(sub, file_path)


# KeyboardInterrupt

