This script, which downloads submissions and comments from Reddit, is adapted from code by Reddit user Watchful1; the original can be found on their [GitHub](https://github.com/Watchful1/Sketchpad/blob/master/postDownloader.py). The code has been restructured to write the output file in a format that pandas can read.

In [1]:
import requests
from datetime import datetime
import traceback
import time
import json
import sys

In [2]:
# Download target. Leave exactly one of the two values blank to download an
# entire user's or an entire subreddit's history, or fill in both to download
# one user's history within one subreddit. Leaving both blank is rejected
# by the filter-building code that follows.
username = ""  # Reddit username to download (without the "u/" prefix — TODO confirm)
subreddit = "Python"  # subreddit name to download

In [3]:
# Build the query-string fragment that restricts the search to the requested
# author and/or subreddit. At least one of the two must be provided.
filter_string = None
if username == "" and subreddit == "":
    print("Fill in either username or subreddit")
    sys.exit(0)
elif username == "" and subreddit != "":
    filter_string = f"subreddit={subreddit}"
elif username != "" and subreddit == "":
    filter_string = f"author={username}"
else:
    filter_string = f"author={username}&subreddit={subreddit}"

# URL template: the first {} takes the object type and the second {} takes
# filter_string; the epoch for the trailing "before=" parameter is appended
# by downloadFromUrl on every request.
url = "https://api.pushshift.io/reddit/{}/search?limit=1000&sort=desc&{}&before="

# Moment the run started; its .timestamp() seeds the first "before=" value.
# This must be a timezone-aware datetime: .timestamp() on the naive value
# returned by datetime.utcnow() is interpreted as *local* time, which shifts
# the resulting epoch by the machine's UTC offset.
start_time = datetime.now().astimezone()

In [26]:
def _format_post(post):
    """Return one self-post as a single ``|||``-delimited record string.

    The record holds title, subreddit, score, num_comments, created_utc and
    the self-text (with non-ASCII characters dropped), each followed by the
    ``|||`` delimiter. Raises if a required key is missing or the self-text
    is not a string, so the caller can skip the post without writing a
    partial record.
    """
    fields = [
        str(post['title']),
        str(post['subreddit']),
        str(post['score']),
        str(post['num_comments']),
        str(post['created_utc']),
        # Strip anything outside ASCII from the body text.
        post['selftext'].encode(encoding='ascii', errors='ignore').decode(),
    ]
    # Every field, including the last one, is terminated by the delimiter.
    return "".join(field + "|||" for field in fields)


def downloadFromUrl(filename, object_type):
    """Download objects from the Pushshift search API and save self-posts.

    Pages backwards in time, starting at the module-level ``start_time``,
    using the module-level ``url`` template and ``filter_string``. Each
    returned object moves the paging cursor to just before its
    ``created_utc`` and is counted; only objects flagged ``is_self`` that
    carry a ``selftext`` are written to the file.

    Args:
        filename: Path of the output file; it is created or overwritten.
        object_type: Value substituted into the URL path and into the
            progress messages (the notebook calls this with "submission").

    Returns:
        None. Progress and per-post failures are printed to stdout.
    """
    print(f"Saving {object_type}s to {filename}")

    # Stop retrying once this many responses in a row could not be parsed,
    # so a persistently failing API cannot hang the download forever.
    max_failed_attempts = 10
    failed_attempts = 0

    count = 0
    previous_epoch = int(start_time.timestamp())  # start paging from "now"

    # The context manager closes the file even if a request or parse raises.
    with open(filename, 'w', encoding='utf-8') as handle:
        # Keep requesting older pages until the API has nothing more to give.
        while True:
            new_url = url.format(object_type, filter_string) + str(previous_epoch)
            response = requests.get(new_url, headers={'User-Agent': "Post downloader by /u/Watchful1"})
            time.sleep(1)  # pushshift has a rate limit, if we send requests too fast it will start returning error messages
            try:
                json_data = response.json()
            except ValueError:
                # Every JSON decode error raised by requests/json/simplejson
                # is a ValueError; back off briefly and retry the same page.
                failed_attempts += 1
                if failed_attempts >= max_failed_attempts:
                    print(f"Giving up after {failed_attempts} unparseable responses in a row")
                    break
                time.sleep(1)
                continue
            failed_attempts = 0

            if 'data' not in json_data:
                break
            objects = json_data['data']
            if not objects:
                break

            for post in objects:
                # Move the cursor just before this object so the next request
                # continues with strictly older items.
                previous_epoch = post['created_utc'] - 1
                count += 1
                # Objects without a truthy is_self flag (presumably link posts
                # and comments — confirm against the API schema) or without a
                # selftext are counted but not written.
                if not post.get('is_self') or 'selftext' not in post:
                    continue
                try:
                    # re-written to structure in a way that is easily read by pandas
                    record = _format_post(post)
                except Exception:
                    # Best effort: report the post and carry on with the rest.
                    print(f"Couldn't print post: {post.get('url')}")
                    print(traceback.format_exc())
                    continue
                # Written in one call so a failure never leaves half a record.
                handle.write(record)

            print("Saved {} {}s through {}".format(count, object_type, datetime.fromtimestamp(previous_epoch).strftime("%Y-%m-%d")))

    print(f"Saved {count} {object_type}s")

In [27]:
# Run the download for submissions, writing the records to python_posts.txt.
downloadFromUrl("python_posts.txt", "submission")

Saving submissions to python_posts.txt
Saved 100 submissions through 2021-01-29
Saved 200 submissions through 2021-01-28
Saved 300 submissions through 2021-01-26
Saved 400 submissions through 2021-01-24
Saved 500 submissions through 2021-01-23
Saved 600 submissions through 2021-01-21
Saved 700 submissions through 2021-01-20
Saved 800 submissions through 2021-01-18
Saved 900 submissions through 2021-01-17
Saved 1000 submissions through 2021-01-15
Saved 1100 submissions through 2021-01-13
Saved 1200 submissions through 2021-01-12
Saved 1300 submissions through 2021-01-10
Saved 1400 submissions through 2021-01-08
Saved 1500 submissions through 2021-01-07
Saved 1600 submissions through 2021-01-06
Saved 1700 submissions through 2021-01-04
Saved 1800 submissions through 2021-01-03
Saved 1900 submissions through 2021-01-01
Saved 2000 submissions through 2020-12-30
Saved 2100 submissions through 2020-12-29
Saved 2200 submissions through 2020-12-27
Saved 2300 submissions through 2020-12-25
Save