This script to download submissions and comments from reddit is based on that written by reddit user Watchful1 and can be found from their [Github](https://github.com/Watchful1/Sketchpad/blob/master/postDownloader.py). The code has been re-written to allow the submissions to be saved as a json with only the wanted dimensions and measures and also simplify the code as not all the functionality is needed.

In [1]:
import requests
from datetime import datetime
import traceback
import time
import json
import sys

In [6]:
def downloadFromUrl(subreddit, filename):
    print(f"Saving submissions from r/{subreddit}")

    start_time = datetime.utcnow()
    count = 0
    submissions = {}
    url = "https://api.pushshift.io/reddit/submission/search?limit=1000&sort=desc&subreddit={}&before="
    previous_epoch = int(start_time.timestamp()) # Current time
    while count <= 40000:
        new_url = url.format(subreddit)+str(previous_epoch)
        json_text = requests.get(new_url)
        time.sleep(1)  # pushshift has a rate limit, if we send requests too fast it will start returning error messages
        try:
            json_data = json_text.json()
        except json.decoder.JSONDecodeError:
            time.sleep(1)
            continue

        if 'data' not in json_data:
            break
        objects = json_data['data']
        if len(objects) == 0:
            break
        
        for object in objects:
            previous_epoch = object['created_utc'] - 1
            count += 1 
            sub = {}
            if object['is_self']:
                if 'selftext' not in object:
                    continue
                try:
                    sub['title'] = str(object['title'])
                    sub['subreddit'] = str(object['subreddit'])
                    sub['score'] = str(object['score'])
                    sub['num_comments'] = str(object['num_comments'])
                    sub['created_utc'] = str(object['created_utc'])
                    sub['selftext'] = object['selftext']
                except Exception as err:
                    print(f"Couldn't print post: {object['url']}")
                    print(traceback.format_exc())
                submissions[count] = sub

        print("Saved {} submissions through {}".format(count, datetime.fromtimestamp(previous_epoch).strftime("%Y-%m-%d")))

    print(f"Saved {count} submissions")
    with open(filename, 'w') as fp:
        json.dump(submissions, fp)

In [7]:
downloadFromUrl("Python", "python_posts.json")

Saving submissions from r/Python
Saved 100 submissions through 2021-02-03
Saved 200 submissions through 2021-02-01
Saved 300 submissions through 2021-01-31
Saved 400 submissions through 2021-01-30
Saved 500 submissions through 2021-01-29
Saved 600 submissions through 2021-01-27
Saved 700 submissions through 2021-01-25
Saved 800 submissions through 2021-01-23
Saved 900 submissions through 2021-01-21
Saved 1000 submissions through 2021-01-20
Saved 1100 submissions through 2021-01-19
Saved 1200 submissions through 2021-01-17
Saved 1300 submissions through 2021-01-15
Saved 1400 submissions through 2021-01-13
Saved 1500 submissions through 2021-01-12
Saved 1600 submissions through 2021-01-10
Saved 1700 submissions through 2021-01-09
Saved 1800 submissions through 2021-01-07
Saved 1900 submissions through 2021-01-06
Saved 2000 submissions through 2021-01-04
Saved 2100 submissions through 2021-01-03
Saved 2200 submissions through 2021-01-02
Saved 2300 submissions through 2020-12-31
Saved 2400

Saved 19400 submissions through 2020-06-11
Saved 19500 submissions through 2020-06-10
Saved 19600 submissions through 2020-06-09
Saved 19700 submissions through 2020-06-08
Saved 19800 submissions through 2020-06-08
Saved 19900 submissions through 2020-06-07
Saved 20000 submissions through 2020-06-06
Saved 20100 submissions through 2020-06-05
Saved 20200 submissions through 2020-06-04
Saved 20300 submissions through 2020-06-03
Saved 20400 submissions through 2020-06-02
Saved 20500 submissions through 2020-06-02
Saved 20600 submissions through 2020-06-01
Saved 20700 submissions through 2020-05-31
Saved 20800 submissions through 2020-05-30
Saved 20900 submissions through 2020-05-29
Saved 21000 submissions through 2020-05-28
Saved 21100 submissions through 2020-05-27
Saved 21200 submissions through 2020-05-26
Saved 21300 submissions through 2020-05-25
Saved 21400 submissions through 2020-05-24
Saved 21500 submissions through 2020-05-23
Saved 21600 submissions through 2020-05-23
Saved 21700

Saved 38500 submissions through 2019-12-04
Saved 38600 submissions through 2019-12-03
Saved 38700 submissions through 2019-12-02
Saved 38800 submissions through 2019-11-30
Saved 38900 submissions through 2019-11-28
Saved 39000 submissions through 2019-11-27
Saved 39100 submissions through 2019-11-26
Saved 39200 submissions through 2019-11-24
Saved 39300 submissions through 2019-11-23
Saved 39400 submissions through 2019-11-21
Saved 39500 submissions through 2019-11-20
Saved 39600 submissions through 2019-11-19
Saved 39700 submissions through 2019-11-18
Saved 39800 submissions through 2019-11-17
Saved 39900 submissions through 2019-11-15
Saved 40000 submissions through 2019-11-14
Saved 40100 submissions through 2019-11-12
Saved 40100 submissions
