In [1]:
import json
import requests
import datetime
from time import sleep

In [2]:
# Request configuration. The account name is defined once so the listing URL
# and the User-Agent string cannot drift apart when it is changed.
USERNAME = 'cyclistNerd'
PAGE_LIMIT = 100  # value sent as the `limit` query parameter on every request

# Base URL of the user's JSON listing; later cells append `&after=...` to it.
endpoint = f'https://old.reddit.com/user/{USERNAME}.json?limit={PAGE_LIMIT}'
headers = {
    'User-Agent': f'{USERNAME}_scrape_self',
}

In [6]:
# Page through the listing at `endpoint`, accumulating every returned item in
# `content` in the order received. Each response carries an `after` value; it
# is sent back as `&after=<value>` to request the next page, and a value of
# None ends the loop.
content = []
num_requests = 1
query_string = ''

while query_string is not None:
    print(f'({num_requests:2d}) Requesting API ' + (f' with query_string {query_string}' if query_string != '' else 'for most recent content.'))

    # A timeout keeps a stalled connection from hanging the cell forever, and
    # raise_for_status() turns HTTP errors (rate limiting, server errors) into
    # an explicit exception instead of a confusing JSON/KeyError failure below.
    reply = requests.get(endpoint + query_string, headers=headers, timeout=30)
    reply.raise_for_status()
    response = reply.json()

    data = response["data"]["children"]
    if data:
        # fromtimestamp() without a tz renders the dates in the local timezone.
        newest = datetime.datetime.fromtimestamp(data[0]['data']['created_utc'])
        oldest = datetime.datetime.fromtimestamp(data[-1]['data']['created_utc'])
        print(f"\tReceived {len(data):,d} items from {newest.strftime('%b %d, %Y')} to {oldest.strftime('%b %d, %Y')}.")
    else:
        # An empty page has no first/last item to report a date range for.
        print("\tReceived 0 items.")
    content += data  # append to keep ordering correct

    after = response['data']['after']
    query_string = f"&after={after}" if after is not None else None

    if after is None:
        print(f'Reached end of listing. Retrieved {len(content):,d} total items with {num_requests:,d} requests.')

    num_requests += 1

    # Pause between consecutive requests only; no need to wait after the last.
    if query_string is not None:
        sleep(1)

( 1) Requesting API for most recent content.
	Received 100 items from Apr 09, 2021 to Jul 15, 2020.
( 2) Requesting API  with query_string &after=t1_fy7cwss
	Received 100 items from Jul 15, 2020 to Sep 17, 2019.
( 3) Requesting API  with query_string &after=t1_f0n6zs7
	Received 100 items from Sep 17, 2019 to Jan 06, 2019.
( 4) Requesting API  with query_string &after=t1_edg6zae
	Received 100 items from Jan 06, 2019 to Oct 04, 2018.
( 5) Requesting API  with query_string &after=t1_e777zlb
	Received 100 items from Oct 04, 2018 to Nov 09, 2017.
( 6) Requesting API  with query_string &after=t1_dpkwhi2
	Received 100 items from Nov 05, 2017 to Jul 08, 2017.
( 7) Requesting API  with query_string &after=t1_djxu6wd
	Received 100 items from Jul 08, 2017 to Jun 21, 2017.
( 8) Requesting API  with query_string &after=t1_dj80nwv
	Received 100 items from Jun 13, 2017 to Apr 18, 2017.
( 9) Requesting API  with query_string &after=t1_dgfhx4y
	Received 100 items from Apr 18, 2017 to Mar 24, 2017.
(10)

In [9]:
# Check sorting: every item must be at least as new as the item that follows it.
# The raw `created_utc` numbers are compared directly. Converting them with
# datetime.fromtimestamp() first yields naive local-time values, which can
# compare in the wrong order for items created around a daylight-saving
# fall-back, when the same wall-clock hour occurs twice.
all(
    newer['data']['created_utc'] >= older['data']['created_utc']
    for newer, older in zip(content, content[1:])
)


True

In [11]:
# check duplicates: report each item whose id was already seen earlier in `content`
ids = set()

for entry in content:
    item_id = entry['data']['id']
    if item_id not in ids:
        # First occurrence: remember it and move on.
        ids.add(item_id)
        continue
    print(f'Duplicate on {item_id}')

# write to file

In [15]:
# Dump every collected item to 'history.jsonl', one JSON document per line.
# `num_written` stays 0 when `content` is empty; otherwise enumerate() leaves
# it at the 1-based index of the last item written.
num_written = 0

with open('history.jsonl', 'w') as out_file:
    for num_written, post in enumerate(content, start=1):
        out_file.write(f'{json.dumps(post)}\n')

print(f'Finished. Wrote {num_written:,d} to file.')

Finished. Wrote 1,155 to file.
