In [3]:
import json
import os
import requests
import numpy as np
import pandas as pd
import bz2


In [4]:
# Generate a list of URLs for the reddit files

url_stub = 'https://files.pushshift.io/reddit/comments/'
file_urls = []

for year in range(2006,2020):
    
    # Set extension by year for most files
    
    if year < 2018:
        extension = '.bz2'
    elif year < 2019:
        extension = '.xz'
    else:
        extension = '.zst'
    
    # Loop over each file and print its name
    for month in range(1,13):
        
        # Handle a few special cases    
        if (year == 2017 and month == 12):
            extension = '.xz'
        if (year == 2018 and month in [11,12]):
            extension = '.zst'

        # Create the file name, adding the leading zero
        # if the month is 1 - 9
        if month < 10:
            file = 'RC_' + str(year) + '-0' + str(month) + extension
        else:
            file = 'RC_' + str(year) + '-' + str(month) + extension
            
        file_urls.append(url_stub + file)

In [5]:
%%script false --no-raise-error

# Files have already been retrieved and saved

# Retrieve a sample file from each year
# The month is staggered so a different month
# will be chosen for each year

for i in range(0,169, 11):
    
    response = requests.get(file_urls[i])
    
    # Extract the file name from the URL and generate
    # the save path
    file_download = file_urls[i].split('/')[5]
    file_saved = 'Reddit_Posts\\zipped\\' + file_download
    
    # Write the file (i.e. bytes)
    # This is very slow for large files and may hang
    with open(file_saved, 'wb') as f:
        f.write(response.content)

In [8]:

file_compressed = 'Reddit_Posts\\zipped\\RC_2006-01.bz2'

with bz2.open(file_compressed,'rb') as f:
    content = f.read()



In [10]:
print(len(content), type(content), sep='\n')

2106991
<class 'bytes'>


In [11]:
print(dir(content))

['__add__', '__class__', '__contains__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getnewargs__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__mod__', '__mul__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__rmod__', '__rmul__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', 'capitalize', 'center', 'count', 'decode', 'endswith', 'expandtabs', 'find', 'fromhex', 'hex', 'index', 'isalnum', 'isalpha', 'isascii', 'isdigit', 'islower', 'isspace', 'istitle', 'isupper', 'join', 'ljust', 'lower', 'lstrip', 'maketrans', 'partition', 'replace', 'rfind', 'rindex', 'rjust', 'rpartition', 'rsplit', 'rstrip', 'split', 'splitlines', 'startswith', 'strip', 'swapcase', 'title', 'translate', 'upper', 'zfill']


In [16]:
content[0:600]

b'{"subreddit":"reddit.com","author_flair_css_class":null,"created_utc":1136074029,"score":0,"ups":0,"body":"early 2006 a probable date","controversiality":0,"link_id":"t3_22569","stickied":false,"subreddit_id":"t5_6","gilded":0,"retrieved_on":1473821517,"distinguished":null,"author_flair_text":null,"author":"jh99","parent_id":"t3_22569","edited":false,"id":"c2715"}\n{"id":"c2717","edited":false,"parent_id":"t3_22542","author_flair_text":null,"author":"jpb","retrieved_on":1473821517,"distinguished":null,"gilded":0,"link_id":"t3_22542","stickied":false,"subreddit_id":"t5_6","controversiality":0,"b'

In [33]:
content_json = []
record = ''

for i in range(0, 1000):
    char = content.decode()[i]
    
    if char != '}':
        record.join(char)
        content_json.append(record)
    elif char == '}':
        record.join(char)
        content_json.append(record)
        break


TypeError: join() takes exactly one argument (2 given)

In [30]:
print(content_json)

['{', '"', 's', 'u', 'b', 'r', 'e', 'd', 'd', 'i', 't', '"', ':', '"', 'r', 'e', 'd', 'd', 'i', 't', '.', 'c', 'o', 'm', '"', ',', '"', 'a', 'u', 't', 'h', 'o', 'r', '_', 'f', 'l', 'a', 'i', 'r', '_', 'c', 's', 's', '_', 'c', 'l', 'a', 's', 's', '"', ':', 'n', 'u', 'l', 'l', ',', '"', 'c', 'r', 'e', 'a', 't', 'e', 'd', '_', 'u', 't', 'c', '"', ':', '1', '1', '3', '6', '0', '7', '4', '0', '2', '9', ',', '"', 's', 'c', 'o', 'r', 'e', '"', ':', '0', ',', '"', 'u', 'p', 's', '"', ':', '0', ',', '"', 'b', 'o', 'd', 'y', '"', ':', '"', 'e', 'a', 'r', 'l', 'y', ' ', '2', '0', '0', '6', ' ', 'a', ' ', 'p', 'r', 'o', 'b', 'a', 'b', 'l', 'e', ' ', 'd', 'a', 't', 'e', '"', ',', '"', 'c', 'o', 'n', 't', 'r', 'o', 'v', 'e', 'r', 's', 'i', 'a', 'l', 'i', 't', 'y', '"', ':', '0', ',', '"', 'l', 'i', 'n', 'k', '_', 'i', 'd', '"', ':', '"', 't', '3', '_', '2', '2', '5', '6', '9', '"', ',', '"', 's', 't', 'i', 'c', 'k', 'i', 'e', 'd', '"', ':', 'f', 'a', 'l', 's', 'e', ',', '"', 's', 'u', 'b', 'r', 'e',