In [1]:
import json
import os
import requests
import numpy as np
import pandas as pd
import bz2


In [2]:
# Build the download URL of every monthly reddit comment file
# from 2006 through 2019

url_stub = 'https://files.pushshift.io/reddit/comments/'

# Months whose compression format differs from the default
# used for the rest of their year
extension_overrides = {
    (2017, 12): '.xz',
    (2018, 11): '.zst',
    (2018, 12): '.zst',
}

file_urls = []

for year in range(2006, 2020):

    # Default compression format for this year's files
    if year >= 2019:
        default_extension = '.zst'
    elif year == 2018:
        default_extension = '.xz'
    else:
        default_extension = '.bz2'

    for month in range(1, 13):

        # Use the special-case extension when one is listed for this month
        extension = extension_overrides.get((year, month), default_extension)

        # Zero-pad the month so the name reads e.g. RC_2006-01
        file = 'RC_{}-{:02d}{}'.format(year, month, extension)

        file_urls.append(url_stub + file)

# Show the first few URLs as the cell output
file_urls[0:5]

['https://files.pushshift.io/reddit/comments/RC_2006-01.bz2',
 'https://files.pushshift.io/reddit/comments/RC_2006-02.bz2',
 'https://files.pushshift.io/reddit/comments/RC_2006-03.bz2',
 'https://files.pushshift.io/reddit/comments/RC_2006-04.bz2',
 'https://files.pushshift.io/reddit/comments/RC_2006-05.bz2']

In [3]:
%%script false --no-raise-error

# Files have already been retrieved and saved, so
# don't run this again unless more files are needed

# Retrieve a sample file from each year
# The month is staggered so a different month
# will be chosen for each year

file_path = 'Reddit_Posts\\zipped\\'

# file_urls holds one URL per month; taking every 11th entry
# staggers the sampled month from one pick to the next
for i in range(0, len(file_urls), 11):

    # Extract the file name (the last URL segment) and generate
    # the save path
    file_download = file_urls[i].rsplit('/', 1)[-1]
    file_saved = file_path + file_download

    # Stream the body so a large file is never held in memory all at
    # once; the timeout keeps a stalled connection from hanging the cell
    with requests.get(file_urls[i], stream=True, timeout=60) as response:

        # Fail loudly instead of saving an HTTP error page as a data file
        response.raise_for_status()

        # Write the file (i.e. bytes) one chunk at a time
        with open(file_saved, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)

In [4]:
file_path = 'Reddit_Posts\\zipped\\'
file_name = 'RC_2006-01'

# Read the file into a list with one entry per line.
# Iterating over the file (rather than splitting the whole text on '\n')
# avoids the empty string left after the final newline; blank lines are
# dropped for the same reason.
# The encoding is set explicitly so decoding does not depend on the
# platform default.
# NOTE(review): assumes the file is UTF-8 encoded — confirm.
with open(file_path + file_name, 'rt', encoding='utf-8') as f:
    content = [line.rstrip('\n') for line in f if line.strip()]


In [15]:
# Show the two raw lines at positions 3664 and 3665 of `content`
content[3664:3666]

['{"distinguished":null,"retrieved_on":1473820870,"gilded":0,"edited":false,"id":"c165x","parent_id":"t1_c15rk","author":"rah","author_flair_text":null,"score":-7,"ups":-7,"created_utc":1138751580,"author_flair_css_class":null,"subreddit":"reddit.com","subreddit_id":"t5_6","stickied":false,"link_id":"t3_14rn","body":"Err... I think HPK you are closer to being homophobic/gay-bashing than I was.  And yet your score seems alright in comparison to above.\\r\\n\\r\\nWacky wacky reddit! ;-)\\r\\n\\r\\n---\\r\\n\\r\\nAnd--in tonyl\'s defense--some people actually like **peanut butter**.","controversiality":0}',
 '{"subreddit":"reddit.com","author_flair_css_class":null,"created_utc":1138751620,"score":-1,"ups":-1,"controversiality":0,"body":"Here\'s an example of the lack of diversity of opinion. I posted this article which will get modded down to oblivion: http://reddit.com/info?id=165d","link_id":"t3_15n2","stickied":false,"subreddit_id":"t5_6","gilded":0,"retrieved_on":1473820870,"distingui

In [19]:
# Parse each line into a dict so the records can be loaded into a df.
# Splitting the file text on newlines can leave an empty string after
# the final newline, and json.loads raises on an empty string, so blank
# entries are skipped instead of relying on a hard-coded slice length.
records = [json.loads(record) for record in content if record.strip()]

In [24]:
# Preview only the first two parsed records instead of echoing the
# entire list into the cell output
records[:2]

[{'subreddit': 'reddit.com',
  'author_flair_css_class': None,
  'created_utc': 1136074029,
  'score': 0,
  'ups': 0,
  'body': 'early 2006 a probable date',
  'controversiality': 0,
  'link_id': 't3_22569',
  'stickied': False,
  'subreddit_id': 't5_6',
  'gilded': 0,
  'retrieved_on': 1473821517,
  'distinguished': None,
  'author_flair_text': None,
  'author': 'jh99',
  'parent_id': 't3_22569',
  'edited': False,
  'id': 'c2715'},
 {'id': 'c2717',
  'edited': False,
  'parent_id': 't3_22542',
  'author_flair_text': None,
  'author': 'jpb',
  'retrieved_on': 1473821517,
  'distinguished': None,
  'gilded': 0,
  'link_id': 't3_22542',
  'stickied': False,
  'subreddit_id': 't5_6',
  'controversiality': 0,
  'body': "If you are going to post something that has a link to the original author, why not just post the original instead of someone's copy?",
  'author_flair_css_class': None,
  'created_utc': 1136076410,
  'score': 0,
  'ups': 0,
  'subreddit': 'reddit.com'}]

In [20]:
# Build a DataFrame from the list of parsed records; as the last
# expression in the cell, the result is rendered as the cell output
pd.DataFrame(records)

Unnamed: 0,subreddit,author_flair_css_class,created_utc,score,ups,body,controversiality,link_id,stickied,subreddit_id,gilded,retrieved_on,distinguished,author_flair_text,author,parent_id,edited,id
0,reddit.com,,1136074029,0,0,early 2006 a probable date,0,t3_22569,False,t5_6,0,1473821517,,,jh99,t3_22569,False,c2715
1,reddit.com,,1136076410,0,0,If you are going to post something that has a ...,0,t3_22542,False,t5_6,0,1473821517,,,jpb,t3_22542,False,c2717
2,reddit.com,,1136078623,2,2,Microsoft hates it's own products?\r\nWho knew?,0,t3_22515,False,t5_6,0,1473821517,,,Pichu0102,t3_22515,False,c2718
3,reddit.com,,1136079346,2,2,"this looks interesting, but it's already aired...",0,t3_22528,False,t5_6,0,1473821517,,,libertas,t3_22528,False,c2719
4,reddit.com,,1136081389,0,0,I have nothing but good things to say about De...,0,t3_22538,False,t5_6,0,1473821517,,,mdmurray,t3_22538,False,c2722
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3661,reddit.com,,1138751411,4,4,"I find his ""I am a mathematician"" approach kin...",0,t3_15tl,False,t5_6,0,1473820870,,,psykotic,t3_15tl,False,c165u
3662,reddit.com,,1138751512,-5,-5,I totally did not pick up on that distinction....,0,t3_14rn,False,t5_6,0,1473820870,,,rah,t1_c15l3,False,c165v
3663,reddit.com,,1138751531,4,4,"So, having tact filters on both input and outp...",0,t3_15so,False,t5_6,0,1473820870,,,Zarutian,t3_15so,False,c165w
3664,reddit.com,,1138751580,-7,-7,Err... I think HPK you are closer to being hom...,0,t3_14rn,False,t5_6,0,1473820870,,,rah,t1_c15rk,False,c165x
