In [19]:
import bz2
import urllib.request
import os
import csv
import json
import datetime
import traceback

In [20]:
months = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"]

# prompt user for information
startyear = input('Start year: ')
endyear = input('End year: ')
text = input('Posts or Comments? (p/c): ').lower()
allUserPosts = input('User Posts from All of Reddit? (y/n): ').lower()

Start year: 2014
End year: 2014
Posts or Comments? (p/c): p
User Posts from All of Reddit? (y/n): n


In [21]:
if text == 'p':
    file = 'fields.csv'
elif text == 'c':
    file = 'commentFields.csv'
else:
    print('Error, invalid Posts/Comments input')
    exit(1)

if (allUserPosts != 'y') and (allUserPosts != 'n'):
    print('Error, invalid User Posts input')


In [22]:
subreddits =[]
try:
    subreddits_file = open('subreddits.csv', 'r')
except:
    print('Error: Could not open subreddits.csv')
    exit(2)
    
subreddit_reader = csv.reader(subreddits_file,delimiter=',', quotechar='\"')
for subreddits_list in subreddit_reader:
    subreddits = subreddits_list
    
subreddits
    

['SuicideWatch', 'ptsd', 'depression']

In [23]:
# open 'fields.csv' or 'commentFields.csv' and read as a csv, otherwise return an error
try:
    fields_file = open(file, 'r')
except:
    print('Error: Could not open fields file')
    exit(2)

# if fields.csv exists, enter
fields_reader = csv.reader(fields_file, delimiter=',', quotechar='\"')
for words in fields_reader:
    fields = words
    
fields

['title',
 'created_utc',
 'author',
 'ups',
 'downs',
 'num_comments',
 'id',
 'name',
 'from',
 'from_id',
 'selftext',
 'subreddit',
 'score',
 'url',
 'permalink']

In [24]:
# create a log file to write program output
with open("log.txt", "a") as f:
    f.write("NEW RUN: Time: " + str(datetime.datetime.now()) + ", Start Year: " + startyear + ", End Year: " + endyear + ", Text: " + text + ", All User Posts: " + allUserPosts + "\n")

In [16]:
# define a function that will read a JSONLines file with fields from fields.txt
def parsejson(infile, suffix, subreddits, fields):
    # open csv output files, with the given suffix
    csvfiles = []
    writers  = []
    users = []
    for i in range(len(subreddits)):
        csvfile = open(subreddits[i] + "_" + suffix, 'w', encoding = 'utf-8')
        writer = csv.writer(csvfile, delimiter=',', quotechar='\"', quoting=csv.QUOTE_MINIMAL)
        csvfiles.append(csvfile)
        writers.append(writer)
        users.append([])
        
    # check to see if json has key, if not, return empty string
    def getValue(key, submission):
        if key in submission:
            return submission[key]
        return ""

    # open jsonlines file, iterate through each line, which is a json object,
    with open(infile, encoding = 'utf-8', errors = 'ignore') as f:
        # write a row of headings
        for writer in writers:            
            writer.writerow(fields)
        
    
        lines_read = 0
        for line in f:
            # try to load an object from line, otherwise print error and move on
                # load object in current line as submission
            submission = (json.loads(line))
            # if object is of desired subreddit, then load its contents into csv
            # if the user required all user posts, find all users within a particular subreddit, with no duplicates
            if allUserPosts == 'y':
                for i in range(len(subreddits)):
                    if getValue('subreddit', submission) == subreddits[i]:
                        if not (getValue('author', submission) in users[i]):
                            users[i].append(getValue('author', submission))
                # write all posts by those particular users
                for i in range(len(subreddits)):
                    if getValue('author', submission) in users[i]:
                        writers[i].writerow(list(map(lambda field: getValue(field,submission),fields)))
            # else, if object is of desired subreddit, then load its contents into csv
            elif allUserPosts == 'n':
                for i in range(len(subreddits)):
                    if getValue('subreddit', submission) == subreddits[i]:
                        print(subreddits[i])
                        writers[i].writerow(list(map(lambda field: getValue(field,submission),fields)))
            lines_read += 1
            if lines_read % 10000 == 0:
                print(lines_read)
    # close files
    for csvfile in csvfiles:
        csvfile.close()

In [18]:
# repeat over how many years user desires
for year in range(int(startyear), int(endyear) + 1):
    for month in months:
        # create file name from year according to download format from https://files.pushshift.io
        if text == 'p':
            file_name = "RS_" + str(year) + "-" + month + ".bz2"
            print(file_name)
            # create file path string to download file
            file_loc = "E:\\REU\\RedditData\\Submissions\\" + file_name
            print(file_loc)
        elif text == 'c':
            file_name = "RC_" + str(year) + "-" + month + ".bz2"
            # create file path string to download file
            file_url = "E:\\REU\\RedditData\\Comments\\" + file_name
        # create output file in format year-month.csv
        if allUserPosts == 'y':
            csvfile = str(year) + "-" + month + "-" + text + "-U" + ".csv"
            #print(csvfile)
        else:
            csvfile = str(year) + "-" + month + "-" + text + ".csv"
            print(csvfile)
            


        # try to decompress and write the data into a temp file, if it exists(some data from 2006 and 2007 is missing)
        try:
            # decompress data
            try:
                
                print("OPENING FILE: " + file_loc )
                bz2file = open(file_loc, "rb")
                print(file_loc + " IS OPEN")
                print("DECOMPRESSING FILE: " + file_loc)
                data = bz2.decompress(bz2file.read())
                print(file_loc + " IS DECOMPRESSED")
                print(len(data))
            except:
                with open("log.txt", "a") as f:
                    f.write("Reddit data for the month: " + month + " and year: " + str(year) + " is missing." + "\n")
            # write into "temp" file which is overwritten for each month to save memory
            with open("temp", "wb") as code:
                print("WRITING DATA")
                code.write(data)
                print("DATA WRITTEN")
            # call parsejson function to go through the decompressed file and select the desired fields from the desired subreddit
            print("CALLING PARSE JSON")
            parsejson("temp", csvfile, subreddits, fields)
            print("FINISHED PARSE JSON")
            # if data is missing, print year and month of missing data
        except Exception as e:
            print("SOMETHING WENT WRONG")
            traceback.print_exc()
            print(e)
            with open("log.txt", "a") as f:
                f.write("Error processing Reddit data for the month: " + month + " and year: " + str(year) + "\n")

# remove temp file
if os.path.isfile("temp"):
    os.remove("temp")
# write lines to log file to separate runs
with open("log.txt", "a") as f:
    f.write("\n\n\n")

RS_2010-01.bz2
E:\REU\RedditData\Submissions\RS_2010-01.bz2
2010-01-p.csv
OPENING FILE: E:\REU\RedditData\Submissions\RS_2010-01.bz2
E:\REU\RedditData\Submissions\RS_2010-01.bz2 IS OPEN
DECOMPRESSING FILE: E:\REU\RedditData\Submissions\RS_2010-01.bz2
E:\REU\RedditData\Submissions\RS_2010-01.bz2 IS DECOMPRESSED
<class 'bytes'>
572293900
WRITING DATA
DATA WRITTEN
CALLING PARSE JSON
parsejson RECEIVED CALL
ARRAYS INITIALIZED WRITING FIELDS
FIELDS WRITTEN
OPENING infile temp1
ABOUT TO GO THROUGH EACH LINE
SuicideWatchTRUE TRUE TRUE TRUE TRUE
1000
2000
SuicideWatchTRUE TRUE TRUE TRUE TRUE
3000
4000
5000
SuicideWatchTRUE TRUE TRUE TRUE TRUE
6000
7000
depressionTRUE TRUE TRUE TRUE TRUE
8000
SuicideWatchTRUE TRUE TRUE TRUE TRUE
9000
depressionTRUE TRUE TRUE TRUE TRUE
10000
11000
SuicideWatchTRUE TRUE TRUE TRUE TRUE
SuicideWatchTRUE TRUE TRUE TRUE TRUE
12000
13000
SuicideWatchTRUE TRUE TRUE TRUE TRUE
14000
15000
16000
SuicideWatchTRUE TRUE TRUE TRUE TRUE
depressionTRUE TRUE TRUE TRUE TRUE
17000

357000
358000
SuicideWatchTRUE TRUE TRUE TRUE TRUE
359000
depressionTRUE TRUE TRUE TRUE TRUE
360000
SuicideWatchTRUE TRUE TRUE TRUE TRUE
361000
362000
363000
364000
SuicideWatchTRUE TRUE TRUE TRUE TRUE
SuicideWatchTRUE TRUE TRUE TRUE TRUE
365000
366000
SuicideWatchTRUE TRUE TRUE TRUE TRUE
367000
368000
369000
depressionTRUE TRUE TRUE TRUE TRUE
370000
SuicideWatchTRUE TRUE TRUE TRUE TRUE
371000
depressionTRUE TRUE TRUE TRUE TRUE
372000
373000
374000
depressionTRUE TRUE TRUE TRUE TRUE
375000
SuicideWatchTRUE TRUE TRUE TRUE TRUE
376000
377000
378000
depressionTRUE TRUE TRUE TRUE TRUE
SuicideWatchTRUE TRUE TRUE TRUE TRUE
379000
380000
381000
382000
SuicideWatchTRUE TRUE TRUE TRUE TRUE
383000
depressionTRUE TRUE TRUE TRUE TRUE
384000
385000
depressionTRUE TRUE TRUE TRUE TRUE
386000
387000
388000
389000
390000
391000
392000
393000
394000
SuicideWatchTRUE TRUE TRUE TRUE TRUE
395000
396000
397000
398000
399000
SuicideWatchTRUE TRUE TRUE TRUE TRUE
400000
SuicideWatchTRUE TRUE TRUE TRUE TRUE
dep

119000
120000
SuicideWatchTRUE TRUE TRUE TRUE TRUE
121000
SuicideWatchTRUE TRUE TRUE TRUE TRUE
SuicideWatchTRUE TRUE TRUE TRUE TRUE
SuicideWatchTRUE TRUE TRUE TRUE TRUE
122000
123000
124000
depressionTRUE TRUE TRUE TRUE TRUE
125000
SuicideWatchTRUE TRUE TRUE TRUE TRUE
126000
127000
SuicideWatchTRUE TRUE TRUE TRUE TRUE
128000
129000
130000
SuicideWatchTRUE TRUE TRUE TRUE TRUE
131000
SuicideWatchTRUE TRUE TRUE TRUE TRUE
132000
133000
134000
SuicideWatchTRUE TRUE TRUE TRUE TRUE
135000
SuicideWatchTRUE TRUE TRUE TRUE TRUE
136000
137000
138000
depressionTRUE TRUE TRUE TRUE TRUE
SuicideWatchTRUE TRUE TRUE TRUE TRUE
139000
140000
SuicideWatchTRUE TRUE TRUE TRUE TRUE
141000
142000
143000
144000
145000
146000
147000
148000
149000
150000
151000
SuicideWatchTRUE TRUE TRUE TRUE TRUE
depressionTRUE TRUE TRUE TRUE TRUE
152000
153000
154000
155000
156000
157000
158000
159000
160000
depressionTRUE TRUE TRUE TRUE TRUE
161000
SuicideWatchTRUE TRUE TRUE TRUE TRUE
162000
163000
164000
165000
SuicideWatchT

depressionTRUE TRUE TRUE TRUE TRUE
507000
508000
509000
510000
SuicideWatchTRUE TRUE TRUE TRUE TRUE
511000
SuicideWatchTRUE TRUE TRUE TRUE TRUE
FINISHED PARSE JSON
