## Subreddit Download Notebook

This notebook contains code to download subreddits from http://files.pushshift.io/reddit/subreddits/

After downloading the files, decompress the `.zst` file with zstd:  

git clone https://github.com/facebook/zstd.git  
cd zstd  
make  
zstd -d -v Reddit_Subreddits.ndjson.zst  


More info: https://github.com/pushshift/api

In [None]:
from urllib import request
from bs4 import BeautifulSoup
import os
from pathlib import Path
import sys
import pandas as pd
import ndjson
import json
import bz2
from io import StringIO

In [None]:
# Notebook configuration: project name, project root and dataset location.
project_name = 'clpsych'
# The project root is taken to be the parent of the current working directory.
project_path = Path(os.getcwd()).parent

# The dataset location depends on the platform. Every branch builds a Path so
# that `data_path` has the same type everywhere; the Windows root is a raw
# string so its backslashes are not read as (invalid) escape sequences.
if sys.platform == "win32":
    data_path = Path(r'D:\Dataset', project_name, 'dataset')
elif sys.platform == 'darwin':
    data_path = Path('/Volumes/Dataset', project_name, 'dataset')
else:
    data_path = Path(project_path, 'dataset')

utils_path = str(Path(project_path, 'utils'))
# including the project folder and the utils folder
# Each entry is compared against the individual sys.path elements (not a
# substring of the concatenated list) and added on its own, so re-running the
# cell never produces duplicates and neither path can be skipped by accident.
for extra_path in (str(project_path), utils_path):
    if extra_path not in sys.path:
        sys.path.append(extra_path)

print('project path = {0}'.format(project_path))
print('data path = {0}'.format(data_path))
print('')
print('sys.path = {0}'.format(sys.path))

In [None]:
# Basic subreddit listing.
# Columns: base10 id, reddit base36 id, creation epoch, subreddit name, number of subscribers
basic_csv = Path(data_path) / 'subreddits_basic.csv'
basic = pd.read_csv(basic_csv)
basic.head()

In [None]:
# Load the newline-delimited JSON dump of subreddits into a list of records.
# read_text() opens, reads and closes the file in a single call, so no file
# handle is left open in the kernel. The encoding is pinned to UTF-8 (the
# standard encoding for JSON) instead of the platform default, which is not
# UTF-8 on every system.
file_object = Path(data_path, 'Reddit_Subreddits.ndjson').read_text(encoding='utf-8')
data = ndjson.loads(file_object)
# Show one record to inspect the available fields.
data[2]

In [None]:
# Reddit accounts listing: load the CSV and preview the first rows.
accounts_csv = Path(data_path) / '69M_reddit_accounts.csv'
data = pd.read_csv(accounts_csv)
data.head()

In [None]:
# Sample one comments dump (newline-delimited JSON) to inspect its columns.
n_data_path = Path(data_path) / 'comments'
data = pd.read_json(n_data_path / 'RC_2005-12', lines=True)
print(data.columns)
data.head()

In [None]:
# Sample one submissions dump (newline-delimited JSON) to inspect its columns.
# NOTE(review): the file name uses the same 'RC_' prefix as the comments
# sample; confirm this is really the intended file in the submissions folder.
n_data_path = Path(data_path) / 'submissions'
data = pd.read_json(n_data_path / 'RC_2005-12', lines=True)
print(data.columns)
data.head()

In [None]:
# Load the subreddits file (newline-delimited JSON) and inspect its columns.
n_data_path = Path(data_path) / 'subreddits'
data = pd.read_json(n_data_path / 'subreddits.json', lines=True)
print(data.columns)
data.head()

In [None]:
# Stems of the input files whose conversion failed (filled by later cells).
error_files = []

In [None]:
# Gather the `body` field of every comment stored in the .json dumps of the
# comments folder.
comments_path = Path(data_path, 'comments')

comments = []
file_processed_comments = []
total_elements = 0

for file in comments_path.iterdir():
    # skip anything that is not a .json dump or was already handled in this run
    if file.suffix != '.json' or file.stem in file_processed_comments:
        continue
    # open file and get data
    data = pd.read_json(file, lines=True)
    bodies = data['body']
    comments.extend(bodies.values)
    total_elements += bodies.shape[0]
    print('processed {0}'.format(file))
    file_processed_comments.append(file.stem)

In [None]:
# Convert every raw submissions dump (newline-delimited JSON) into a slim CSV
# that keeps only the columns listed in `relevant_columns`.
# The folder is held in its own variable so this cell does not overwrite the
# `comments_path` defined by the comments cell.
submissions_path = Path(data_path, 'submissions')
# Stems that already have a CSV next to them; those dumps are skipped, so the
# cell can be re-run without redoing finished files.
processed_files = [csv_file.stem for csv_file in submissions_path.glob('*.csv')]
# Only extension-less (already decompressed) dumps are converted. pandas infers
# the compression from the file extension, so adding '.bz2' here is all that is
# needed to read the compressed dumps directly.
input_suffixes = ('',)
relevant_columns = ['subreddit', 'subreddit_id', 'selftext', 'author', 'title', 'created_utc']

for file in submissions_path.iterdir():
    if file.suffix not in input_suffixes or file.stem in processed_files or file.is_dir():
        continue
    print('processing {0}'.format(file))
    try:
        # convert json to dataframe
        df = pd.read_json(file, lines=True)
        # keep relevant columns
        df = df[relevant_columns].copy()
        new_file = submissions_path / (file.stem + '.csv')
        # Write to a temporary name first and rename afterwards, so that an
        # interrupted write never leaves a partial CSV that a later run would
        # mistake for a finished file.
        tmp_file = submissions_path / (file.stem + '.csv.tmp')
        df.to_csv(tmp_file, index=False)
        tmp_file.replace(new_file)
    except Exception as exc:
        # Best effort: record the failure and carry on with the next file.
        # KeyboardInterrupt is deliberately not caught, so the loop can still
        # be stopped by hand.
        print('error processing {0}: {1!r}'.format(file, exc))
        error_files.append(file.stem)
    else:
        processed_files.append(file.stem)
        print('stored {0}'.format(new_file))