## Subreddit Download Notebook

This notebook contains code to download subreddits from http://files.pushshift.io/reddit/subreddits/

After downloading the files, decompress the `.zst` file as follows:  

git clone https://github.com/facebook/zstd.git  
make  
zstd -xvf Reddit_Subreddits.ndjson.zst  

In [None]:
from urllib import request
from bs4 import BeautifulSoup
import os
from pathlib import Path
import sys
import pandas as pd
import ndjson
import json

In [None]:
# Project configuration: resolve the project root and the dataset location,
# then make the project folder and its `utils` folder importable.
project_name = 'clpsych'
# The parent of the current working directory is treated as the project root.
project_path = Path(os.getcwd()).parent

if sys.platform == "win32":
    # Raw string: the backslashes are path separators, not escape sequences
    # (`\D`, `\{` and `\d` are invalid escapes in a normal string literal).
    data_path = r'D:\Dataset\{0}\dataset'.format(project_name)
elif sys.platform == 'darwin':
    data_path = '/Volumes/Dataset/{0}/dataset'.format(project_name)
else:
    data_path = Path(project_path, 'dataset')

utils_path = str(Path(project_path, 'utils'))
# including the project folder and the utils folder
# Each entry is checked against the actual list elements (not a substring of
# the concatenated paths) and appended at most once, so re-running this cell
# does not grow sys.path.
for extra_path in (str(project_path), utils_path):
    if extra_path not in sys.path:
        sys.path.append(extra_path)

print('project path = {0}'.format(project_path))
print('data path = {0}'.format(data_path))
print('')
print('sys.path = {0}'.format(sys.path))

In [None]:
# Read subreddits_basic.csv from the dataset folder and preview the first rows.
basic_csv_path = Path(data_path) / 'subreddits_basic.csv'
basic = pd.read_csv(basic_csv_path)
basic.head()

In [None]:
# Read the decompressed Reddit_Subreddits.ndjson file and parse it with ndjson.
# The file is opened in a `with` block so the handle is closed as soon as the
# text has been read, and it is decoded explicitly as UTF-8 (the encoding JSON
# text uses) instead of relying on the platform default encoding.
with Path(data_path, 'Reddit_Subreddits.ndjson').open(encoding='utf-8') as ndjson_file:
    file_object = ndjson_file.read()
data = ndjson.loads(file_object)
# Show the third parsed record as a quick sanity check.
data[2]

In [None]:
# Read 69M_reddit_accounts.csv from the dataset folder and preview the first rows.
accounts_csv_path = Path(data_path) / '69M_reddit_accounts.csv'
data = pd.read_csv(accounts_csv_path)
data.head()

In [None]:
# Read the RC_2005-12 file as line-delimited JSON (one record per line)
# and preview the first rows.
rc_dump_path = Path(data_path) / 'RC_2005-12'
data = pd.read_json(rc_dump_path, lines=True)
data.head()

In [None]:
# Mirror the pushshift directory listings into the local dataset folder.
# For every remote folder in `dir_list`, the HTML index page is parsed and each
# linked file that is not already present locally is downloaded. Links whose
# name contains '.txt' or '.json' are skipped.
# other lists
dir_list = ['submissions', 'submissions/daily', 'submissions/xz', 'staging','comments', 'comments/daily', 'comments/xz', 'subreddits']
for name in dir_list:

    print('processing name = {0}'.format(name))

    url = 'http://files.pushshift.io/reddit/{0}/'.format(name)

    new_datapath = Path(data_path, name)
    # Create the target folder on the first run; os.listdir would otherwise
    # raise FileNotFoundError for a folder that does not exist yet.
    new_datapath.mkdir(parents=True, exist_ok=True)

    file_list = os.listdir(new_datapath)

    # The response is closed as soon as the listing has been parsed.
    with request.urlopen(url) as page:
        soup = BeautifulSoup(page, 'html.parser')

    for element in soup.find_all('td'):

        # Table cells without a link (or with a link lacking an href) carry
        # no file name; indexing them would raise.
        anchor = element.a
        href = anchor.get('href') if anchor is not None else None
        if not href:
            continue

        # Links ending in '/' point to folders (including the parent folder),
        # not to downloadable files.
        if href.endswith('/'):
            continue

        file_name = href.replace('./', '')

        if file_name not in file_list and '.txt' not in file_name and '.json' not in file_name:
            print('processing file name = {0}'.format(file_name))

            link_address = url + file_name
            download_filepath = Path(new_datapath, file_name)
            # Download to a temporary name first and rename on success, so an
            # interrupted transfer never leaves a truncated file that a later
            # run would mistake for a completed download.
            partial_filepath = download_filepath.with_name(download_filepath.name + '.part')
            request.urlretrieve(link_address, partial_filepath)
            partial_filepath.replace(download_filepath)
            print('\t store directory = {0}'.format(download_filepath))

            file_list.append(file_name)

            print('\t saved.')