In [1]:
import json
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from datetime import datetime
import shutil

%matplotlib inline

import tarfile
import os.path

def make_tarfile(output_filename, source_dir):
    """Write a gzip-compressed tar archive of ``source_dir``.

    Parameters
    ----------
    output_filename : str or path-like
        Path of the archive to create (opened in "w:gz" mode).
    source_dir : str or path-like
        Path to add to the archive.

    The top-level entry in the archive is named after the final component of
    ``source_dir`` rather than its full path.
    """
    # normpath drops any trailing separator; without it basename("dir/") is ""
    # and the contents would be archived without their parent folder.
    top_level = os.path.basename(os.path.normpath(source_dir))
    with tarfile.open(output_filename, "w:gz") as tar:
        tar.add(source_dir, arcname=top_level)

def null_safe_get_strip(dic, value):
    """Look up a key in ``dic`` and return its whitespace-stripped text.

    ``value`` is the key to look up. A stored value without a ``strip``
    method is joined with single spaces before stripping. Returns None when
    the key is missing, maps to None, or the stripped text is empty.
    """
    found = dic.get(value)
    if found is None:
        return None

    text = found if hasattr(found, 'strip') else ' '.join(found)
    text = text.strip()
    return text if len(text) > 0 else None

def null_safe_strip(value):
    """Return ``value.strip()``, passing None through unchanged."""
    return None if value is None else value.strip()

# Overview

This notebook selects the files we want for each bank from all the scraped data and saves them into separate compressed archives, one per bank.

In [2]:
# Configuration
START_YEAR = 2010  # drop any files/data from before this year.

# Root of the scraped data; paths recorded in the metadata JSON are joined onto it.
DATA_PATH = Path("~/dev/devday/data").expanduser()
# Staging folders and the finished .tar.gz archives are written here.
ZIP_PATH = DATA_PATH / "zips"

### RBA
#### Bulletins

Bulletins are available as html, so we can directly extract the references and abstracts without downloading pdf documents.

In [266]:
# Build the metadata table for the RBA bulletins from the scraped records.
with open('bulletin.json','r') as f:
    rba_bul = json.load(f)

outdir = Path(ZIP_PATH)/'RBA_bulletins'
outdir.mkdir(parents=True, exist_ok=True)

# Matches e.g. "March Quarter 2010" -> ("March", "2010"). Raw string so that
# \s and \d are regex escapes rather than invalid Python string escapes.
quarter_title_re = re.compile(r'([a-zA-Z]{3,10})\sQuarter\s(\d{4})')

rows = []
for p in rba_bul:
    row = {}
    date = p['pubdate']
    if date is None:
        # No explicit date: fall back to the month and year in the title.
        # A missing or None title simply leaves the date as None.
        datematch = quarter_title_re.search(p.get('pubtitle') or '')
        if datematch:
            date = datetime.strptime(f"{datematch.group(1)}-{datematch.group(2)}",'%B-%Y')

    row['pubdate'] = date
    row['abstract'] = null_safe_get_strip(p,'abstract')
    row['references'] = p.get('references',None)
    row['has_refs'] = row['references'] is not None

    filepath = Path(p['html_path'])
    # Copying the html file into outdir is currently disabled:
    #shutil.copy(filepath, outdir/filepath.name)
    row['file'] = filepath.name

    rows.append(row)

df = pd.DataFrame(rows)
df['pubdate'] = pd.to_datetime(df['pubdate'])
df['year'] = df['pubdate'].dt.year

df.to_csv(outdir/'metadata.csv')
# Archiving and cleanup are currently disabled, so outdir is left in place
# containing only metadata.csv:
#make_tarfile(ZIP_PATH/'rba_bulletins.tar.gz',outdir)
#shutil.rmtree(outdir)

In [268]:
# Share of rows whose 'has_refs' flag is True.
# NOTE(review): relies on `df` still holding a table with a 'has_refs' column
# (the bulletin cell creates one) - confirm when re-running out of order.
df['has_refs'].mean()

1.0

#### RDPs

In [253]:
# Peek at the fields available on each scraped RDP record.
with open('rdps.json', 'rb') as f:
    rba_wp = json.load(f)

first_record = rba_wp[0]
first_record.keys()

dict_keys(['rdp_number', 'abstract', 'pubdate', 'authors', 'links', 'file_urls', 'files'])

In [106]:
# Collect the RBA research discussion papers (RDPs): copy each paper's
# downloaded file into a staging folder, write metadata.csv alongside, then
# archive the folder and remove it.
with open('rdps.json','rb') as f:
    rba_wp = json.load(f)

outdir = Path(ZIP_PATH)/'RBA_rdps'
outdir.mkdir(parents=True, exist_ok=True)

# "Month YYYY" anywhere in the pubdate text. Matching the capitalised month
# directly replaces the old lowercase-only pattern plus `pubdate[s-1:e]`
# slice, which produced a wrong slice whenever the match began at index 0.
month_year_re = re.compile(r'[A-Za-z]{3,9} \d{4}')

rows = []
for d in rba_wp:
    # Records with no pubdate, or no recognisable "Month YYYY", are skipped.
    if d['pubdate'] is None:
        continue
    datematch = month_year_re.search(d['pubdate'].strip())
    if datematch is None:
        continue
    pubdate = datetime.strptime(datematch.group(),'%B %Y')

    nfiles = len(d['files'])
    filename = None
    # Only copy when there is exactly one file; otherwise 'file' stays None
    # and 'nfiles' records how many were found.
    if nfiles == 1:
        filepath = Path(d['files'][0]['path'])
        filename = filepath.name
        shutil.copy(DATA_PATH/filepath, outdir/filename)

    rows.append({
        'rdp':d['rdp_number'],
        'abstract':d['abstract'],
        'date':pubdate,
        'year':pubdate.year,
        'authors':null_safe_strip(d['authors']),
        'nfiles':nfiles,
        'file':filename,
    })

df = pd.DataFrame(rows)
df.to_csv(outdir/'metadata.csv')
make_tarfile(ZIP_PATH/'rba_rdp.tar.gz',outdir)
shutil.rmtree(outdir)

### BOE

In [190]:
# Build metadata for the Bank of England working papers, keeping papers
# published in START_YEAR or later.
with open('papers.json','rb') as f:
    boe_wp = json.load(f)

outdir = Path(ZIP_PATH)/'boe_rdps'
outdir.mkdir(parents=True, exist_ok=True)

# e.g. "04 January 2017". Raw string so \d is a regex escape; one- or
# two-digit days are both accepted.
day_month_year_re = re.compile(r"\d{1,2} [a-zA-Z]{3,10} \d{4}")

rows = []
for p in boe_wp:
    # Skip records with no recognisable date instead of crashing on None.
    datematch = day_month_year_re.search(p['pubdate'] or '')
    if datematch is None:
        continue

    row = {}
    row['pubdate'] = datetime.strptime(datematch.group(),'%d %B %Y')
    row['abstract'] = null_safe_get_strip(p,'abstract')
    row['year'] = row['pubdate'].year

    # START_YEAR itself is kept: the config drops data from *before* it,
    # and this matches the `>=` filter used by the FEDS cell.
    if row['year'] < START_YEAR:
        continue

    filename = None
    if len(p['files']) == 1:
        filepath = Path(p['files'][0]['path'])
        filename = filepath.name
        # NOTE(review): the copy is disabled, so the archive written below
        # contains only metadata.csv - confirm whether that is intended.
        #shutil.copy(DATA_PATH/filepath, outdir/filename)
    row['file'] = filename

    rows.append(row)


df = pd.DataFrame(rows)
df['has_abstract'] = df['abstract'].notnull()
df.to_csv(outdir/'metadata.csv')
make_tarfile(ZIP_PATH/'boe_rdps.tar.gz',outdir)
shutil.rmtree(outdir)

In [260]:
# no per file metadata here - just all files by year.
# Copies every bulletin file from START_YEAR onwards into a staging folder,
# records (year, file) rows in metadata.csv, then archives and removes the folder.
with open('boe_bull2.json','rb') as f:
    boe_bul = json.load(f)

outdir = Path(ZIP_PATH)/'boe_bulletins'
outdir.mkdir(parents=True, exist_ok=True)

rows = []
for year_data in boe_bul:
    year = year_data['year']
    # START_YEAR itself is kept: the config drops data from *before* it,
    # and this matches the `>=` filter used by the FEDS cell.
    if int(year) < START_YEAR:
        continue
    for file_info in year_data['files']:
        src = DATA_PATH/file_info['path']
        shutil.copy(src, outdir/src.name)
        rows.append({'year':year, 'file':src.name})

df = pd.DataFrame(rows)
df.to_csv(outdir/'metadata.csv')
make_tarfile(ZIP_PATH/'boe_bulletins.tar.gz',outdir)
shutil.rmtree(outdir)

### Fed

#### Notes
- content is available as html

In [14]:
# parse the fed notes - full note available as html
# Copies each note's html file into a staging folder, writes metadata.csv
# alongside and archives the folder. The staging folder is not removed here.

outdir = Path(ZIP_PATH)/'fed_notes'
outdir.mkdir(parents=True, exist_ok=True)
with open('fed_notes_meta.json','r') as f:
    data = json.load(f)

rows = []
for d in data:
    row = {}
    row['title'] = d['title']
    # Kept as the raw value here; converted for the whole column below.
    row['pubdate'] = d['pubdate']
    row['references'] = d.get('references',None)
    row['summary'] = null_safe_get_strip(d,'summary')

    # NOTE(review): html_path is used as-is rather than joined onto DATA_PATH
    # as the other cells do for file paths - confirm it resolves from the
    # notebook's working directory.
    htmlpath = Path(d['html_path'])
    row['file'] = htmlpath.name
    shutil.copy(htmlpath, outdir/htmlpath.name)
    rows.append(row)


df = pd.DataFrame(rows)
df['pubdate'] = pd.to_datetime(df['pubdate'])
df['year'] = df['pubdate'].dt.year
df.to_csv(outdir/'metadata.csv')

make_tarfile(ZIP_PATH/'fed_notes.tar.gz',outdir)

In [13]:
# Delete whichever staging folder `outdir` currently points at.
# NOTE(review): depends on kernel state - check `outdir` before running.
shutil.rmtree(outdir)

In [22]:
# Spot-check how pandas parses one pubdate value.
# NOTE(review): `d` is whatever record a previous loop left behind.
pd.to_datetime(d['pubdate'])

Timestamp('2017-01-04 00:00:00')

#### FEDS

In [37]:
# Collect the FEDS working papers published in START_YEAR or later: copy each
# paper's first downloaded file into a staging folder, write metadata.csv
# alongside and archive the folder. The staging folder is not removed here.
outdir = Path(ZIP_PATH)/'fed_feds'
outdir.mkdir(parents=True, exist_ok=True)

with open('feds2.json','rb') as f:
    fed_wp = json.load(f)

rows = []
for p in fed_wp:
    date = datetime.strptime(p['pubdate'],'%B %Y')
    if date.year < START_YEAR:
        # Skip the paper entirely. The append used to sit outside the year
        # check, so an old paper re-appended the previous row.
        continue

    row = {}
    row['pubdate'] = date
    row['year'] = date.year
    row['title'] = null_safe_get_strip(p,'title')

    # A paper with no downloaded file keeps file=None instead of raising
    # IndexError; when several files exist the first one is used.
    filename = None
    files = p.get('files') or []
    if files:
        filepath = files[0]['path']
        filename = Path(filepath).name
        shutil.copy(DATA_PATH/filepath, outdir/filename)
    row['file'] = filename

    rows.append(row)

df = pd.DataFrame(rows)
df.to_csv(outdir/'metadata.csv')

make_tarfile(ZIP_PATH/'fed_feds.tar.gz',outdir)

In [28]:
# Inspect the value in the first row, fourth column of `df`.
# NOTE(review): the saved output is a list of file records, which does not
# match a column written by the FEDS cell - presumably from an earlier version of `df`.
df.iloc[0,3]

[{'url': 'https://www.federalreserve.gov/pubs/feds/2014/201418/201418pap.pdf',
  'path': 'full/3a77991d769b316e2d28f430e505cce24f0c1970.pdf',
  'checksum': 'b34e45814c411426f57b217530d39769',
  'status': 'downloaded'}]

In [30]:
# Count rows for each value of 'nfiles'.
# NOTE(review): needs an 'nfiles' column in `df`; confirm the cell that
# builds `df` provides one before re-running.
df.groupby('nfiles').size()

nfiles
1    1759
dtype: int64

In [33]:
# Preview the first rows of the current metadata table.
df.head()

Unnamed: 0,pubdate,year,title,file,nfiles
0,2014-03-01,2014,Small Price Responses to Large Demand Shocks,3a77991d769b316e2d28f430e505cce24f0c1970.pdf,1
1,2014-03-01,2014,Finance and Productivity Growth: Firm-level Ev...,3fa79c4dde27327dde29fdeef8ed579a3dad98eb.pdf,1
2,2014-02-01,2014,Using Data on Seller Behavior to Forecast Shor...,5d012ea2cb9f4e8a8b19bee878d0f646dcb6509e.pdf,1
3,2014-02-01,2014,Banks as Patient Fixed Income Investors,995f4b0c60cb6c6a76901899c9075a5b7fd253d8.pdf,1
4,2014-02-01,2014,The Interplay Between Student Loans and Credit...,b6124a03eabcc81b7279f703f05f03d595fb94cc.pdf,1


In [16]:
# Count rows for each value of 'nfiles' (same expression as the earlier cell).
# NOTE(review): the saved outputs of the two identical cells differ
# (nfiles 1 vs 0), so they were run against different states of `df`.
df.groupby('nfiles').size()

nfiles
0    1759
dtype: int64

In [241]:
# content is pdfs and I seem to be missing the links!!

In [242]:
# need to work on this scraper

### Riksbank

In [147]:
def extract_riks_metadata_and_files(meta_json, outdirname, outzipname):
    """Package one Riksbank publication series into a compressed archive.

    Reads the scraped records from ``meta_json``, copies each record's file
    (only when the record has exactly one) into ``ZIP_PATH/outdirname``,
    writes ``metadata.csv`` next to them, archives the folder as
    ``ZIP_PATH/outzipname`` and then deletes the folder.

    Parameters
    ----------
    meta_json : str or path-like
        JSON file holding a list of records with 'pubdate' (dd/mm/YYYY),
        'files' and 'pdf_url' keys, and optionally 'title'.
    outdirname : str
        Name of the temporary staging folder under ZIP_PATH.
    outzipname : str
        File name of the .tar.gz archive written under ZIP_PATH.

    Returns
    -------
    pandas.DataFrame
        One row per record with columns pubdate, title, file, url, has_file
        and year.
    """
    with open(meta_json,'rb') as f:
        rbwp = json.load(f)
    outdir = Path(ZIP_PATH)/outdirname
    outdir.mkdir(parents=True, exist_ok=True)

    rows = []
    for p in rbwp:
        row = {}
        row['pubdate'] = datetime.strptime(p['pubdate'],'%d/%m/%Y')
        row['title'] = null_safe_get_strip(p, 'title')
        filename = None
        # Records with zero or several files keep file=None / has_file=False.
        if len(p['files']) == 1:
            filepath = Path(p['files'][0]['path'])
            filename = filepath.name
            shutil.copy(DATA_PATH/filepath, outdir/filename)
        row['file'] = filename
        row['url'] = p['pdf_url']
        row['has_file'] = filename is not None
        rows.append(row)

    # Build the frame once. It used to be rebuilt from `rows` after 'year'
    # was added, which dropped that column from the CSV and the return value.
    df = pd.DataFrame(rows)
    df['year'] = df['pubdate'].dt.year

    df.to_csv(outdir/'metadata.csv')
    make_tarfile(ZIP_PATH/outzipname,outdir)
    shutil.rmtree(outdir)
    return df
    
# Package the Riksbank working papers (archive is written as riks_rdp.tar.gz).
df = extract_riks_metadata_and_files('riksbank_wp_meta.json','riksbank_wp','riks_rdp.tar.gz')

In [148]:
# Package the second Riksbank series described by riksbank_comm_meta.json.
# NOTE(review): the archive name is spelled 'ricks_comm' while the working-paper
# archive uses 'riks_' - presumably a typo; confirm before renaming, since
# downstream consumers may already expect this file name.
df = extract_riks_metadata_and_files('riksbank_comm_meta.json','riksbank_comm','ricks_comm.tar.gz')