In [2]:
%%bash
# Pin PyPDF2 to the release this notebook was run against (see the captured install log below):
# the code imports PdfFileReader / PdfFileWriter, which are only available in the 1.x/2.x API.
pip install --upgrade pip
pip install PyPDF2==1.26.0

Collecting PyPDF2
  Downloading PyPDF2-1.26.0.tar.gz (77 kB)
Building wheels for collected packages: PyPDF2
  Building wheel for PyPDF2 (setup.py): started
  Building wheel for PyPDF2 (setup.py): finished with status 'done'
  Created wheel for PyPDF2: filename=PyPDF2-1.26.0-py3-none-any.whl size=61085 sha256=32a7297a0bf3987878d42367d880642a829c9ad9c054c8ca6aaacfe294f7d5eb
  Stored in directory: /home/ec2-user/.cache/pip/wheels/97/28/4b/142b7d8c98eeeb73534b9c5b6558ddd3bab3c2c8192aa7ab30
Successfully built PyPDF2
Installing collected packages: PyPDF2
Successfully installed PyPDF2-1.26.0


In [3]:
# console and directory access
import os
import re
import json
import datetime

# interacting with Amazon AWS
import boto3
from sagemaker.session import Session

# data reading and exporting  
import pandas as pd
import numpy as np

# parsing SEC website for data  
import requests
import time 
from bs4 import BeautifulSoup

# pdf manipulation
from PyPDF2 import PdfFileReader, PdfFileWriter, utils

In [3]:
# what to pull from EDGAR
file_type = 'X-17A-5'       # SEC form type to search for
prior2date = '20201231'     # cut-off date written as YYYYMMDD - select filings prior to this date

# where the extracted files live on s3
bucket = "ran-s3-systemic-risk"
data_folder = "Input/X-17A-5/"

# SageMaker session and boto3 s3 client used by the cells below
session = Session()
s3 = boto3.client('s3')

## PDF File Extraction
Extract the URL links for each company filing and download the accompanying X-17A-5 files from the SEC EDGAR site

In [4]:
# CIK codes (as strings) of the large broker-dealers we download filings for.
# NOTE(review): names are paired with CIKs positionally, following the order of the original
# comment; the first two pairings match the download log - confirm the rest against EDGAR.
big_banks = [
    '782124',     # J.P. MORGAN SECURITIES LLC
    '42352',      # GOLDMAN SACHS & CO. LLC
    '68136',      # MORGAN STANLEY & CO. LLC
    '91154',      # CITIGROUP GLOBAL MARKETS INC.
    '72267',      # NOMURA SECURITIES INTERNATIONAL, INC.
    '1224385',    # WELLS FARGO SECURITIES, LLC
    '851376',     # BARCLAYS CAPITAL INC.
    '853784',     # HSBC SECURITIES (USA) INC.
    '58056',      # DEUTSCHE BANK SECURITIES INC.
]

In [5]:
def edgarParse(url:str) -> tuple:
    """
    Parses the EDGAR search-results page of a provided URL and returns the filing dates together 
    with the archive links found on that page
    
    Input:
        :param: url (type str) 
            URL is a string representing a SEC website URL pointing to a company filing search
            e.g. https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=1904&type=X-17A-5&dateb=20201231

    Return:
        :param: filing_dates (type numpy array)
            The values of the 'Filing Date' column of the filings table, in the order shown on the page
        :param: archives (type list)
            A list of sec.gov URL strings, one for every link on the page containing 'Archives', 
            in the order shown on the page
            
        If the page holds no filings table a notice is printed and None is returned instead of the
        tuple (callers that unpack the result directly will see a TypeError in that case)
        
    Raises:
        requests.HTTPError when the SEC answers with an error status code
    """
    # local import, the notebook import cell does not bring in io
    from io import StringIO
    
    # requesting HTML data link from the EDGAR search results 
    response = requests.get(url, allow_redirects=True)
    response.raise_for_status()     # surface HTTP failures rather than parsing an error page
    page = response.text

    # read the HTML tables from the page we already downloaded; the dates and the links are
    # then guaranteed to come from the same response and the SEC is only queried once
    try:
        filings = pd.read_html(StringIO(page))[2]      # select the filings table from EDGAR search
        filing_dates = filings['Filing Date'].values   # select the filing dates columns
        
    # IndexError -> fewer than three tables, ValueError -> pandas found no table at all
    except (IndexError, ValueError):
        print('Currently no filings are present for the firm\n')
        return None

    # parse the HTML doc string for all instance of < a href= > from the page 
    soup = BeautifulSoup(page, 'html.parser') 
    href = [link.get('href') for link in soup.find_all('a')]

    # search for all links with Archive in handle, these are the search links for the X-17A-5 filings
    # (anchors without an href come back as None and are skipped)
    archives = ['https://www.sec.gov' + link for link in href 
                if isinstance(link, str) and link.find('Archives') > 0]
    
    # return a tuple of vectors, the filings dates and the corresponding urls
    return filing_dates, archives
        

In [9]:
def mergePdfs(files:list) -> PdfFileWriter:
    """
    Combines pdfs files iteratively by page for each of the accompanying SEC filings 
    
    Input:
        :param: files (type List)
            A list of pdf paths (relative to https://www.sec.gov) retrieved from the filing details 
            page of a broker-dealer on Edgar's website
            e.g. https://www.sec.gov/Archives/edgar/data/1904/000000190420000002/0000001904-20-000002-index.htm

    Return:
        :param: pdfWriter (type PdfFileWriter)
            A PdfFileWriter object that serves as a container to store each of the select pdf files from our
            list into a larger merged pdf. An empty list yields a writer holding no pages.
            
    Raises:
        requests.HTTPError when the SEC answers a pdf request with an error status code
    """
    # local import, the notebook import cell does not bring in io
    from io import BytesIO
    
    # initialize a pdf object to be store pdf pages
    pdfWriter = PdfFileWriter()
    
    for pdf_path in files:
        pdf_url = 'https://www.sec.gov' + pdf_path 
        
        # request the specific pdf file from the the SEC
        pdf_storage = requests.get(pdf_url, allow_redirects=True)
        pdf_storage.raise_for_status()      # do not hand an HTML error page to the pdf parser

        # read the downloaded bytes straight from memory: no scratch file in the working
        # directory, so nothing to leak, collide with another run, or clean up afterwards
        reader = PdfFileReader(BytesIO(pdf_storage.content), strict=False) 
        
        # add every page of the document to the merged pdf
        for page_num in range(reader.getNumPages()):
            pdfWriter.addPage(reader.getPage(page_num))
    
    return pdfWriter

In [9]:
def fileExtract(cik2brokers:dict, cik_list:list, subFolder:str='Input/X-17A-5/', file_type:str='X-17A-5', 
                bucket:str='ran-s3-systemic-risk', flag=True):
    """
    Parses through the pdf links X-17A-5 pdf files to be saved in an s3 bucket
    
    Input:
        :param: cik2brokers (type dict)
            CIK dictionary broker dealer information stored as a CIK:Company Name relations (see below)
            e.g. {'years-reported': ['2020/QTR1'], 'broker-dealers': {1904: 'ABRAHAM SECURITIES CORPORATION'}}
        :param: cik_list (type list)
            An iteratable sequence of integers or strings that correspond to broker-dealer CIKs
        :param: subFolder (type str)
            The subfolder on the s3 where all extracted files will be stored. we default to 'Input/X-17A-5/'
        :param: file_type (type str)
            File type to look from the EDGAR website, we default to the 'X-17A-5' filings
        :param: bucket (type str)
            The s3-bucket where we store our datasets, we default to 'ran-s3-systemic-risk'
        :param: flag (type bool)
            The flag gives users the option to be efficient or thorough in their search or update of data,
            assuming all files are present if and only if the top line release is present (e.g. 2020 present ->
            implies that 2019, 2018, 2017, ... are present). We default to True.    

    Return:
        This is a void function, we return no value(s) as we interface with AWS s3 bucket to store pdfs
        
    Raises:
        KeyError when a CIK (neither as given nor as a string) is missing from cik2brokers['broker-dealers']
    
    NOTE:   This script makes no effort to weed out amended releases, rather it will default to retaining 
            information on first published releases via iterative selection from the most recent filing 
            
    NOTE:   Relies on the notebook-level `session` (SageMaker) and `s3` (boto3 client) objects
    """
    # local import, the notebook import cell does not bring in io
    from io import BytesIO
    
    # check available pdfs stored within desired output-folder (set -> constant time look-ups)
    s3_path = set(session.list_s3_files(bucket, subFolder))
    
    # forming the SEC search URLs from the select CIK, file type and date range
    secFormat = 'https://www.sec.gov/cgi-bin/browse-edgar?'     # SEC base url
    dataSelect = 'action=getcompany&CIK={}&type={}&dateb={}'    # select params.
    
    # dateb is a full YYYYMMDD date (compare the example URL in edgarParse), not only a year
    dateb = datetime.datetime.today().strftime('%Y%m%d')
    
    # the URL links for each SEC company
    for index, cik in enumerate(cik_list):

        # build lookup URLs for the SEC level data 
        url = secFormat + dataSelect.format(cik, file_type, dateb)
        
        # return the filing dates and archived url's for each SEC company; the handler only 
        # guards this call, so a TypeError raised further down is no longer silently discarded
        try:
            parsed = edgarParse(url)
        except TypeError as err:
            print('\tSkipping CIK ({}), the EDGAR listing could not be parsed: {}'.format(cik, err))
            continue
        
        # edgarParse prints its own notice and returns None when no filings table is present
        if parsed is None:
            continue
        filing_dates, archives = parsed

        # company name for broker dealer being downloaded; a dictionary read from JSON is keyed 
        # by strings, so an integer CIK is retried in its string form
        brokers = cik2brokers['broker-dealers']
        companyName = brokers[cik] if cik in brokers else brokers[str(cik)]

        # logging info for when files are being downloaded
        print('{} - Downloading {} files for {} - CIK ({})'.format(index, file_type, companyName, cik))

        # data is organized linearly, by most recent issue first; zip pairs each filing date 
        # (full yyyy-MM-dd format) with its link and stops at the shorter of the two sequences
        for date, pdf_url in zip(filing_dates, archives):

            file_name = str(cik) + '-' + date + '.pdf'
            pdf_name = subFolder + file_name
            
            # checked before any request is made, so a filing we already hold costs no download
            if flag and pdf_name in s3_path: 
                print('\tAll files for {} are downloaded'.format(companyName))
                break

            # requesting data from document links storing the files
            pdf_storage = requests.get(pdf_url, allow_redirects=True)

            # table from filing detail Edgar table 
            soup = BeautifulSoup(pdf_storage.text, 'html.parser') 

            # extracts all link within the filing table (anchors without an href yield None)
            extract_link = [anchor.get('href') for anchor in soup.find_all('a')]

            # filter for all pdf links from the extracted file links  
            pdf_files = [link for link in extract_link if link and link.find('pdf') > 0]

            # check to see if a pdf file exists to extract, otherwise move on 
            if not pdf_files:
                print('\tNo pdf files found for {} on {}'.format(companyName, date))
                continue

            try:
                # concat all pdf files from the pdf_files list, merging all to one large pdf
                concatPdf = mergePdfs(pdf_files)
                
                # serialise the merged pdf in memory, no local file is left behind on failure
                merged = BytesIO()
                concatPdf.write(merged)
                
            # the original skipped a filing on IndexError without comment; still skipped, now reported
            except IndexError as err:
                print('\tSkipped {} for {} on {}: {}'.format(file_type, companyName, date, err))
                continue

            # save contents to AWS S3 bucket (rewind so the upload starts at the first byte)
            merged.seek(0)
            s3.upload_fileobj(merged, bucket, pdf_name)

            print('\tSaved {} files for {} on {}'.format(file_type, companyName, date))


In [11]:
# load the CIK -> dealer name information stored as JSON text
with open('CIKandDealers.txt', 'r') as dealer_file:
    cik2brokers = json.load(dealer_file)

In [13]:
# pull the filings of the selected broker-dealers from the SEC and store them on s3;
# flag=True stops at the first filing of a company that is already stored
fileExtract(cik2brokers=cik2brokers, cik_list=big_banks, flag=True)

0 - Downloading X-17A-5 files for J.P. MORGAN SECURITIES LLC  - CIK (782124)
	All files for J.P. MORGAN SECURITIES LLC  are downloaded

Time taken for loop in minutes is 0.01020519733428955

1 - Downloading X-17A-5 files for GOLDMAN SACHS & CO. LLC  - CIK (42352)
	Saved X-17A-5 files for GOLDMAN SACHS & CO. LLC  year 2020
	Saved X-17A-5 files for GOLDMAN SACHS & CO. LLC  year 2019
	Saved X-17A-5 files for GOLDMAN SACHS & CO. LLC  year 2018
	Saved X-17A-5 files for GOLDMAN SACHS & CO. LLC  year 2017
	Saved X-17A-5 files for GOLDMAN SACHS & CO. LLC  year 2016
	Saved X-17A-5 files for GOLDMAN SACHS & CO. LLC  year 2015
	Saved X-17A-5 files for GOLDMAN SACHS & CO. LLC  year 2014
	Saved X-17A-5 files for GOLDMAN SACHS & CO. LLC  year 2013
	Saved X-17A-5 files for GOLDMAN SACHS & CO. LLC  year 2012
	Saved X-17A-5 files for GOLDMAN SACHS & CO. LLC  year 2011
	Saved X-17A-5 files for GOLDMAN SACHS & CO. LLC  year 2010
	Saved X-17A-5 files for GOLDMAN SACHS & CO. LLC  year 2010
	Saved X-17A-5 f



	Saved X-17A-5 files for WELLS FARGO SECURITIES, LLC  year 2020
	Saved X-17A-5 files for WELLS FARGO SECURITIES, LLC  year 2019
	Saved X-17A-5 files for WELLS FARGO SECURITIES, LLC  year 2018
	Saved X-17A-5 files for WELLS FARGO SECURITIES, LLC  year 2017
	Saved X-17A-5 files for WELLS FARGO SECURITIES, LLC  year 2016
	Saved X-17A-5 files for WELLS FARGO SECURITIES, LLC  year 2015
	Saved X-17A-5 files for WELLS FARGO SECURITIES, LLC  year 2014
	Saved X-17A-5 files for WELLS FARGO SECURITIES, LLC  year 2013
	Saved X-17A-5 files for WELLS FARGO SECURITIES, LLC  year 2012
	Saved X-17A-5 files for WELLS FARGO SECURITIES, LLC  year 2011
	Saved X-17A-5 files for WELLS FARGO SECURITIES, LLC  year 2010
	Saved X-17A-5 files for WELLS FARGO SECURITIES, LLC  year 2009
	Saved X-17A-5 files for WELLS FARGO SECURITIES, LLC  year 2008
	Saved X-17A-5 files for WELLS FARGO SECURITIES, LLC  year 2007
	Saved X-17A-5 files for WELLS FARGO SECURITIES, LLC  year 2006
	Saved X-17A-5 files for WELLS FARGO SEC