In [3]:
%%bash
# Versions are pinned to the ones this notebook was last executed with (see the captured
# output of this cell). PyPDF2 in particular must stay on the 1.x line: the
# PdfFileReader / PdfFileWriter names imported by this notebook were removed in later releases.
pip install --upgrade pip
pip install PyPDF2==1.26.0
pip install jupyterthemes==0.20.0

Collecting pip
  Using cached pip-20.3.3-py2.py3-none-any.whl (1.5 MB)
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 20.3
    Uninstalling pip-20.3:
      Successfully uninstalled pip-20.3
Successfully installed pip-20.3.3
Collecting PyPDF2
  Downloading PyPDF2-1.26.0.tar.gz (77 kB)
Building wheels for collected packages: PyPDF2
  Building wheel for PyPDF2 (setup.py): started
  Building wheel for PyPDF2 (setup.py): finished with status 'done'
  Created wheel for PyPDF2: filename=PyPDF2-1.26.0-py3-none-any.whl size=61084 sha256=d9633c98eb76e0d9557778a36505ac608d1d5b371c3ac9070f9056ddad6bc016
  Stored in directory: /home/ec2-user/.cache/pip/wheels/97/28/4b/142b7d8c98eeeb73534b9c5b6558ddd3bab3c2c8192aa7ab30
Successfully built PyPDF2
Installing collected packages: PyPDF2
Successfully installed PyPDF2-1.26.0
Collecting jupyterthemes
  Downloading jupyterthemes-0.20.0-py2.py3-none-any.whl (7.0 MB)
Collecting lesscpy>=0.11.2
  Downloading 

In [37]:
# console and directory access
import os
import re
import io
import datetime

# interacting with Amazon AWS
import boto3
from sagemaker.session import Session

# data reading and exporting
import pandas as pd
import numpy as np

# parsing SEC website for data
import requests
import time
from bs4 import BeautifulSoup

# pdf manipulation
from PyPDF2 import PdfFileReader, PdfFileWriter, utils

In [53]:
# S3 bucket name and key prefix for the downloaded X-17A-5 pdfs
# (the same values are repeated as the defaults of fileExtract's `bucket` / `subFolder`)
bucket = "ran-s3-systemic-risk"
data_folder ="Input/X-17A-5/"

# Amazon S3 client and Sagemaker session
# NOTE: fileExtract reads both `s3` and `session` as module-level globals, so this cell
#       has to run before fileExtract is called
s3 = boto3.client('s3')
session = Session()

file_type = 'X-17A-5'       # SEC form type to extract
prior2date = '20201231'     # format YYYYMMDD - select filings prior to this date

In [13]:
# ## Dealer Data Import
# Parses in dealer information with the accompanying CIK code for EDGAR lookup from the SEC dealer registration

In [12]:
# if os.path.isfile('CIKandDealers.txt'):
#     print('Broker dealer data has been found')
    
#     # exporting RegisteredDealer information, loading in JSON dictionary 
#     with open('CIKandDealers.txt') as file:
#         cik2brokers = json.load(file)
    
#     # unpacking the dictionary keys (all broker dealer CIK figures)
#     bdNames = [*cik2brokers]
# else:
#     print('File not found, retrieving information ...')
#     start = time.time()
    
#     # will be used to concat all available broker dealer information 
#     tempDF = []
    
#     # send request to SEC website to retrieve broker dealer information 
#     response = requests.get('https://www.sec.gov/help/foiadocsbdfoiahtm.html', allow_redirects=True)

#     # parse the HTML doc string from the response object
#     s1Table = BeautifulSoup(response.text, 'html.parser') 
    
#     # parse through links from the SEC filings
#     for link in s1Table.find_all('a'):
#         documentURL = link.get('href')           # document links for filings
        
#         try:
#             # find .txt file substring greater than 0
#             if documentURL.find('.txt') > 0:
                
#                 # requesting data from document links storing the files
#                 pdf_storage = requests.get('https://www.sec.gov'+ documentURL, allow_redirects=True)
                
#                 # open a file to store files from SEC
#                 open('secDealers.txt', 'wb').write(pdf_storage.content)
                
#                 # convert text file to .csv and store dataframe
#                 df = pd.read_csv('secDealers.txt', sep="\t", header=None)
                
#                 dateRelease = documentURL.split('/')[-1][2:-4]
#                 print('\tSEC Dealer information has been downloaded ' + dateRelease)
                
#                 # append new dataframe figures to be concated
#                 tempDF.append(df)
                
#         except AttributeError:
#             pass
    
#     # concat all disjoint dataframes
#     df = pd.concat(tempDF)
    
#     # remove NaN columns and provide column naming conventions
#     df = df[df.columns[:-1]]
#     df.columns = ['CIK NUMBER', 'COMPANY NAME', 'REPORTING FILE NUMBER', 'ADDRESS1',
#                   'ADDRESS2', 'CITY', 'STATE CODE', 'ZIP CODE']
    
#     # remove duplicate rows from the dataset
#     df = df.drop_duplicates(subset='CIK NUMBER')
#     df.to_csv('secRegisteredDealers.csv')

#     # convert CIK and Company Name to dictionary
#     cik2brokers = df[df.columns[:2]].set_index('CIK NUMBER').to_dict(orient='index')

#     # unpacking the dictionary keys (all broker dealer CIK figures)
#     bdNames = [*cik2brokers]

#     # exporting RegisteredDealer information
#     with open('CIKandDealers.txt', 'w') as file:
#         json.dump(cik2brokers, file)
#         file.close()
    
#     print('Time taken is {} seconds'.format(time.time()-start))

## PDF File Extraction
Extract URL links per company filing to download the accompanying X-17A-5 files from the SEC EDGAR site

In [52]:
# CIK codes (kept as strings) of the large broker-dealers used for the sample download.
# Firm names are taken from the original listing and are assumed to be in matching order.
big_banks = [
    '782124',     # J.P. MORGAN SECURITIES LLC
    '42352',      # GOLDMAN SACHS & CO. LLC
    '68136',      # MORGAN STANLEY & CO. LLC
    '91154',      # CITIGROUP GLOBAL MARKETS INC.
    '72267',      # NOMURA SECURITIES INTERNATIONAL, INC.
    '1224385',    # WELLS FARGO SECURITIES, LLC
    '851376',     # BARCLAYS CAPITAL INC.
    '853784',     # HSBC SECURITIES (USA) INC.
    '58056',      # DEUTSCHE BANK SECURITIES INC.
]

In [25]:
def edgarParse(url:str) -> tuple:
    """
    Parses the EDGAR search-results page of a provided URL and returns a tuple of arrays/lists
    
    Input:
        url is a string representing a SEC website URL pointing to the EDGAR company search results
        e.g. https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=1904&type=X-17A-5&dateb=20201231

    Return:
        :param: filing_dates
            A vector with the values of the 'Filing Date' column of the filings table,
            in the order they are listed on the page
        :param: archives
            A list of sec.gov URL strings, one for every link on the page whose href
            contains 'Archives', in the order they appear on the page

        If the page does not contain the filings table (third table on the page) a notice 
        is printed and None is returned instead of a tuple
    """
    
    # requesting HTML data link from the EDGAR search results (the page is downloaded once
    # and the same text is used for both the table and the link extraction)
    response = requests.get(url, allow_redirects=True)
    html = response.text

    # parse the HTML doc string from the response object
    soup = BeautifulSoup(html, 'html.parser') 
    
    # read in HTML tables from the page already downloaded
    try:
        filings = pd.read_html(io.StringIO(html))[2]   # select the filings table from EDGAR search (IndexError Flag)
        filing_dates = filings['Filing Date'].values   # select the filing dates columns
    
    # if we can't select the filings table we flag an error
    except IndexError:
        print('Currently no filings are present for the firm\n')
        return None

    # parse the html-doc string for all instance of < a href= > from the URL 
    href = [link.get('href') for link in soup.find_all('a')]

    # search for all links with Archive in handle, these are the search links for the X-17A-5 filings
    # (anchors without an href come back as None and are skipped instead of raising a TypeError)
    archives = ['https://www.sec.gov' + link for link in href if link and link.find('Archives') > 0]
    
    # return a tuple of vectors, the filings dates and the corresponding urls
    return filing_dates, archives

In [49]:
def mergePdfs(files:list) -> PdfFileWriter:
    """
    Combines pdfs files iteratively by page for each of the accompanying SEC filings 
    
    Input:
        files is a list of site-relative pdf paths; each one is appended to 
        'https://www.sec.gov' to form the download URL

    Return:
        A PdfFileWriter holding every page of every downloaded pdf, in the order the 
        paths were given. An empty list yields a writer without pages.
    """
    # initialize a pdf object to be store pdf pages
    pdfWriter = PdfFileWriter()
    
    for pdf_path in files:
        # request the specific pdf file from the the SEC
        pdf_storage = requests.get('https://www.sec.gov' + pdf_path, allow_redirects=True)

        # read pdf contents as PyPDF2 object straight from memory, no temporary file is 
        # written so there is no file handle to close and nothing to clean up afterwards
        reader = PdfFileReader(io.BytesIO(pdf_storage.content), strict=False) 
        
        # add every page from the document to the combined pdf
        for page_num in range(reader.getNumPages()):
            pdfWriter.addPage(reader.getPage(page_num))
    
    return pdfWriter

In [50]:
def fileExtract(cik2brokers:dict, bdNames:list, subFolder:str='Input/X-17A-5/', file_type:str='X-17A-5', 
                prior2date:str='20201231', bucket:str='ran-s3-systemic-risk', flag=True):
    """
    Parses through the pdf links X-17A-5 pdf files to be saved in an s3 bucket
    
    Input:
        :param: cik2brokers
            Dictionary keyed by CIK, each value holding a 'COMPANY NAME' entry (used for logging)
        :param: bdNames
            List of CIK codes to download filings for
        :param: subFolder
            Key prefix inside the bucket, files are stored as <subFolder><cik>-<year>.pdf
        :param: file_type
            SEC form type placed in the EDGAR search URL
        :param: prior2date
            Value of the EDGAR 'dateb' search parameter
        :param: bucket
            Name of the s3 bucket to upload to
        :param: flag
            If True, stop processing a company as soon as one of its files is found in the bucket 
            (assume present); if False every filing is downloaded and uploaded again

    Relies on the module-level `session` (Sagemaker Session) and `s3` (boto3 client) objects.
    
    NOTE:   This script makes no effort to weed out amended releases, rather it will default to retaining 
            information on first published releases via iterative retention 
    """
    
    # discover all of the pdfs already stored in the bucket (set for fast membership tests)
    s3_path = set(session.list_s3_files(bucket, subFolder))
    
    # initialize time for process to run (track time)
    startTime = time.time()
    
    # SEC base url plus the select params. for CIK, file type and date range
    searchUrl = 'https://www.sec.gov/cgi-bin/browse-edgar?' + 'action=getcompany&CIK={}&type={}&dateb={}'
    
    # the URL links for each SEC company
    for index, cik in enumerate(bdNames):
        
        # build lookup URLs for the SEC level data 
        url = searchUrl.format(cik, file_type, prior2date)
        
        try:
            # return the filing dates and archived url's for each SEC company 
            # (edgarParse reports and returns None when the firm has no filings)
            parsed = edgarParse(url)

            if parsed is not None:
                filing_dates, archives = parsed

                # company name for broker dealer being downloaded
                companyName = cik2brokers[cik]['COMPANY NAME']

                # logging info for when files are being downloaded
                print('{} - Downloading {} files for {} - CIK ({})'.format(index, file_type, companyName, cik))

                # itterate through each of the filing pages, most recent issue first
                for i, pdf_url in enumerate(archives):

                    # filing year in full yyyy-MM-dd format, extracting yyyy portion 
                    year = filing_dates[i][:4]

                    file_name = str(cik) + '-' + year + '.pdf'
                    pdf_name = subFolder + file_name

                    # flag gives users the option to be efficient or thorough in their search or update of data
                    # if flag is active and pdf_name exists in s3 bucket simply continue to other company, assume present 
                    # (checked before the filing page is requested so no download is wasted)
                    if flag and pdf_name in s3_path: 
                        print('\tAll files for {} are downloaded'.format(companyName))
                        break

                    # check to see if a pdf file exists to extract, otherwise move on 
                    pdf_files = _filingPdfLinks(pdf_url)
                    if not pdf_files:
                        print('\tNo pdf files found for {} year {}'.format(companyName, year))
                        continue

                    try:
                        _mergeAndUpload(pdf_files, file_name, bucket, pdf_name)
                        print('\tSaved {} files for {} year {}'.format(file_type, companyName, year))

                    except IndexError:
                        print('\tUnable to read pdf files for {} year {}'.format(companyName, year))
            
        except TypeError as error:
            # kept as best-effort: a malformed page skips the company instead of stopping the run
            print('\tSkipping CIK ({}) - {}'.format(cik, error))
            
        print('\nTime taken for loop in minutes is {}\n'.format((time.time() - startTime)/60))


def _filingPdfLinks(filing_url:str) -> list:
    """
    Returns the hrefs found on a filing detail page that contain 'pdf' (anchors without href are ignored)
    """
    # requesting data from document links storing the files
    filing_page = requests.get(filing_url, allow_redirects=True)

    # table from filing detail Edgar table 
    soup = BeautifulSoup(filing_page.text, 'html.parser') 

    # extracts all link within the filing table, filtering for pdfs
    extract_link = [file.get('href') for file in soup.find_all('a')]
    return [string for string in extract_link if string and string.find('pdf') > 0]


def _mergeAndUpload(pdf_files:list, file_name:str, bucket:str, pdf_name:str) -> None:
    """
    Merges the pdf links into one local pdf, uploads it to s3 under pdf_name and removes the local copy
    """
    # concat all pdf files from the pdf_files list, merging all to one large pdf
    concatPdf = mergePdfs(pdf_files)

    try:
        # open file and save to local instance
        with open(file_name, 'wb') as f:
            concatPdf.write(f)

        # save contents to AWS S3 bucket
        with open(file_name, 'rb') as data:
            s3.upload_fileobj(data, bucket, pdf_name)

    finally:
        # remove local file after it has been created, even when the upload fails
        if os.path.exists(file_name):
            os.remove(file_name)

In [51]:
# call function to parse data from the SEC -> port to s3
# only the first CIK of big_banks is processed; flag=False re-downloads every filing even if
# it is already in the bucket
# NOTE(review): `cik2brokers` is only built by the "Dealer Data Import" cell, which is currently
#               commented out - on a fresh kernel this call fails with a NameError unless the
#               dictionary is defined some other way
fileExtract(cik2brokers, big_banks[:1], flag=False)

0 - Downloading X-17A-5 files for J.P. MORGAN SECURITIES LLC  - CIK (782124)
	Saved X-17A-5 files for J.P. MORGAN SECURITIES LLC  year 2020
	Saved X-17A-5 files for J.P. MORGAN SECURITIES LLC  year 2019
	Saved X-17A-5 files for J.P. MORGAN SECURITIES LLC  year 2018
	Saved X-17A-5 files for J.P. MORGAN SECURITIES LLC  year 2018
	Saved X-17A-5 files for J.P. MORGAN SECURITIES LLC  year 2018
	Saved X-17A-5 files for J.P. MORGAN SECURITIES LLC  year 2017
	Saved X-17A-5 files for J.P. MORGAN SECURITIES LLC  year 2017
	Saved X-17A-5 files for J.P. MORGAN SECURITIES LLC  year 2016
	Saved X-17A-5 files for J.P. MORGAN SECURITIES LLC  year 2015
	Saved X-17A-5 files for J.P. MORGAN SECURITIES LLC  year 2014
	Saved X-17A-5 files for J.P. MORGAN SECURITIES LLC  year 2013
	Saved X-17A-5 files for J.P. MORGAN SECURITIES LLC  year 2012
	Saved X-17A-5 files for J.P. MORGAN SECURITIES LLC  year 2011
	Saved X-17A-5 files for J.P. MORGAN SECURITIES LLC  year 2010
	Saved X-17A-5 files for J.P. MORGAN SECU