In [11]:
# console and directory access
import os
import shutil

# interacting with Amazon AWS
import boto3
from sagemaker.session import Session

# data reading and exporting  
import pandas as pd
import json
import tempfile

# parsing SEC website for data  
import requests
import time 
from bs4 import BeautifulSoup

In [12]:
file_type = 'X-17A-5'       # SEC filing type to extract
prior2date = '20201231'     # format YYYYMMDD - select data prior to this date

In [13]:
# S3 bucket name and the key prefix used for the X-17A-5 inputs
bucket = "ran-s3-systemic-risk"
data_folder ="Input/X-17A-5/"

# Amazon S3 client (boto3) and Sagemaker session
s3 = boto3.client('s3')
session = Session()

In [14]:
# show the working directory - relative file names used later in the notebook resolve against it
os.getcwd()

'/home/ec2-user/SageMaker/SEC_X17A5'

## Dealer Data Import
Reads in dealer information with the accompanying CIK codes for EDGAR lookup - data is taken from the SEC dealer registration

In [15]:
# Build the CIK -> dealer lookup (cik2brokers) and the list of CIKs (bdNames).
# The lookup is cached locally as JSON in 'CIKandDealers.txt'; when the cache is
# missing the dealer list is downloaded from the SEC website and the cache is written.
# CIK keys are always strings, so both branches hand identical types to later cells.
if os.path.isfile('CIKandDealers.txt'):

    # loading the cached RegisteredDealer information (JSON dictionary)
    with open('CIKandDealers.txt') as file:
        cik2brokers = json.load(file)

    # unpacking the dictionary keys (all broker dealer CIK figures)
    bdNames = [*cik2brokers]

else:
    print('File not found, retrieving information ...')

    # send request to SEC website to retrieve broker dealer information
    response = requests.get('https://www.sec.gov/help/foiadocsbdfoiahtm.html', allow_redirects=True)
    # fail loudly on an HTTP error instead of parsing an error page that has no links
    response.raise_for_status()

    # parse the HTML doc string from the response object
    s1Table = BeautifulSoup(response.text, 'html.parser')

    # parse through links from the SEC page, stopping at the first .txt document
    for link in s1Table.find_all('a'):
        documentURL = link.get('href')           # document links for filings

        # anchors without an href return None; only links containing '.txt' are of interest
        if not documentURL or not documentURL.find('.txt') > 0:
            continue

        # requesting data from the document link
        pdf_storage = requests.get('https://www.sec.gov'+documentURL, allow_redirects=True)
        # do not store an HTTP error page as if it were the dealer list
        pdf_storage.raise_for_status()

        # store the raw file from the SEC (context manager closes the handle)
        with open('secRegisteredDealers.txt', 'wb') as raw_file:
            raw_file.write(pdf_storage.content)

        # convert text file to .csv and store dataframe
        df = pd.read_csv('secRegisteredDealers.txt', sep="\t", header=None)
        df = df[df.columns[:-1]]                 # the last parsed column is dropped
        df.columns = ['CIK NUMBER', 'COMPANY NAME', 'REPORTING FILE NUMBER', 'ADDRESS1',
                      'ADDRESS2', 'CITY', 'STATE CODE', 'ZIP CODE']
        df.to_csv('secRegisteredDealers.csv')

        # convert CIK and Company Name to dictionary: {CIK: {'COMPANY NAME': name}}
        cik2brokers = df[df.columns[:2]].set_index('CIK NUMBER').to_dict(orient='index')

        # JSON stores keys as strings, so use string CIKs here as well; otherwise the
        # first (download) run yields integer keys and later (cached) runs string keys
        cik2brokers = {str(cik): info for cik, info in cik2brokers.items()}

        # unpacking the dictionary keys (all broker dealer CIK figures)
        bdNames = [*cik2brokers]

        # exporting RegisteredDealer information
        with open('CIKandDealers.txt', 'w') as file:
            json.dump(cik2brokers, file)

        print('\tSEC Dealer information has been downloaded')
        break

    else:
        # no link matched: cik2brokers / bdNames would be undefined for the cells below
        raise RuntimeError('No .txt dealer file link was found on the SEC broker-dealer page')

In [16]:
# preview of the CIK -> dealer lookup retained from the SEC directory, ordered by company name
(
    pd.DataFrame.from_dict(cik2brokers, orient='index')
    .reset_index()
    .sort_values(by='COMPANY NAME')
    .head()
)

Unnamed: 0,index,COMPANY NAME
2033,1647385,16 POINTS LLC
2251,1721381,"180 DEGREE CAPITAL BD, LLC"
1528,1482939,"1851 SECURITIES, INC"
1089,1347209,"1964 GLOBAL, LLC"
3619,949918,"1ST DISCOUNT BROKERAGE, INC."


## PDF File Extraction
Extract URL links per company filing to download the accompanying X-17A-5 files from the SEC EDGAR site

In [17]:
# EDGAR company-search endpoint and the query-string template (CIK, filing type, cut-off date)
secFormat = 'https://www.sec.gov/cgi-bin/browse-edgar?'
dataSelect = 'action=getcompany&CIK={}&type={}&dateb={}'

n = len(bdNames)                # number of broker/dealers
startTime = time.time()         # set timer to track code
completeYear = None             # initialize year to check for updates

# one search URL per broker dealer CIK, kept in the same order as bdNames
secURLS = []
for name in range(n):
    url = '{}{}'.format(secFormat, dataSelect.format(bdNames[name], file_type, prior2date))
    secURLS.append(url)

In [18]:
# first five EDGAR search URLs, as a sanity check of the URL template
secURLS[:5]

['https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=1904&type=X-17A-5&dateb=20201231',
 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=2303&type=X-17A-5&dateb=20201231',
 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=2554&type=X-17A-5&dateb=20201231',
 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=3475&type=X-17A-5&dateb=20201231',
 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=3683&type=X-17A-5&dateb=20201231']

In [19]:
def secParse(completeYear:str, nLinks:int, secURLS:list, cik2brokers:dict, bdNames:list, file_type:str, 
             bucket:str, subFolder:str=None):
    """
    Parses SEC website for X-17A-5 files and uploads the PDFs to S3 (void function)

    For each of the first ``nLinks`` broker dealers the EDGAR search page is requested,
    every '/Archives/...' filing link on it is followed, and the last PDF link found on
    the filing page is downloaded and uploaded to ``bucket`` under the key
    ``subFolder + '<CIK>-<year>.pdf'``. As soon as a key is found that already exists
    in S3 the remaining filings of that company are skipped (they are assumed present).

    Relies on the module-level ``session`` (S3 listing) and ``s3`` (upload) objects.

    :param completeYear: (str) not read by the function; kept so existing calls keep working
    :param nLinks: (int) the number of links to extract from
    :param secURLS: (list) EDGAR search URLs, index-aligned with ``bdNames``
    :param cik2brokers: (dict) lookup of the form {CIK: {'COMPANY NAME': name}}
    :param bdNames: (list) CIK identifiers (str or int), the keys of ``cik2brokers``
    :param file_type: (str) filing type, used in the log messages
    :param bucket: (str) name of the S3 bucket the PDFs are stored in
    :param subFolder: (str) S3 key prefix for the PDFs; None is treated as ''
    :return: None
    """
    # NOTE(review): no request headers or throttling are set here - confirm against
    # the SEC's current access rules for automated EDGAR requests

    # initialize time for process to run 
    startTime = time.time()

    # a missing prefix means the bucket root; '' keeps the key concatenation below valid
    prefix = subFolder or ''

    # keys already stored in S3; a set gives constant-time membership checks
    s3_path = set(session.list_s3_files(bucket, prefix))

    # the URL links for each SEC company
    for url_link in range(nLinks):

        # CIK as stored in the lookup, and as text for building file names
        cik_key = bdNames[url_link]
        cik = str(cik_key)

        # company name for broker dealer being downloaded
        companyName = cik2brokers[cik_key]['COMPANY NAME']

        # logging info for when files are being downloaded
        print('{} - Downloading {} files for {} - CIK ({})'.format(url_link, file_type, 
                                                                   companyName, cik_key))

        # requesting HTML data link from the EDGAR search results 
        response = requests.get(secURLS[url_link], allow_redirects=True)
        if not response.ok:
            print('\tSearch request failed for {} (HTTP {})'.format(companyName, response.status_code))
            continue

        # parse the HTML doc string from the response object
        s1Table = BeautifulSoup(response.text, 'html.parser') 

        # parse the html-doc string for all instance of <a href=>
        for link in s1Table.find_all('a'):
            documentURL = link.get('href')           # document links for filings

            # anchors without an href return None and cannot be a filing link
            if not documentURL:
                continue

            # Check for Archives header as those are contained in the filings
            path_parts = documentURL.split('/')
            if len(path_parts) < 2 or path_parts[1] != 'Archives':
                continue

            # the year is the second dash-separated field of the link; skip links without it
            dash_parts = documentURL.split('-')
            if len(dash_parts) < 2:
                continue
            year = dash_parts[1]

            # data is organized linearly, by most recent issue first
            # requesting the filing detail page for this filing
            filing_page = requests.get('https://www.sec.gov' + documentURL, allow_redirects=True)

            # table from filing detail Edgar table 
            s2Table = BeautifulSoup(filing_page.text, 'html.parser') 

            # all pdf links within the filing table (anchors without href are ignored)
            hrefs = (anchor.get('href') for anchor in s2Table.find_all('a'))
            pdf_files = [href for href in hrefs if href and 'pdf' in href]

            # filing without any pdf document: nothing to download
            if not pdf_files:
                continue
            pdf_file = 'https://www.sec.gov' + pdf_files[-1] 

            # local file name and the S3 key the pdf is stored under
            fileName = cik + '-' + year + '.pdf'
            pdf_name = prefix + fileName

            # if pdf file is already stored in S3 avoid running script further
            # simply continue to other company (we assume all present)
            if pdf_name in s3_path: 
                print('\tAll files for {} are downloaded'.format(companyName))
                break

            # request the specific pdf file from the the SEC
            pdf_storage = requests.get(pdf_file, allow_redirects=True)
            if not pdf_storage.ok:
                # never store an HTTP error page under a .pdf key
                print('\tCould not download {} (HTTP {})'.format(pdf_file, pdf_storage.status_code))
                continue

            try:
                # save PDF contents to local file location 
                with open(fileName, 'wb') as local_pdf:
                    local_pdf.write(pdf_storage.content)

                # save contents to AWS S3 bucket
                with open(fileName, 'rb') as data:
                    s3.upload_fileobj(data, bucket, pdf_name)
            finally:
                # remove local file, also when the upload failed
                if os.path.exists(fileName):
                    os.remove(fileName)

            print('\tSaved {} files for {} year {}'.format(file_type, companyName, year))

    print('Time taken in seconds is {}'.format(time.time() - startTime))

In [20]:
# run the SEC download for every broker dealer and port the PDFs to s3
secParse(
    completeYear=prior2date[2:4],
    nLinks=n,
    secURLS=secURLS,
    cik2brokers=cik2brokers,
    bdNames=bdNames,
    file_type=file_type,
    bucket=bucket,
    subFolder=data_folder,
)

0 - Downloading X-17A-5 files for ABRAHAM SECURITIES CORPORATION - CIK (1904)
	Saved X-17A-5 files for ABRAHAM SECURITIES CORPORATION year 20
	All files for ABRAHAM SECURITIES CORPORATION are downloaded
1 - Downloading X-17A-5 files for PROSPERA FINANCIAL SERVICES, INC. - CIK (2303)
	All files for PROSPERA FINANCIAL SERVICES, INC. are downloaded
2 - Downloading X-17A-5 files for AEI SECURITIES, INC. - CIK (2554)
	All files for AEI SECURITIES, INC. are downloaded
3 - Downloading X-17A-5 files for ALEXANDER INVESTMENT SERVICES CO. - CIK (3475)
	All files for ALEXANDER INVESTMENT SERVICES CO. are downloaded
4 - Downloading X-17A-5 files for ALLEN & COMPANY LLC - CIK (3683)
	All files for ALLEN & COMPANY LLC are downloaded
5 - Downloading X-17A-5 files for M. E. ALLISON & CO., INC. - CIK (3995)
	All files for M. E. ALLISON & CO., INC. are downloaded
6 - Downloading X-17A-5 files for ALLISON-WILLIAMS COMPANY - CIK (3999)
	All files for ALLISON-WILLIAMS COMPANY are downloaded
7 - Downloading