In [12]:
# console and directory access
import os
import re
import datetime

# interacting with Amazon AWS
import boto3
from sagemaker.session import Session

# data reading and exporting  
import json
import pandas as pd
import numpy as np

# parsing SEC website for data  
import requests 
from bs4 import BeautifulSoup

## Archived Dealer Data Import from 1993
Parses dealer information, together with the accompanying CIK code used for EDGAR lookups, from the SEC dealer registration archive.

In [2]:
def companyName(cik) -> str:
    """
    Returns the company name for a given CIK number from the SEC by parsing the EDGAR company page
    e.g. https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=1904&type=X-17A-5&dateb=20201231
    
    Input:
        :param: cik (type str)
            The CIK number for a broker dealer e.g. 887767
    Return:
        :param: (type str or None)
            The text that precedes the CIK label (and any '/BD' suffix) inside the first
            "companyName" span of the page e.g. 1ST GLOBAL CAPITAL CORP. Surrounding whitespace
            is NOT stripped. None is returned when the page contains no such span.
    Raises:
        requests.exceptions.RequestException when the request fails or times out
    """
    # seconds to wait on connect / between bytes before giving up on the SEC server
    requestTimeout = 30
    
    # establishing base-url for company name search
    baseURL = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&'
    url = baseURL+'CIK={}&type=X-17A-5&dateb=20201231'.format(cik)
    
    # a timeout keeps a single stalled connection from blocking the caller's crawl forever
    res = requests.get(url, allow_redirects=True, timeout=requestTimeout)
    s1 = BeautifulSoup(res.text, 'html.parser')
    
    # the first span carrying the companyName class holds "<name> CIK#: <number> ..."
    nameSpan = s1.find('span', attrs={"class":"companyName"})
    if nameSpan is None:
        return None
    
    # cut at the LAST 'CIK' so a company whose own name contains "CIK" is not truncated,
    # then drop the broker-dealer '/BD' suffix if present
    return nameSpan.text.rsplit('CIK', 1)[0].split('/BD')[0]

In [3]:
def dealerData(years:list, quarters:list=None, cik2brokers:dict=None) -> dict:
    """
    Retrieve dealer data from archived SEC directory, returns a dictionary of essential information
    
    Input:
        :param: years (type list)
            A list of years to check for additional dealer data to be pulled e.g. [1993, 1994, 2000]. NOTE, that
            only the years specified are checked for dealer information. 
        :param: quarters (type list)
            A list of quarters to check for additional dealer data, string must be of the form "QTRX", where X is 
            an integer from 1-4 inclusive. default (None) = [QTR1, QTR2, QTR3, QTR4]. 
        :param: cik2brokers (type dictionary)
            A nested dictionary for storing the broker-dealer data as well as the years covered from the archive 
            e.g. {'years-covered': ['2020-QTR1', '2020-QTR2'], 'broker-dealers': {'1904': 'ABRAHAM SECURITIES CORPORATION'}}. 
            When supplied it is updated in place and returned; when omitted (None) a fresh, empty dictionary is
            created on every call, so results never leak from one call into the next.
    Return:
        :param: cik2brokers (type dict)
            Returns a dictionary with CIK:CompanyName relationships under 'broker-dealers' 
            e.g. {'887767': '1ST GLOBAL CAPITAL CORP.'}. CIK keys are strings taken from the filing path.
            NOTE(review): the 'years-covered' list is carried through but never modified by this function;
            confirm whether it is meant to be filled in here.
    Raises:
        requests.exceptions.RequestException when an index request fails or times out
    """
    import io    # local import: in-memory text buffer handed to pandas below
    
    # build the defaults per call; default-argument objects would be shared between calls
    if quarters is None:
        quarters = ['QTR1', 'QTR2', 'QTR3', 'QTR4']
    if cik2brokers is None:
        cik2brokers = {'years-covered': [], 'broker-dealers': {}}
    brokerDealers = cik2brokers.setdefault('broker-dealers', {})
    
    # seconds to wait on connect / between bytes before giving up on the SEC server
    requestTimeout = 60
    
    # archived data website for broker dealer data
    baseURL = 'https://www.sec.gov/Archives/edgar/full-index'
    
    # construct a list of URLs to search through, point to archived search
    archivedURLs = ['{}/{}/{}/form.idx'.format(baseURL, year, qt) for year in years for qt in quarters]
    print('Archived urls have been created and are stored locally\n')

    # iterate through years and quarters for archival search
    for searchURL in archivedURLs:
        print(searchURL)

        # send request to SEC website to retrieve broker dealer information 
        response = requests.get(searchURL, allow_redirects=True, timeout=requestTimeout)

        # an error page (e.g. a quarter that does not exist yet) is not an index, skip it
        # instead of feeding its HTML to the fixed-width parser
        if not response.ok:
            print('Skipped, HTTP status {}'.format(response.status_code))
            continue

        # extract only main text from body, selecting terms below dashes '---' 
        # we use triple dashes to avoid improper splits that exist locally with company names
        data = response.text.split('---')[-1]   

        # nothing below the header rule means there are no filings to parse
        if not data.strip():
            continue

        # convert text data to dataframe object using a fixed-width-file convention
        # (read from memory, so no temporary file can be left behind if parsing fails)
        df = pd.read_fwf(io.StringIO(data), header=None)
        cleanDf = df[~pd.isnull(df[0])]                          # strip away rows with NaN from the Form Type

        # check to see if first column contains information on X-17A-5 filings (use regex for X-17a flag)
        x17_check = cleanDf[0].str.contains('^x-17a', regex=True, flags=re.IGNORECASE)
        x17File = cleanDf[x17_check]

        # no X-17A-5 form type found in this quarter, move on
        if x17File.empty:
            continue

        # CIK number is taken from the last column of the rows splitting url string by row 
        # e.g. edgar/data/886475/0001019056-10-000046.txt -> 886475
        cikNumbers = x17File[x17File.columns[-1]].apply(lambda x: x.split('/')[2]).values

        # dict.fromkeys drops repeated CIKs while keeping first-seen order
        for elm in dict.fromkeys(cikNumbers):
            # the name lookup depends only on the CIK, so an already resolved CIK is not
            # fetched again; a stored None (failed lookup) is retried
            if brokerDealers.get(elm) is not None:
                continue
            brokerDealers[elm] = companyName(elm)         # build-up company CIK:Name dictionary

    return cik2brokers

In [None]:
# single-quarter run (2020, QTR1 only); the returned dictionary is displayed as the cell
# output and is not bound to a name
dealerData(years=[2020], quarters=['QTR1'])

In [14]:
# full crawl: every default quarter of every year from 1993 through the current year
# (np.arange excludes its stop value, hence the +1). Issues many HTTP requests to the SEC,
# so this cell is long-running.
cik2brokers = dealerData(years=np.arange(1993, datetime.datetime.today().year+1))

[]

In [21]:
# # exporting RegisteredDealer information
# with open('CIKandDealers.txt', 'w') as file:
#     json.dump(cik2brokers, file)
#     file.close()

# write to a JSON file with accompanying meta information about coverage 


In [22]:
# df = pd.DataFrame.from_dict(cik2brokers, orient='index').reset_index()
# df.columns = ['CIK', 'COMPANY NAME']
# df.to_csv('CIKandDealers.csv', index=False)

In [8]:
# reload the exported CSV as a plain dict: each row's first value becomes the key and the
# second the value (dict() over row pairs requires the file to have exactly two columns)
dict(pd.read_csv('CIKandDealers.csv').values)

{356628: 'NATIONAL FINANCIAL SERVICES LLC ',
 815855: 'MERRILL LYNCH GOVERNMENT SECURITIES OF PUERTO RICO INC  ',
 895502: 'RBC CAPITAL MARKETS, LLC ',
 63868: 'JULIAN R. MCDERMOTT & CO. ',
 354231: 'CMS INVESTMENT RESOURCES, LLC ',
 1074467: 'CLB CAPITAL                                             ',
 225439: 'HELFANT LAWRENCE LLC                                    ',
 62674: 'WORLD FIRST FINANCIAL SERVICES, INC. ',
 857605: 'NBC INTERNATIONAL (USA) LTD. ',
 754114: 'NOBLE CAPITAL MARKETS, INC. ',
 1099542: 'RBR SECURITIES, INC. ',
 80869: '1717 CAPITAL MANAGEMENT COMPANY ',
 949918: '1ST DISCOUNT BROKERAGE, INC. ',
 1092955: '247MARKET COM                                           ',
 1023991: '2480 SECURITIES LLC ',
 895338: '401(K) INVESTMENT SERVICES, INC. ',
 740897: 'NOVA CAPITAL MARKETS, LLC ',
 1001379: 'A & M SECURITIES, LLC ',
 931969: 'A.B. FINANCIAL L.L.C. ',
 95085: 'A.B. WATLEY, INC. ',
 1053726: 'A. CH. SECURITIES, INC. ',
 205021: 'INVESCO DISTRIBUTORS, INC. ',
 846844

In [11]:
if __name__ == "__main__":
    
    # s3 active folder for outputs
    bucket = "ran-s3-systemic-risk"
    folder ="Input/X-17A-5/Output/"

    # Amazon S3 client and SageMaker session
    # NOTE(review): `s3` is created but never used in this block
    s3 = boto3.client('s3')
    session = Session()

    # list the objects stored in the bucket under the folder prefix
    paths = session.list_s3_files(bucket, folder)
    
    # NOTE(review): presumably list_s3_files returns full object keys (prefix included), in
    # which case the bare file name below never matches -- confirm against the SageMaker docs.
    # NOTE(review): the bare name `dealerData` only evaluates the function object; nothing is
    # called here, so this branch currently has no effect. Intended action still to be written.
    if 'CIKandDealers.json' in paths: 
        dealerData
    

1
