In [1]:
# console and directory access
import os
import re
import datetime

# data reading and exporting  
import io
import json
import pandas as pd
import numpy as np

# parsing SEC website for data  
import requests 
from bs4 import BeautifulSoup

## Archived Dealer Data Import from 1993
Parses dealer information, together with the accompanying CIK code used for EDGAR lookup, from the SEC dealer registration filings.

In [2]:
def companyName(cik, timeout=30, headers=None) -> str:
    """
    Returns the company name for a given CIK number from the SEC by parsing the Edgar site
    e.g. https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=1904&type=X-17A-5&dateb=20201231
    
    Input:
        :param: cik = e.g. 887767
        :param: timeout = seconds to wait for the SEC server before requests raises
                          a timeout error (default 30); prevents one stalled
                          connection from hanging a long scraping run
        :param: headers = optional dict of HTTP headers sent with the request,
                          e.g. {'User-Agent': 'Name contact@example.com'};
                          None sends the requests defaults
    Output:
        :param: str = e.g. 1ST GLOBAL CAPITAL CORP. 
                      None is returned when the page contains no
                      <span class="companyName"> element (unknown CIK, error page, ...)
    """
    # establishing base-url for company name search
    baseURL = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&'
    url = baseURL+'CIK={}&type=X-17A-5&dateb=20201231'.format(cik)
    
    # retrieve the company page, returning beautifulsoup object
    res = requests.get(url, allow_redirects=True, timeout=timeout, headers=headers)
    s1 = BeautifulSoup(res.text, 'html.parser')
    
    # only the first companyName span is ever used, so look up that one directly
    nameTag = s1.find('span', attrs={"class":"companyName"})
    if nameTag is None:
        return None
    
    # keep the text in front of the 'CIK' label and drop a '/BD' suffix if present
    return nameTag.text.split('CIK')[0].split('/BD')[0]

In [None]:
# additional archived data for company pre-2007
baseURL = 'https://www.sec.gov/Archives/edgar/full-index/'
years = np.arange(1993, datetime.datetime.today().year+1)
quarters = ['QTR1', 'QTR2', 'QTR3', 'QTR4']

# initialize broker-dealer information for storage {key : value} -> {CIK : Company Name}
cik2brokers = {}

# construct a list of URLs to search through, point to archived search
archivedURLs = ['{}{}/{}/form.idx'.format(baseURL, year, qt) for year in years for qt in quarters]
print('Archived urls have been created and are stored')

# iterate through years and quarters for archival search
for searchURL in archivedURLs:
    print(searchURL)

    # send request to SEC website to retrieve broker dealer information
    # (timeout so a single stalled download cannot hang the whole run)
    response = requests.get(searchURL, allow_redirects=True, timeout=60)

    # the URL list covers every quarter of the current year, so some indexes may not
    # exist yet; skip any non-success response instead of parsing an error page
    if not response.ok:
        print('   skipped (HTTP status {})'.format(response.status_code))
        continue
    
    # e.g. response data format retrieved from the archived set
    # Description:           Master Index of EDGAR Dissemination Feed by Company Name
    # Last Data Received:    March 31, 2010
    # Comments:              webmaster@sec.gov
    # Anonymous FTP:         ftp://ftp.sec.gov/edgar/
    #
    # Company Name      Form Type   CIK         Date Filed  File Name
    # -------------------------------------------------------------------------------------------------------
    # 'mktg, inc.'      10-K        886475      2010-01-20  edgar/data/886475/0001019056-10-000046.txt          
    # 'mktg, inc.'      10-Q        886475      2010-01-20  edgar/data/886475/0001019056-10-000047.txt          
    # 'mktg, inc.'      10-Q        886475      2010-01-20  edgar/data/886475/0001019056-10-000048.txt          

    # extract only main text from body, selecting terms below dashes '---' 
    data = response.text.split('---')[-1]
    if not data.strip():
        continue                                             # nothing listed below the header

    # parse the fixed-width text from an in-memory buffer; no temporary file is
    # written, so nothing is left on disk if parsing fails part-way through
    df = pd.read_fwf(io.StringIO(data), header=None)
    cleanDf = df[df[0].notna()]                              # strip away rows with NaN from the Form Type
    
    # check to see if first column contains information on X-17A-5 filings (use regex for X-17a flag)
    x17_check = cleanDf[0].str.contains('^x-17a', regex=True, flags=re.IGNORECASE)
    x17File = cleanDf[x17_check]

    # move on when no X-17A-5 form type was found in this quarter
    if x17File.empty:
        continue

    # CIK number is taken from the last column of the rows splitting url string by row 
    # e.g. edgar/data/886475/0001019056-10-000046.txt -> 886475
    # rows without a third path component are dropped, repeated CIKs are collapsed
    cikNumbers = x17File[x17File.columns[-1]].str.split('/').str[2].dropna().unique()
    
    # iterate through CIK elements  
    for elm in cikNumbers:
        # one SEC lookup per CIK is enough: skip CIKs that already have a name,
        # but retry those whose earlier lookup returned None
        if cik2brokers.get(elm) is None:
            cik2brokers[elm] = companyName(elm)     # build-up company CIK:Name dictionary

Archived urls have been created and are stored
https://www.sec.gov/Archives/edgar/full-index/1993/QTR1/form.idx
https://www.sec.gov/Archives/edgar/full-index/1993/QTR2/form.idx
https://www.sec.gov/Archives/edgar/full-index/1993/QTR3/form.idx
https://www.sec.gov/Archives/edgar/full-index/1993/QTR4/form.idx
https://www.sec.gov/Archives/edgar/full-index/1994/QTR1/form.idx
https://www.sec.gov/Archives/edgar/full-index/1994/QTR2/form.idx
https://www.sec.gov/Archives/edgar/full-index/1994/QTR3/form.idx
https://www.sec.gov/Archives/edgar/full-index/1994/QTR4/form.idx
https://www.sec.gov/Archives/edgar/full-index/1995/QTR1/form.idx
https://www.sec.gov/Archives/edgar/full-index/1995/QTR2/form.idx
https://www.sec.gov/Archives/edgar/full-index/1995/QTR3/form.idx
https://www.sec.gov/Archives/edgar/full-index/1995/QTR4/form.idx
https://www.sec.gov/Archives/edgar/full-index/1996/QTR1/form.idx
https://www.sec.gov/Archives/edgar/full-index/1996/QTR2/form.idx
https://www.sec.gov/Archives/edgar/full-ind

In [None]:
# exporting RegisteredDealer information as JSON ({CIK : Company Name});
# the with-statement closes the file on exit, so no explicit close() is needed
with open('CIKandDealers.txt', 'w') as file:
    json.dump(cik2brokers, file)