In [1]:
# Run on first instance to install required libraries
%pip install PyPDF2
%pip install bs4

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
# console and directory access
import os
import re
import time 
import json
import urllib
import datetime

# interacting with Amazon AWS
import boto3
from sagemaker.session import Session

# data reading and exporting  
import pandas as pd
import numpy as np

# parsing SEC website for data  
import requests
from bs4 import BeautifulSoup

# pdf manipulation
from PyPDF2 import PdfFileReader, PdfFileWriter, utils

## CIK Extraction from RSSD Company Names
Extract URL links per companies as stored from the RSSD

In [52]:
def baseURL(company_name:str) -> str:
    """
    Constructs a base URL for searching for a paritcular SEC filing  
    ------------------------------------------------------------------------------------------
    Input:
        :param: cik (type str)
            The CIK number for a registreed broker-dealer (e.g. 1904)
        :param: file_type (type str)
            The file type looking to parse for a given broker-dealer (e.g. default X-17A-5)
            
    Return:
        :param: url (type str)
            A URL string that points to the EDGAR webpage of a registred broker dealer
            (e.g. https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=1904&type=X-17A-5&dateb=20201231)
    """
    
    # forming the SEC search URLs from the select CIK, file type and date range
    secFormat = 'https://www.sec.gov/cgi-bin/browse-edgar?company='     # SEC base url
    comp_name = '+'.join(company_name.split(' '))                       # company name in SEC format
    selection = '&match=&filenum=&State=&Country=&SIC=&myowner=exclude&action=getcompany'    

    # build lookup URLs for the SEC level data (base url)
    url = secFormat + comp_name + selection
    
    return url

In [68]:
def cikParse(url:str, company_name:str) -> str:
    """
    Parses the CIK webpage of a provided URL and returns a tuple of arrays/lists
    ------------------------------------------------------------------------------------------
    Input:
        :param: url (type str) 
            URL is a string representing a SEC website URL pointing to a CIK for X-17A-5 filings
            e.g. https://www.sec.gov/cgi-bin/browse-edgar?company=BEAR+STEARNS+ASSET+BACKED+SECURITIES+I+LLC&match=&filenum=&State=&Country=&SIC=&myowner=exclude&action=getcompany
        :param: company_name (type str) 
            The company name as represented from RSSD terms
    Return:
        :param: cik (type numpy array)
            
    """
    
    # read in HTML tables from the url link provided 
    try:
        # due to web-scrapping non-constant behavior (check against 100 tries)
        for _ in range(100):
            try: 
                filings = pd.read_html(url) 
                break
            except urllib.error.HTTPError: pass
            
            # if no table found then we return None
            except ValueError: return None
        
        # we are interested in the first table present within the html read
        filing_table = filings[0]    
        
        try:
            # a basic format for identifying companies
            filing_table['Company'] = filing_table['Company'].apply(lambda x: x.split('SIC')[0]).values   
            
            # return a the CIK that correspond to that company name 
            cik = filing_table[filing_table.Company == company_name].CIK.iloc[0]
            return cik
            
        except KeyError: 
            # a standard format used for storing filings
            
            # will need to request response form HTML server and then extract the CIK link from the href that exists
            # https://www.sec.gov/cgi-bin/browse-edgar?company=PEOPLES+STATE+BANK&match=&filenum=&State=&Country=&SIC=&myowner=exclude&action=getcompany
        
    # if there exists no active reports for a given CIK, we flag the error
    except IndexError:
        print('no CIK was present to match this firm\n')
        return None

## Main File Execution

In [74]:
if __name__ == "__main__":
    
    ffiec_names = pd.read_csv('FFIEC-NAMES.csv')
    name2cik = {}
    
    for name in ffiec_names.values:
        print('\nChecking for a CIK for {}'.format(name[0]))
        
        # compute the url for search and retrieve the accompanying CIK
        url = baseURL(name[0])
        print('\tcurrent URL is {}'.format(url))
        cik_num = cikParse(url, name[0])
        
        if cik_num is not None:
            # extend dictionary if a CIK mapping is present, report None (otherwise)
            name2cik[name[0]] = cik_num
        else:
            print('\tno CIK was found')


Checking for a CIK for BANK OF HANCOCK COUNTY
	current URL is https://www.sec.gov/cgi-bin/browse-edgar?company=BANK+OF+HANCOCK+COUNTY&match=&filenum=&State=&Country=&SIC=&myowner=exclude&action=getcompany
	no CIK was found

Checking for a CIK for UTILITY EMPLOYEES FEDERAL CREDIT UNION
	current URL is https://www.sec.gov/cgi-bin/browse-edgar?company=UTILITY+EMPLOYEES+FEDERAL+CREDIT+UNION&match=&filenum=&State=&Country=&SIC=&myowner=exclude&action=getcompany
	no CIK was found

Checking for a CIK for FIRST COMMUNITY BANK XENIA-FLORA
	current URL is https://www.sec.gov/cgi-bin/browse-edgar?company=FIRST+COMMUNITY+BANK+XENIA-FLORA&match=&filenum=&State=&Country=&SIC=&myowner=exclude&action=getcompany
	no CIK was found

Checking for a CIK for MINEOLA COMMUNITY BANK, SSB
	current URL is https://www.sec.gov/cgi-bin/browse-edgar?company=MINEOLA+COMMUNITY+BANK,+SSB&match=&filenum=&State=&Country=&SIC=&myowner=exclude&action=getcompany
	no CIK was found

Checking for a CIK for BISON STATE BANK
	

KeyError: 'Company'