In [25]:
import os
import re
import boto3
import json
import requests

import pandas as pd
import numpy as np

from sagemaker.session import Session
from bs4 import BeautifulSoup

In [26]:
# S3 bucket that holds the balance-sheet extracts and the key prefix ("folder") to scan
bucket = "ran-s3-systemic-risk"
data_folder = "Output/X-17A-5-BS/"

# AWS clients (Textract and S3) plus a SageMaker session used to list bucket contents
textract = boto3.client('textract')
s3 = boto3.client('s3')
session = Session()

# list every object key under data_folder; the keys (minus the first) are later
# handed to unstructured_data, which downloads each one and reads it with pd.read_csv
paths = np.array(session.list_s3_files(bucket, data_folder))

In [27]:
def cleanNumeric(value) -> float:
    """
    This function converts a string to a numeric quantity, handles weird string format
    :param: value, string value with hidden numeric quantity (currency symbols,
            thousands separators, parentheses for negatives), an int/float that
            is already numeric, or a numpy array holding any mix of those
    :return: floating point value for a string (np.nan when no number can be
             recovered), the value itself for an int/float, a float numpy array
             with one entry per element for an array, np.nan for any other type

    Complexity -> O(n)

    e.g.
        In[0]: $ 19,225     ->   Out[0]: 19225.0
        In[0]: $ 19,225.76  ->   Out[0]: 19225.76
        In[0]: $ (1,250)    ->   Out[0]: -1250.0

    """

    def num_strip(number):
        """
        Nested function for extracting the numerical quantity from one element
        """
        # bool is a subclass of int but never a meaningful figure here
        if isinstance(number, (bool, np.bool_)):
            return np.nan

        # elements that are already numeric (e.g. a column pandas parsed as
        # float) pass straight through instead of being lost as NaN
        if isinstance(number, (int, float, np.integer, np.floating)):
            return float(number)

        # anything that is neither numeric nor text carries no number
        if not isinstance(number, str):
            return np.nan

        # some accounting formats take () to be negative numbers; leading
        # blanks and a dollar sign may precede the opening parenthesis
        if number.lstrip(' \t$').startswith('('):
            number = '-' + number

        # keep only digits, the decimal point and the minus sign; '|' is a
        # literal inside a character class, so it is deliberately not listed
        cleanValue = re.sub(r"[^0-9.\-]", "", number)

        # last check against poor lagging formats e.g. "." or "-" to return nan or floating-point number
        try:
            return float(cleanValue)
        except ValueError:
            return np.nan

    # numpy array: extract element by element; otypes fixes the output dtype
    # up front, which also lets np.vectorize accept zero-length arrays
    if isinstance(value, np.ndarray):
        vFunc = np.vectorize(num_strip, otypes=[float])
        return vFunc(value)

    # plain numbers need no modification and are returned as given
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        return value

    # strings (including empty ones) and every other type go through the
    # element-level cleaner, which falls back to np.nan
    return num_strip(value)

In [28]:
def companyName(cik, timeout=30) -> str:
    """
    Returns the company name for a given CIK number from the SEC by parsing the Edgar site
    e.g. https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=1904&type=X-17A-5&dateb=20201231
    
    Input:
        :param: cik (type str)
            The CIK number for a broker dealer e.g. 887767
        :param: timeout (type int or float)
            Seconds to wait for the SEC server before requests raises a
            timeout error; without it a stalled connection blocks forever
    Return:
        :param: (type str)
            Returns the accompanying name with the CIK provided e.g. 1ST GLOBAL CAPITAL CORP.
            Surrounding whitespace is stripped. None is returned when the
            page contains no <span class="companyName"> element.
    """
    # query parameters are passed separately so requests builds and encodes the URL
    baseURL = 'https://www.sec.gov/cgi-bin/browse-edgar'
    query = {'action': 'getcompany', 'CIK': cik, 'type': 'X-17A-5', 'dateb': '20201231'}
    
    # NOTE(review): no User-Agent header is sent; SEC fair-access guidance
    # reportedly expects automated clients to identify themselves - confirm
    res = requests.get(baseURL, params=query, allow_redirects=True, timeout=timeout)
    s1 = BeautifulSoup(res.text, 'html.parser')
    
    # only the first company-information span is used
    info = s1.find('span', attrs={"class": "companyName"})
    if info is None:
        return None
    
    # the span text carries the name followed by "CIK..." details; a "/BD"
    # suffix on the name is dropped as well
    return info.text.split('CIK')[0].split('/BD')[0].strip()

In [29]:
# read the JSON-encoded asset and liability line-item collections from disk
with open('assetLines.txt', 'r') as f:
    assetSide = json.load(f)
with open('liabilityLines.txt', 'r') as f:
    liableSide = json.load(f)

In [30]:
# template dictionaries: every known line item starts out as NaN
assetDict = dict.fromkeys(assetSide, np.nan)
liableDict = dict.fromkeys(liableSide, np.nan)

In [31]:
def _line_item_values(table, lineItems) -> dict:
    """
    Maps each requested line item found in a balance-sheet table to its value
    
    :param: table
        dataframe whose first column holds line-item labels and whose
        remaining columns hold the reported figures as raw strings/numbers
    :param: lineItems
        list of line items (asset/liabilites) that will be searched for
        
    :return: dict
        {line item: value} for every matched item with a usable figure in
        its first or, failing that, second value column
    """
    labelColumn = table.columns[0]
    
    # keep only rows whose label is a requested line item, labels become the index
    matched = table[np.isin(table[labelColumn], lineItems)].set_index(labelColumn)
    if matched.empty:
        return {}
    
    # convert string values to numerical figures column by column (addressed by
    # position); the line-item index is attached explicitly
    numeric = pd.DataFrame(
        {pos: cleanNumeric(matched.iloc[:, pos].values) for pos in range(matched.shape[1])},
        index=matched.index,
    )
    
    # repeating 'item' lines (e.g. 2 Prepaid expense categories) are summed per
    # column; min_count=1 keeps a column that is blank on every repeat as NaN
    # instead of 0, so the second-column fallback below still applies
    totals = numeric.groupby(level=0, sort=False).sum(min_count=1)
    
    values = {}
    for item, lineVal in totals.iterrows():
        # some dataframes have multiple year releases (e.g. FY 2020, FY 2019):
        # take the first column, or the second if the first is blank
        for value in lineVal.iloc[:2]:
            if not np.isnan(value):
                values[item] = value
                break
    return values


def unstructured_data(filepaths, lineDictionary, lineItems) -> pd.DataFrame:
    """
    Forms unstructured data frame from .csv file(s) located in s3 bucket
    
    Relies on the module-level names `s3` (boto3 client), `bucket`,
    `companyName` and `cleanNumeric`.
    
    :param: filepaths
        filepaths from s3 that store .csv file(s) (/Output/BalanceSheet/);
        the last path component must look like "<CIK>-<year...>"
    :param: lineDictionary
        dictionary of total unstructured line items and corresponding values,
        copied for every file so the template itself is never modified
    :param: lineItems
        list of line items (asset/liabilites) that will be searched for
        
    :return: pd.DataFrame
        One row per usable file holding the line-item values plus 'CIK',
        'Year' and 'Name'. Files whose name cannot be split into CIK and year,
        or whose table has a single column, are reported and left out. With
        no usable file an empty dataframe with the expected columns is returned.
    """
    
    # local scratch file; the object is parsed as CSV regardless of this name
    localCopy = 'temp.pdf'
    
    rows = []      # one single-row dataframe per usable file
    names = {}     # company names already looked up, keyed by CIK
    
    # iterate through files from s3 
    for file in filepaths:
        
        # the file name encodes the CIK and the filing date
        parts = file.split('/')[-1].split('-')
        if len(parts) != 2:
            print('{} - skipped, file name is not of the form <CIK>-<year>'.format(file))
            continue
        cik, year = parts
        
        # retrieving downloaded files from s3 bucket; the local copy is removed
        # even when the download or the parse fails
        try:
            s3.download_file(bucket, file, localCopy)
            df = pd.read_csv(localCopy)
        finally:
            if os.path.exists(localCopy):
                os.remove(localCopy)
        
        # clean dataframe should be of size greater than 1
        if len(df.columns) <= 1:
            print('{} - encountered issue reading PDF'.format(file))
            continue
        
        # one SEC lookup per CIK rather than one per file; a failed (None)
        # lookup is retried on the next file of the same firm
        if names.get(cik) is None:
            names[cik] = companyName(cik)
        
        # create temporary dictionary copy for storage of values
        tempDict = lineDictionary.copy()
        tempDict['CIK'] = cik                 # CIK number for firm 
        tempDict['Year'] = year[:4]           # Year for firm filing  
        tempDict['Name'] = names[cik]         # name associated with the CIK
        tempDict.update(_line_item_values(df, lineItems))
        
        # convert the dictionary values to a one-row dataframe for database construction 
        rows.append(pd.DataFrame.from_dict(tempDict, orient='index').T)
    
    # pd.concat refuses an empty list, so hand back an empty table instead
    if not rows:
        return pd.DataFrame(columns=list(lineDictionary) + ['CIK', 'Year', 'Name'])
    
    return pd.concat(rows)

In [32]:
def dfBuild(df:pd.DataFrame) -> pd.DataFrame:
    """
    Tidies a table of line items for export
    
    :param: df
        dataframe that contains 'CIK', 'Name' and 'Year' columns next to
        the line-item columns
        
    :return: pd.DataFrame
        The same rows with 'CIK', 'Name', 'Year' moved to the front (in that
        order) and every column that is null in all rows removed
    """
    identifiers = ['CIK', 'Name', 'Year']
    
    # identifier columns lead, all remaining columns keep their relative order
    lineColumns = [column for column in df.columns if column not in identifiers]
    ordered = df[identifiers + lineColumns]
    
    # a column survives when at least one of its rows carries a value
    entirelyNull = ordered.isnull().all()
    populated = entirelyNull.index[~entirelyNull.values]
    
    return ordered[populated]

In [33]:
# build the unstructured asset table from the files listed in `paths`
# NOTE(review): paths[1:] drops the first listed key - presumably the folder
# placeholder object itself; confirm against the bucket listing
assetItters = unstructured_data(paths[1:], assetDict, assetSide)
# move CIK/Name/Year to the front and drop line items that are empty for every file
assetDF = dfBuild(assetItters)

In [34]:
# save the asset table to a local CSV; index=False keeps the row index out of the file
assetDF.to_csv('unstructAsset.csv', index=False)

In [35]:
# build the unstructured liability table from the files listed in `paths`
# NOTE(review): paths[1:] drops the first listed key - presumably the folder
# placeholder object itself; confirm against the bucket listing
liableItters = unstructured_data(paths[1:], liableDict, liableSide)
# move CIK/Name/Year to the front and drop line items that are empty for every file
liableDF = dfBuild(liableItters)

In [36]:
# save the liability table to a local CSV; index=False keeps the row index out of the file
liableDF.to_csv('unstructLiable.csv', index=False)