In [1]:
import os
import re
import boto3
import itertools

import pandas as pd
import numpy as np
from sagemaker.session import Session

In [22]:
# create a darker background template (better for my eyes)
from jupyterthemes.stylefx import set_nb_theme
# set_nb_theme('chesterish')

In [58]:
def cleanNumeric(value) -> float:
    """
    This function converts a string to a numeric quantity, handles weird string format
    :param: value, string value with hidden numeric quanity  
    :return: floating point values
    
    Complexity -> O(n)
    
    e.g.
        In[0]: $ 19,225     ->   Out[0]: 19255
        In[0]: $ 19,225.76  ->   Out[0]: 19255.76
    """
    try:
        # some accounting formats take () to be negative numbers
        if value[0] == '(':
            value = '-' + value

        # perform regex operation scanning for only numeric quantities/identifiers
        cleanValue = re.sub("[^0-9|.|-]", "", value)
        try:
            return float(cleanValue)
        except ValueError:
            return 0.0
    except TypeError:
        return value

In [4]:
cleanNumeric('-$19,223')

-19223.0

In [5]:
def regexCheck(string:str, searchTerm:str):
    """
    Checks a regex expression for a given string construct
    string (type str)
        A string to perform a regex search on e.g. "Cash and cash equiavalents"
    searchTerm (type str)
        A regex expression to execute a search e.g. "^Cash"
        
    return: the searched term in question
    """
    try:
        s = re.search(searchTerm, string, flags=re.I)
        return s.string
    except AttributeError:
        return None

In [6]:
# sample = "Total Liabilities stockholder's equity"
sample = "MEMBER'S EQUITY"

In [7]:
regexCheck(sample, '^(?!.*liabilities).*equity$')

"MEMBER'S EQUITY"

In [8]:
# initiate s3 bucket and corresponding data folder
# bucket = "ran-s3-systemic-risk"
# data_folder ="Input/X-17A-5/"

bucket = "ran-s3-systemic-risk"
data_folder ="Output/BalanceSheet/"

# Amazon Textract client and Sagemaker session
textract = boto3.client('textract')
s3 = boto3.client('s3')
session = Session()

# discover all of the pdfs that you want to parse
paths = np.array(session.list_s3_files(bucket, data_folder))

In [9]:
'Total cleaned .csv files total {}'.format(paths.size)

'Total cleaned .csv files total 9745'

In [10]:
selections = np.random.choice(paths[1:], 100)

In [11]:
index = 40

In [12]:
# retrieving downloaded files from s3 bucket
s3.download_file(bucket, selections[index], 'temp.pdf')

df = pd.read_csv('temp.pdf')

In [13]:
df

Unnamed: 0.1,Unnamed: 0,0,1,2
0,0,,December,31
1,1,,2017,2016
2,2,,,
3,3,Assets:,,
4,4,Cash on deposit with affiliated company,"$ 2,922,359","$ 2,830,207"
5,5,Investment in money market mutual fund,75042914,64917222
6,6,Commissions receivable from clearing correspon...,72707,35187
7,7,Other receivables,7786079,6443100
8,8,Deposit with clearing correspondent,250000,250000
9,9,Goodwill,3493211,3493211


In [14]:
# clean dataframes for only the items and their immediate values
cleanDf = df[df.columns[1:3]].dropna() 

In [15]:
cleanDf['1'] = cleanDf['1'].apply(cleanNumeric)

In [16]:
cleanDf

Unnamed: 0,0,1
4,Cash on deposit with affiliated company,2922359.0
5,Investment in money market mutual fund,75042914.0
6,Commissions receivable from clearing correspon...,72707.0
7,Other receivables,7786079.0
8,Deposit with clearing correspondent,250000.0
9,Goodwill,3493211.0
10,Due from affiliates,720526.0
11,Deferred tax asset,175859.0
12,Total assets,90463655.0
14,Commissions payable to brokers and dealers,6949101.0


In [17]:
print('Data for {}\n'.format(selections[index]))

Data for Output/BalanceSheet/1048281-18.csv



In [56]:
def tMatch(totalA:float, totalLE:float):
    # helps match the accounting equation assets = libabilities + equities 
    try:
        multiplier = totalA / totalLE
        
        if multiplier == 0: return None
        elif (multiplier == 1) or (multiplier%10 == 0): return multiplier
        else multiplier%10 != 0: None}
        
    # if total liabilites doesn't exist, but total assets either exists or does not exist  
    except ZeroDivisionError:
        return None

In [19]:
def extraction(df:pd.DataFrame, file:str) -> pd.DataFrame:
    """
    Extract specific line items from cleaned dataframes (balance sheets)
    :param: df (type pd.DataFrame)
        The dataframe from
    :param: file (type str)
        The file string stored in the s3 bucket
    
    :return: A dataframe subset with critical lines extracted
    """
    
    # regex expression for searching for line items 
    keySearch = ['^cash', 'deposits', 'receivable', '^total assets', 'payable', 'total liabilities$', 'common stock', 
                 'preferred stock', 'earnings', '^(?!.*liabilities).*equity$|^(?!.*liabilities).*equity:$', 
                 '^total liabilities.*equity$|^total liabilities.*equity:$']
    
    # key names reflecting the corresponding the regex names
    keyNames = ['Cash & Equivalents', 'Deposits', 'Receivables', 'Total Assets', 'Payables', 'Total Liabilities',
                'Common Stock', 'Preferred Stock', 'Earnings', 'Total Equity', 'Total Liabilities & Equity']
    
    # concat the list of dataframe for each extraction 
    series = {}
    
    # creating two rows to track the CIK and year information released
    cik, year = file.split('/')[-1].split('-')
    
    series['CIK'] = cik          # CIK number for firm 
    series['Year'] = year[:2]    # Year for firm filing  
    
    # select key for search names
    for i, key in enumerate(keySearch):
        # filter regex for corresponding string expression
        filterSet = df[df.columns[0]].str.contains(key, regex=True, flags=re.IGNORECASE)
        
        # matching names for search and constructing a row (dictionary form)
        # filter dataframe and sum corresponding data column
        series[keyNames[i]] = df[filterSet][df.columns[1]].sum()
  
    # match the T-table, asset/liabilities/equity for rows
    mul = tMatch(series['Total Assets'], series['Total Liabilities & Equity'])
    
    if mul != None:
        series['Total Liabilities & Equity'] = series['Total Liabilities & Equity'] * mul
        
        # computing the series for other available line item
        series['Other Assets'] = series['Total Assets'] - series['Cash & Equivalents'] - series['Deposits'] - series['Receivables']
        series['Other Liabilities'] = series['Total Liabilities'] - series['Payables']
        series['Other Equity'] = series['Total Equity'] - series['Common Stock'] - series['Preferred Stock'] - series['Earnings']

        # convert the filtered dictionaries to a dataframe
        comboDF = pd.DataFrame.from_dict(series, orient='index')

        return comboDF.transpose()
    

In [20]:
extraction(cleanDf, selections[index])

{'CIK': '1048281', 'Year': '18', 'Cash & Equivalents': 2922359.0, 'Deposits': 0.0, 'Receivables': 7858786.0, 'Total Assets': 90463655.0, 'Payables': 6949101.0, 'Total Liabilities': 8624389.0, 'Common Stock': 0.0, 'Preferred Stock': 0.0, 'Earnings': 73397789.0, 'Total Equity': 81839266.0, 'Total Liabilities & Equity': 90463655.0} 



Unnamed: 0,CIK,Year,Cash & Equivalents,Deposits,Receivables,Total Assets,Payables,Total Liabilities,Common Stock,Preferred Stock,Earnings,Total Equity,Total Liabilities & Equity,Other Assets,Other Liabilities,Other Equity
0,1048281,18,2922360.0,0,7858790.0,90463700.0,6949100.0,8624390.0,0,0,73397800.0,81839300.0,90463700.0,79682500.0,1675290.0,8441480.0


In [32]:
itters = []
error = 0

for file in selections:
    # retrieving downloaded files from s3 bucket
    s3.download_file(bucket, file, 'temp.pdf')
    df = pd.read_csv('temp.pdf')
    
    # clean dataframes for only the items and their immediate values
    cleanDF = df[df.columns[1:3]].dropna() 
    
    # convert each string item to a numeric quantity
    cleanDF[cleanDF.columns[1]] = cleanDF[cleanDF.columns[1]].apply(cleanNumeric)
    
    # export data extraction from dataframe
    tempDF = extraction(cleanDF, file)
    
    if tempDF is not None:
        itters.append(tempDF)
    else:
        print(file)
        error += 1
        
    # remove local file after it has been created
    os.remove('temp.pdf')

Output/BalanceSheet/1052641-07.csv
{'CIK': '1052641', 'Year': '07', 'Cash & Equivalents': 0.0, 'Deposits': 0.0, 'Receivables': 0.0, 'Total Assets': 0.0, 'Payables': 14800.0, 'Total Liabilities': 0.0, 'Common Stock': 0.0, 'Preferred Stock': 0.0, 'Earnings': -2456359.0, 'Total Equity': 0.0, 'Total Liabilities & Equity': 0.0} 

Output/BalanceSheet/1005742-02.csv
{'CIK': '1005742', 'Year': '02', 'Cash & Equivalents': 0.0, 'Deposits': 0.0, 'Receivables': 0.0, 'Total Assets': 0.0, 'Payables': 7584.0, 'Total Liabilities': 0.0, 'Common Stock': 0.0, 'Preferred Stock': 0.0, 'Earnings': 31.0, 'Total Equity': 0.0, 'Total Liabilities & Equity': 0.0} 

Output/BalanceSheet/1007497-04.csv
{'CIK': '1007497', 'Year': '04', 'Cash & Equivalents': 11081.0, 'Deposits': 0.0, 'Receivables': 2081.0, 'Total Assets': 18637.0, 'Payables': 1561.0, 'Total Liabilities': 1561.0, 'Common Stock': 0.0, 'Preferred Stock': 0.0, 'Earnings': 0.0, 'Total Equity': 0.0, 'Total Liabilities & Equity': 0.0} 

Output/BalanceSheet/

Output/BalanceSheet/1001366-07.csv
{'CIK': '1001366', 'Year': '07', 'Cash & Equivalents': 147150.0, 'Deposits': 0.0, 'Receivables': 319520.0, 'Total Assets': 555719.0, 'Payables': 107488.0, 'Total Liabilities': 107488.0, 'Common Stock': 0.0, 'Preferred Stock': 0.0, 'Earnings': 274832.0, 'Total Equity': 448231.0, 'Total Liabilities & Equity': 555719.0} 

Output/BalanceSheet/1004468-10.csv
{'CIK': '1004468', 'Year': '10', 'Cash & Equivalents': 0.0, 'Deposits': 0.0, 'Receivables': 0.0, 'Total Assets': 0.0, 'Payables': 110.0, 'Total Liabilities': 0.0, 'Common Stock': 800.0, 'Preferred Stock': 0.0, 'Earnings': 1995.0, 'Total Equity': 9995.0, 'Total Liabilities & Equity': 10105.0} 

Output/BalanceSheet/1026209-03.csv
{'CIK': '1026209', 'Year': '03', 'Cash & Equivalents': 31996.0, 'Deposits': 0.0, 'Receivables': 22417.0, 'Total Assets': 69161.0, 'Payables': 1126.0, 'Total Liabilities': 0.0, 'Common Stock': 100.0, 'Preferred Stock': 0.0, 'Earnings': 274703.0, 'Total Equity': 66165.0, 'Total Li

Output/BalanceSheet/1049770-10.csv
{'CIK': '1049770', 'Year': '10', 'Cash & Equivalents': 68694.0, 'Deposits': 8219.0, 'Receivables': 28217.0, 'Total Assets': 0.0, 'Payables': 5000.0, 'Total Liabilities': 0.0, 'Common Stock': 0.0, 'Preferred Stock': 0.0, 'Earnings': 75665.0, 'Total Equity': 0.0, 'Total Liabilities & Equity': 0.0} 

Output/BalanceSheet/1071392-03.csv
{'CIK': '1071392', 'Year': '03', 'Cash & Equivalents': 193138.0, 'Deposits': 0.0, 'Receivables': 36350.0, 'Total Assets': 307569.0, 'Payables': 51535.0, 'Total Liabilities': 57321.0, 'Common Stock': 0.0, 'Preferred Stock': 0.0, 'Earnings': 0.0, 'Total Equity': 250248.0, 'Total Liabilities & Equity': 307569.0} 

Output/BalanceSheet/1013851-06.csv
{'CIK': '1013851', 'Year': '06', 'Cash & Equivalents': 152081.0, 'Deposits': 0.0, 'Receivables': 46445.0, 'Total Assets': 567177.0, 'Payables': 10141.0, 'Total Liabilities': 20725.0, 'Common Stock': 0.0, 'Preferred Stock': 0.0, 'Earnings': 154956.0, 'Total Equity': 546452.0, 'Total 

TypeError: 'float' object is not subscriptable

In [23]:
# # export database built to .csv file
# pd.concat(itters).to_csv('sample.csv', index=False)

In [42]:
pd.concat(itters)

Unnamed: 0,CIK,Year,Cash & Equivalents,Deposits,Receivables,Total Assets,Payables,Total Liabilities,Common Stock,Preferred Stock,Earnings,Total Equity,Total Liabilities & Equity,Other Assets,Other Liabilities,Other Equity
0,1021474,4,14933.0,10000,13113.0,51552.0,5463.0,5463.0,0,0,0.0,46089.0,51552.0,13506.0,0.0,46089.0
0,1007779,3,10284.0,0,0.0,0.0,0.0,0.0,0,0,0.0,10284.0,0.0,-10284.0,0.0,10284.0
0,1076676,18,21133.0,0,0.0,64365.0,15000.0,0.0,0,0,-25635.0,49365.0,64365.0,43232.0,-15000.0,75000.0
0,1021958,2,0.0,0,0.0,0.0,0.0,0.0,0,0,101658.0,0.0,0.0,0.0,0.0,-101658.0
0,1060237,15,10675200.0,0,0.0,12183600.0,234057.0,1422570.0,0,0,8011160.0,10761100.0,12183600.0,1508390.0,1188510.0,2749910.0
0,1056199,7,10493.0,0,0.0,14493.0,0.0,0.0,0,0,0.0,0.0,14493.0,4000.0,0.0,0.0
0,1050649,12,8496.0,0,1500.0,12424.0,0.0,0.0,0,0,0.0,11583.0,12424.0,2428.0,0.0,11583.0
0,1028562,5,5148.0,0,46409.0,70353.0,43234.0,43234.0,0,0,0.0,27119.0,70353.0,18796.0,0.0,27119.0
0,1038993,10,5454870.0,145011,2720920.0,8508220.0,6893670.0,6968670.0,0,0,0.0,1494550.0,8508220.0,187416.0,75000.0,1494550.0
0,1070092,14,18367500.0,750006,479809.0,141869000.0,16557700.0,121165000.0,10,0,9724460.0,20704100.0,141869000.0,122272000.0,104608000.0,10979700.0


In [27]:
error / len(selections)

0.2

In [59]:
a = df[df.columns[1:3]].dropna() 

In [60]:
a

Unnamed: 0,0,1
11,1. from outsiders*g $ 2. includes equity subor...,970.0
12,of $,980.0
13,"B. Securities borrowings, at market value from...",990.0
15,1. from outsiders $,1000.0
16,2. includes equity subordination (15c3-1(d)) of $,1010.0


In [62]:
a[a.columns[1]].apply(cleanNumeric)

11     970.0
12     980.0
13     990.0
15    1000.0
16    1010.0
Name: 1, dtype: float64