In [1]:
import os
import re
import boto3
import itertools

import pandas as pd
import numpy as np
from sagemaker.session import Session

In [2]:
# create a darker background template (better for my eyes)
from jupyterthemes.stylefx import set_nb_theme
# set_nb_theme('chesterish')

In [3]:
def cleanNumeric(value) -> float:
    """
    Convert an accounting-style string to a float.

    Every character other than digits, '.' and '-' is removed before the
    conversion, so currency symbols, thousands separators and stray spaces
    are ignored. A value that starts with an opening parenthesis, e.g.
    "(1,234)", is treated as negative.

    :param: value, string value with a hidden numeric quantity; non-string
        inputs (ints, floats, None, NaN, ...) are returned unchanged
    :return: the parsed float, or 0.0 when nothing that parses as a number is
        left after cleaning (empty string, text-only labels, "1-2", ...)

    Complexity -> O(n) in the length of the string

    e.g.
        In[0]: $ 19,225     ->   Out[0]: 19225.0
        In[0]: $ 19,225.76  ->   Out[0]: 19225.76
        In[0]: (19,225)     ->   Out[0]: -19225.0
    """
    # numeric / missing cells are passed through untouched
    if not isinstance(value, str):
        return value

    # surrounding whitespace must not hide a leading "(" (and "" must not crash)
    text = value.strip()

    # some accounting formats take () to be negative numbers
    if text.startswith('('):
        text = '-' + text

    # keep only digits, the decimal point and the minus sign; inside a character
    # class "|" is a literal pipe, not alternation, so it is not listed here
    cleanValue = re.sub(r"[^0-9.\-]", "", text)

    try:
        return float(cleanValue)
    except ValueError:
        # nothing numeric survived the cleaning (or the remainder is malformed)
        return 0.0

In [4]:
# sanity check: the currency symbol and thousands separator are dropped, the sign is kept
cleanNumeric('-$19,223')

-19223.0

In [5]:
def regexCheck(string:str, searchTerm:str):
    """
    Test a string against a case-insensitive regular expression.

    string (type str)
        The text to search, e.g. "Cash and cash equivalents"
    searchTerm (type str)
        The regex pattern to look for, e.g. "^Cash"

    return: the complete input string when the pattern is found somewhere in
        it (not just the matched portion), otherwise None
    """
    found = re.search(searchTerm, string, flags=re.I)

    # Match.string is the text that was searched, i.e. the original input
    return None if found is None else found.string

In [6]:
# alternative input, kept for reference (it mentions "liabilities"):
# sample = "Total Liabilities stockholder's equity"
sample = "MEMBER'S EQUITY"

In [7]:
# the pattern accepts text ending in "equity" that does not contain "liabilities"
regexCheck(sample, '^(?!.*liabilities).*equity$')

"MEMBER'S EQUITY"

In [8]:
def tMatch(totalA:float, totalLE:float):
    """
    Check the accounting equation assets = liabilities + equity.

    totalA (type float)
        Total assets
    totalLE (type float)
        Total liabilities & equity

    return: a tuple (multiplier, message)
        multiplier is totalA / totalLE when that ratio is exactly 1 or a
        multiple of 10, otherwise None; message describes the outcome
    """
    # Test the denominator explicitly: plain Python numbers raise
    # ZeroDivisionError, but numpy scalars (what pandas .sum() returns) divide
    # to inf/nan with only a RuntimeWarning, so an except clause never fires.
    if totalLE == 0:
        return (None, "Either Liabilities/Equities doesn't exist or both Total Assets and Liabilities/Equities missing")

    multiplier = totalA / totalLE

    # if multiplier is zero, then total asset column is zero, we return nothing (imperfect)
    if multiplier == 0:
        return (None, 'Asset table is missing')

    # we accept only 1 or multiples of 10 (perfect and semi-perfect)
    # NOTE(review): `% 10 == 0` also accepts 20, 30, ... - confirm whether only
    # powers of ten (a units mismatch such as thousands) were intended
    if (multiplier == 1) or (multiplier % 10 == 0):
        return (multiplier, 'Perfect/Semi-Perfect match')

    # all other multipliers, should return nothing (number mismatch)
    return (None, 'Accounting Error')

In [9]:
# one (total assets, total liabilities & equity) pair per tMatch outcome
checkPairs = [(0, 100), (0, 0), (100, 0), (100, 100), (10000, 100), (102020, 11)]

for assets, liabEq in checkPairs:
    print(tMatch(totalA=assets, totalLE=liabEq))

(None, 'Asset table is missing')
(None, "Either Liabilities/Equities doesn't exist or both Total Assets and Liabilities/Equities missing")
(None, "Either Liabilities/Equities doesn't exist or both Total Assets and Liabilities/Equities missing")
(1.0, 'Perfect/Semi-Perfect match')
(100.0, 'Perfect/Semi-Perfect match')
(None, 'Accounting Error')


In [10]:
# s3 bucket and the key prefix under which the balance-sheet files are stored
bucket = "ran-s3-systemic-risk"
data_folder ="Output/BalanceSheet/"

# AWS clients (Textract, S3) and the Sagemaker session used to list bucket keys
# NOTE(review): `textract` is not referenced by any cell visible here
textract = boto3.client('textract')
s3 = boto3.client('s3')
session = Session()

# list every key found under data_folder (the balance-sheet files to process)
paths = np.array(session.list_s3_files(bucket, data_folder))

In [11]:
# one entry is subtracted to match the paths[1:] slices used later on
# NOTE(review): presumably the first key is the folder itself - confirm
print('Total cleaned .csv files total {}'.format(paths.size-1))

Total cleaned .csv files total 10713


## Test Case Example

In [12]:
# fixed seed so a fresh "Restart & Run All" spot-checks the same 100 files
# (sampling is with replacement, so a key may appear more than once)
selections = np.random.RandomState(42).choice(paths[1:], 100)

In [13]:
# position within `selections` of the file used for the worked example
index = 40

In [14]:
# download the selected file from the s3 bucket into a scratch file and load it;
# the scratch file holds .csv content despite its name and is removed as soon as
# it has been read, so no stale copy is left in the working directory
s3.download_file(bucket, selections[index], 'temp.pdf')

try:
    df = pd.read_csv('temp.pdf')
finally:
    os.remove('temp.pdf')

In [15]:
# complete .csv file as read from the s3 bucket
df

Unnamed: 0.1,Unnamed: 0,0,1
0,0,Cash,"$ 39,499"
1,1,Due from Clearing Broker,242638316
2,2,"Securities owned, at market value (Note 5)",38087111
3,3,JBO investment,50000
4,4,Equipment (net of accumulated depreciation of ...,151499
5,5,Leasehold improvements (net of accumulated amo...,31172
6,6,TOTAL ASSETS,"$ 280,997,597"
7,7,LIABILITIES AND MEMBERS' EQUITY,
8,8,Accounts payable and accrued expenses,"$ 128,392"
9,9,"Securities sold, not yet purchased, at market",243570454


In [16]:
# keep only the second and third columns (line-item names and their immediate
# values) and drop every row that has a missing entry in either of them
cleanDf = df[df.columns[1:3]].dropna() 

In [17]:
# convert the value column from accounting-style strings to numeric quantities
cleanDf['1'] = cleanDf['1'].apply(cleanNumeric)

In [18]:
cleanDf

Unnamed: 0,0,1
0,Cash,39499.0
1,Due from Clearing Broker,242638316.0
2,"Securities owned, at market value (Note 5)",38087111.0
3,JBO investment,50000.0
4,Equipment (net of accumulated depreciation of ...,151499.0
5,Leasehold improvements (net of accumulated amo...,31172.0
6,TOTAL ASSETS,280997597.0
8,Accounts payable and accrued expenses,128392.0
9,"Securities sold, not yet purchased, at market",243570454.0
10,TOTAL LIABILITIES,243698846.0


In [19]:
# s3 key of the file used for the worked example
print('Data for {}'.format(selections[index]))

Data for Output/BalanceSheet/1049854-03.csv


In [20]:
def extraction(df:pd.DataFrame, file:str) -> 'pd.DataFrame | str':
    """
    Extract specific line items from a cleaned dataframe (balance sheet)
    :param: df (type pd.DataFrame)
        Two-column frame: the first column holds the line-item names, the
        second column the numeric values that are summed per line item
    :param: file (type str)
        The file string stored in the s3 bucket; the part after the last '/'
        must split on '-' into exactly two pieces (CIK and year text)

    :return: A one-row dataframe with CIK, Year, one column per key line item
        and the derived "Other ..." columns. When tMatch yields no multiplier
        for total assets vs. total liabilities & equity, the message string
        from tMatch is returned instead.
    """

    # (regex for the line-item name, output column name) - kept as pairs so a
    # pattern can never drift out of step with its label
    lineItems = [
        ('^cash', 'Cash & Equivalents'),
        ('deposits', 'Deposits'),
        ('receivable', 'Receivables'),
        ('^total assets', 'Total Assets'),
        ('payable', 'Payables'),
        ('total liabilities$', 'Total Liabilities'),
        ('common stock', 'Common Stock'),
        ('preferred stock', 'Preferred Stock'),
        ('earnings', 'Earnings'),
        ('^(?!.*liabilities).*equity$|^(?!.*liabilities).*equity:$', 'Total Equity'),
        ('^total liabilities.*equity$|^total liabilities.*equity:$', 'Total Liabilities & Equity'),
    ]

    # row under construction (dictionary form)
    series = {}

    # the file name carries the CIK and the filing year
    cik, year = file.split('/')[-1].split('-')

    series['CIK'] = cik          # CIK number for firm
    series['Year'] = year[:2]    # first two characters of the text after the dash

    # cast labels to text so the .str accessor works even if a name cell is
    # numeric; na=False keeps the mask strictly boolean for any missing label
    labels = df[df.columns[0]].astype(str)
    values = df[df.columns[1]]

    for pattern, name in lineItems:
        # rows whose name matches the pattern (case-insensitive)
        hits = labels.str.contains(pattern, regex=True, flags=re.IGNORECASE, na=False)

        # sum the value column over the matching rows (0 when nothing matches)
        series[name] = values[hits].sum()

    # match the T-table, asset/liabilities/equity for rows
    mul, msg = tMatch(series['Total Assets'], series['Total Liabilities & Equity'])

    # no usable multiplier: hand the reason back to the caller instead of a dataframe
    if mul is None:
        return msg

    series['Total Liabilities & Equity'] = series['Total Liabilities & Equity'] * mul

    # computing the remainder not explained by the extracted line items
    series['Other Assets'] = series['Total Assets'] - series['Cash & Equivalents'] - series['Deposits'] - series['Receivables']
    series['Other Liabilities'] = series['Total Liabilities'] - series['Payables']
    series['Other Equity'] = series['Total Equity'] - series['Common Stock'] - series['Preferred Stock'] - series['Earnings']

    # convert the dictionary to a single-row dataframe
    comboDF = pd.DataFrame.from_dict(series, orient='index')

    return comboDF.transpose()

In [21]:
# run the extraction on the worked example
extraction(cleanDf, selections[index])

Unnamed: 0,CIK,Year,Cash & Equivalents,Deposits,Receivables,Total Assets,Payables,Total Liabilities,Common Stock,Preferred Stock,Earnings,Total Equity,Total Liabilities & Equity,Other Assets,Other Liabilities,Other Equity
0,1049854,3,39499,0,0,280998000.0,128392,243699000.0,0,0,0,37298800.0,280998000.0,280958000.0,243570000.0,37298800.0


## Database Construction

In [151]:
itters = []   # one-row dataframes returned by extraction
error = {}    # s3 key -> message for files extraction could not reconcile

for file in paths[1:]:
    # retrieving downloaded files from s3 bucket (scratch file holds .csv content)
    s3.download_file(bucket, file, 'temp.pdf')
    try:
        df = pd.read_csv('temp.pdf')
    finally:
        # remove the local file even when parsing fails, so a crash never
        # leaves a stale copy behind
        os.remove('temp.pdf')

    # clean dataframes for only the items and their immediate values
    cleanDF = df[df.columns[1:3]].dropna()

    # NOTE(review): files with a single remaining column are skipped without
    # being recorded in `error` - confirm this is intended
    if len(cleanDF.columns) > 1:
        # convert each string item to a numeric quantity
        cleanDF[cleanDF.columns[1]] = cleanDF[cleanDF.columns[1]].apply(cleanNumeric)

        # export data extraction from dataframe
        tempDF = extraction(cleanDF, file)

        # anything that is None or a message string counts as an error
        if tempDF is None or isinstance(tempDF, str):
            error[file] = tempDF
        else:
            itters.append(tempDF)

In [150]:
# peek at the frame currently held in cleanDF
cleanDF

Unnamed: 0,0
0,2003 $
1,ASSETS
2,"Cash 360,065"
3,"Due from carrying broker [note 3] 335,986"
4,"Other assets 51,614"
5,747665
6,LIABILITIES AND STOCKHOLDER'S EQUITY
7,"Accounts payable and accrued liabilities 11,408"
8,"Due to parent [note 3] 154,162"
9,"Total liabilities 165,570"


In [None]:
# # export database built to .csv file
# pd.concat(itters).to_csv('sample.csv', index=False)

In [152]:
# stack the per-file rows into one table
pd.concat(itters)

Unnamed: 0,CIK,Year,Cash & Equivalents,Deposits,Receivables,Total Assets,Payables,Total Liabilities,Common Stock,Preferred Stock,Earnings,Total Equity,Total Liabilities & Equity,Other Assets,Other Liabilities,Other Equity
0,1000147,11,45335,0,0,842675,1337,0,0,0,0,838419,842675,797340,-1337,838419
0,1000148,09,0,0,432645,432645,0,0,0,0,424645,432645,432645,0,0,8000
0,1000148,10,0,0,155593,155593,0,0,0,0,147593,155593,155593,0,0,8000
0,1000148,11,0,0,25627,25627,0,0,0,0,17627,25627,25627,0,0,8000
0,1000148,12,0,0,25640,25640,0,0,0,0,17640,25640,25640,0,0,8000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,1079505,15,1.24053e+07,0,349742,9.41832e+06,225000,380247,0,0,6.58825e+06,9.03807e+06,9.41832e+06,-3.33671e+06,155247,2.44982e+06
0,1079505,16,1.0374e+07,0,630822,8.63538e+06,294411,339527,0,0,5.86842e+06,8.29585e+06,8.63538e+06,-2.36943e+06,45116,2.42744e+06
0,1079505,17,6.28404e+06,0,1.08568e+06,7.09138e+06,242644,353725,0,0,4.31022e+06,6.73766e+06,7.09138e+06,-278337,111081,2.42744e+06
0,1079505,18,5.82328e+06,0,1.08418e+06,5.46991e+06,165623,206766,0,0,2.83571e+06,5.26314e+06,5.46991e+06,-1.43756e+06,41143,2.42744e+06


In [153]:
# files for which extraction returned a message instead of a dataframe
error

{'Output/BalanceSheet/1000147-02.csv': 'Accounting Error',
 'Output/BalanceSheet/1000147-03.csv': 'Accounting Error',
 'Output/BalanceSheet/1000147-04.csv': 'Accounting Error',
 'Output/BalanceSheet/1000147-05.csv': 'Accounting Error',
 'Output/BalanceSheet/1000147-06.csv': 'Accounting Error',
 'Output/BalanceSheet/1000147-07.csv': 'Accounting Error',
 'Output/BalanceSheet/1000147-08.csv': 'Accounting Error',
 'Output/BalanceSheet/1000147-09.csv': 'Accounting Error',
 'Output/BalanceSheet/1000147-10.csv': 'Accounting Error',
 'Output/BalanceSheet/1000148-02.csv': 'Accounting Error',
 'Output/BalanceSheet/1000148-03.csv': 'Accounting Error',
 'Output/BalanceSheet/1000148-04.csv': 'Accounting Error',
 'Output/BalanceSheet/1000148-05.csv': 'Accounting Error',
 'Output/BalanceSheet/1000148-06.csv': 'Accounting Error',
 'Output/BalanceSheet/1000148-07.csv': 'Accounting Error',
 'Output/BalanceSheet/1000148-08.csv': 'Accounting Error',
 'Output/BalanceSheet/1000151-11.csv': 'Asset table is m

In [156]:
# share of all listed files that ended up in `error`
print('Percent error from sample database construction is {}%'.format(100*(len(error) / len(paths[1:]))))

Percent error from sample database construction is 47.68062397372742%


## Constructing an Unstructured Database
### Retrieving all Asset and Liabilities & Equity line items 

In [115]:
def balanceSheetLines(fileNames:np.ndarray) -> tuple:
    """
    Retrieving balance sheet line item names for s3 files
    :param: (type np.ndarray)
        An array of file names for .csv files from s3

    :return: (type tuple)
        Return a tuple of flat arrays; left holds the names that appear before
        the first name containing "liabilities" in each file (asset side),
        right holds that name and everything after it (liability & equity
        side). Files with a single column, or with no such name, contribute
        nothing. Both arrays are empty when no file contributes.
    """

    Asset = []
    LiabilityandEquity = []

    # iterate through files
    for file in fileNames:
        # download temporary file from s3 bucket (scratch file holds .csv content)
        s3.download_file(bucket, file, 'temp.pdf')
        try:
            df = pd.read_csv('temp.pdf')
        finally:
            # remove the local file even when parsing fails
            os.remove('temp.pdf')

        # if there are more than 1 column we continue
        if df.columns.size > 1:
            arr = df[df.columns[1]].dropna().values     # all line item names for balance sheet info

            for i, item in enumerate(arr):
                # a non-text cell cannot be a "liabilities" heading (and would
                # make re.search raise), so it is simply stepped over
                if isinstance(item, str) and re.search('liabilities', item, flags=re.I):
                    # split the asset and liability portions of key line items
                    Asset.append(arr[:i])
                    LiabilityandEquity.append(arr[i:])
                    break

    # np.hstack takes the list of differently sized arrays directly; wrapping
    # the ragged list in np.array() first is rejected by recent numpy versions.
    # An empty list is handled separately because hstack needs >= 1 array.
    left = np.hstack(Asset) if Asset else np.array([], dtype=object)
    right = np.hstack(LiabilityandEquity) if LiabilityandEquity else np.array([], dtype=object)

    return (left, right)

In [None]:
# gather asset-side and liability & equity-side line item names from every file
leftSide, rightSide = balanceSheetLines(paths[1:])

In [None]:
# storing the unique asset line items and the unique liability & equity line
# items, each as a sorted, comma-separated list of quoted names
with open('assetLines.txt', 'w') as f:
    f.write(', '.join(repr(name) for name in sorted({str(item) for item in leftSide})))

# the liability & equity file is built from rightSide (not leftSide)
with open('liabilityLines.txt', 'w') as f:
    f.write(', '.join(repr(name) for name in sorted({str(item) for item in rightSide})))

print('All line items have been stored')