In [1]:
import os
import re
import boto3
import itertools

import pandas as pd
import numpy as np
from sagemaker.session import Session

In [254]:
# Apply the darker 'chesterish' jupyterthemes style to this notebook
# (better for my eyes); this only affects how the notebook is rendered.
# NOTE(review): this import sits outside the main import cell at the top.
from jupyterthemes.stylefx import set_nb_theme
set_nb_theme('chesterish')

In [261]:
def cleanNumeric(value:str) -> float:
    """
    Convert a formatted accounting string to a numeric quantity.

    Currency symbols, thousands separators, whitespace and any other
    non-numeric characters are stripped before the conversion.

    :param: value, string value with hidden numeric quantity (e.g. "$ 19,225");
            int/float inputs are returned as float, None or blank text gives 0.0
    :return: floating point value, or 0.0 when nothing parseable remains

    Complexity -> O(n) in the length of the string

    e.g.
        In[0]: $ 19,225     ->   Out[0]: 19225.0
        In[0]: $ 19,225.76  ->   Out[0]: 19225.76
        In[0]: (1,234)      ->   Out[0]: -1234.0
    """
    # already-numeric cells pass straight through, which also makes the
    # conversion safe to apply twice to the same column
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        return float(value)
    if value is None:
        return 0.0

    text = str(value).strip()
    if not text:
        return 0.0

    # some accounting formats take () to be negative numbers
    negative = text.startswith('(')

    # keep only digits, the decimal point and the minus sign; '|' is a literal
    # inside a character class, so it must not be listed here
    cleanValue = re.sub(r"[^0-9.\-]", "", text)
    if negative and not cleanValue.startswith('-'):
        cleanValue = '-' + cleanValue

    try:
        return float(cleanValue)
    except ValueError:
        # nothing numeric left (or a malformed number such as "1-2")
        return 0.0

In [36]:
# spot-check: leading minus sign, currency symbol and thousands separator
cleanNumeric('-$19,223')

-19223.0

In [9]:
def regexCheck(string:str, searchTerm:str):
    """
    Case-insensitive regex test of a string against a pattern
    string (type str)
        The text to search in e.g. "Cash and cash equivalents"
    searchTerm (type str)
        The regex pattern to look for e.g. "^Cash"

    return: the complete input string when the pattern is found somewhere
        in it, otherwise None
    """
    found = re.search(searchTerm, string, flags=re.I)
    if found is None:
        return None
    # Match.string is the full text that was searched, not just the matched part
    return found.string

In [114]:
# sample line item used to exercise regexCheck in the next cell; the
# commented-out alternative mentions liabilities as well as equity
# sample = "Total Liabilities stockholder's equity"
sample = "MEMBER'S EQUITY"

In [115]:
# negative lookahead: accept text ending in "equity" that does not mention "liabilities"
regexCheck(sample, '^(?!.*liabilities).*equity$')

"MEMBER'S EQUITY"

In [2]:
# initiate s3 bucket and corresponding data folder
# (the commented-out pair below points at a different input folder in the same bucket)
# bucket = "ran-s3-systemic-risk"
# data_folder ="Input/X-17A-5/"

bucket = "ran-s3-systemic-risk"
data_folder ="Output/BalanceSheet/"

# Amazon Textract client, S3 client and Sagemaker session
# NOTE(review): `textract` is not used in any of the visible cells
textract = boto3.client('textract')
s3 = boto3.client('s3')
session = Session()

# list every object key under the data folder; these are read with
# pd.read_csv further down, so they are CSV files rather than pdfs
paths = np.array(session.list_s3_files(bucket, data_folder))

In [3]:
# draw a reproducible sample of up to 100 distinct files: a fixed seed makes
# the notebook repeatable, and sampling without replacement avoids processing
# the same file twice
# paths[1:] skips the first listed key (presumably the folder entry itself; TODO confirm)
candidates = paths[1:]
selections = np.random.RandomState(42).choice(
    candidates, size=min(100, len(candidates)), replace=False
)

In [13]:
# position within `selections` of the single file inspected in the following cells
index = 40

In [167]:
# retrieving downloaded files from s3 bucket
# NOTE: the local scratch copy is named 'temp.pdf' but is parsed as CSV below
s3.download_file(bucket, selections[index], 'temp.pdf')

df = pd.read_csv('temp.pdf')

In [168]:
# display the raw frame exactly as parsed from the downloaded file
df

Unnamed: 0.1,Unnamed: 0,0,1
0,0,Cash and cash equivalents,"$ 91,706"
1,1,Deposits with clearing organizations and others,77797
2,2,Receivables from clearing organizations,140724
3,3,Receivables from broker or dealers,20281
4,4,Other receivables,3000
5,5,Receivable from related parties,120608
6,6,Other assets,17564
7,7,"Furniture, equipment and leasehold improvement...",32313
8,8,Total assets,"$ 503,993"
9,9,LIABILITIES AND MEMBER'S EQUITY,


In [169]:
# restrict the frame to its second and third columns (line item and its
# immediate value), then discard every row where either of the two is missing
cleanDf = df.loc[:, df.columns[1:3]].dropna(how='any')

In [170]:
# parse the value column from the raw frame rather than from cleanDf itself,
# so this cell is idempotent: re-running it always starts from the original
# strings instead of from already-converted floats
cleanDf['1'] = df.loc[cleanDf.index, '1'].apply(cleanNumeric)

In [171]:
# display the cleaned frame: rows with missing entries dropped, values converted to floats
cleanDf

Unnamed: 0,0,1
0,Cash and cash equivalents,91706.0
1,Deposits with clearing organizations and others,77797.0
2,Receivables from clearing organizations,140724.0
3,Receivables from broker or dealers,20281.0
4,Other receivables,3000.0
5,Receivable from related parties,120608.0
6,Other assets,17564.0
7,"Furniture, equipment and leasehold improvement...",32313.0
8,Total assets,503993.0
11,Payables to broker-dealers,37171.0


In [22]:
# show which S3 key the single-file walkthrough is using
print(f'Data for {selections[index]}\n')

Data for Output/BalanceSheet/1070671-03.csv



In [200]:
def tMatch(leftSide:float, rightSide:float):
    """
    Helps match the accounting equation assets = liabilities + equity.

    Computes leftSide / rightSide and reports whether the two totals agree
    exactly or differ by a multiple of 10 (presumably because one side was
    captured in different reporting units; TODO confirm).

    :param: leftSide, total for one side of the equation (e.g. total assets)
    :param: rightSide, total for the other side (e.g. total liabilities & equity)

    :return: 1 when the totals are equal, the ratio as a float when it is a
        positive multiple of 10, otherwise None
    """
    # a zero total on either side cannot be reconciled: dividing by zero is
    # undefined and a ratio of 0 is not a usable multiplier
    if leftSide == 0 or rightSide == 0:
        return None

    multiplier = float(leftSide) / float(rightSide)

    # rejects negative ratios as well as nan/inf (any comparison with nan is False)
    if not 0 < multiplier < float('inf'):
        return None

    # compare against the nearest whole number with a small relative tolerance,
    # because float division does not always land exactly on 10, 100, 1000, ...
    nearest = round(multiplier)
    if abs(multiplier - nearest) > 1e-9 * max(1.0, multiplier):
        return None

    if nearest == 1:
        return 1
    if nearest >= 10 and nearest % 10 == 0:
        return float(nearest)
    return None

In [258]:
def extraction(df:pd.DataFrame, file:str, verbose:bool=True) -> pd.DataFrame:
    """
    Extract specific line items from cleaned dataframes (balance sheets)
    :param: df (type pd.DataFrame)
        The cleaned balance sheet; its first column holds the line-item
        labels (strings) and its second column the numeric values
    :param: file (type str)
        The file string stored in the s3 bucket, whose last path component
        has the form "<CIK>-<YY>..." (e.g. "Output/BalanceSheet/1070671-03.csv")
    :param: verbose (type bool)
        When True (the default) print the intermediate dictionary of sums

    :return: A single-row dataframe with the critical lines extracted plus
        derived "Other ..." columns, or None when tMatch cannot reconcile
        total assets with total liabilities & equity
    """

    # output column name paired with the regex used to find its line items
    lineItems = [
        ('Cash & Equivalents', '^cash'),
        ('Deposits', 'deposits'),
        ('Receivables', 'receivable'),
        ('Total Assets', '^total assets'),
        ('Payables', 'payable'),
        ('Total Liabilities', 'total liabilities$'),
        ('Common Stock', 'common stock'),
        ('Preferred Stock', 'preferred stock'),
        ('Earnings', 'earnings'),
        ('Total Equity', '^(?!.*liabilities).*equity$|^(?!.*liabilities).*equity:$'),
        ('Total Liabilities & Equity', '^total liabilities.*equity$|^total liabilities.*equity:$'),
    ]

    # the extracted row, built up as a dictionary
    series = {}

    # creating two entries to track the CIK and year information released
    cik, year = file.split('/')[-1].split('-')

    series['CIK'] = cik          # CIK number for firm
    series['Year'] = year[:2]    # Year for firm filing (drops the file extension)

    labels = df[df.columns[0]]
    values = df[df.columns[1]]

    for name, pattern in lineItems:
        # case-insensitive regex filter on the labels; na=False keeps the mask
        # strictly boolean so a missing label simply never matches
        filterSet = labels.str.contains(pattern, regex=True, flags=re.IGNORECASE, na=False)

        # sum every value whose label matched (0 when nothing matched)
        series[name] = values[filterSet].sum()

    if verbose:
        print(series, '\n')

    # match the T-table, asset/liabilities/equity for rows
    mul = tMatch(series['Total Assets'], series['Total Liabilities & Equity'])

    # the two sides could not be reconciled: signal that to the caller
    if mul is None:
        return None

    series['Total Liabilities & Equity'] = series['Total Liabilities & Equity'] * mul

    # computing the series for other available line items as residuals
    series['Other Assets'] = series['Total Assets'] - series['Cash & Equivalents'] - series['Deposits'] - series['Receivables']
    series['Other Liabilities'] = series['Total Liabilities'] - series['Payables']
    series['Other Equity'] = series['Total Equity'] - series['Common Stock'] - series['Preferred Stock'] - series['Earnings']

    # convert the filtered dictionary to a one-row dataframe
    # NOTE(review): building from one mixed str/float column likely leaves
    # every resulting column with object dtype; confirm before aggregating
    comboDF = pd.DataFrame.from_dict(series, orient='index')

    return comboDF.transpose()
    

In [235]:
# run the extraction on the single cleaned balance sheet selected above
extraction(cleanDf, selections[index])

{'CIK': '1070671', 'Year': '03', 'Cash & Equivalents': 91706.0, 'Deposits': 77797.0, 'Receivables': 284613.0, 'Total Assets': 503993.0, 'Payables': 147054.0, 'Total Liabilities': 147054.0, 'Common Stock': 0.0, 'Preferred Stock': 0.0, 'Earnings': 0.0, 'Total Equity': 356939.0, 'Total Liabilities & Equity': 503.993}


Unnamed: 0,CIK,Year,Cash & Equivalents,Deposits,Receivables,Total Assets,Payables,Total Liabilities,Common Stock,Preferred Stock,Earnings,Total Equity,Total Liabilities & Equity,Other Assets,Other Liabilities,Other Equity
0,1070671,3,91706,77797,284613,503993,147054,147054,0,0,0,356939,503993,49877,0,356939


In [262]:
itters = []   # one single-row frame per file for which extraction() returned a result
error = 0     # number of files for which extraction() returned None

for file in selections:
    try:
        # retrieving downloaded files from s3 bucket
        # (the scratch copy is named 'temp.pdf' but its content is read as CSV)
        s3.download_file(bucket, file, 'temp.pdf')
        df = pd.read_csv('temp.pdf')
    finally:
        # remove local file after it has been created, even when the download
        # or the parse fails, so no stale copy is left behind
        if os.path.exists('temp.pdf'):
            os.remove('temp.pdf')

    # clean dataframes for only the items and their immediate values
    cleanDF = df[df.columns[1:3]].dropna()

    # convert each string item to a numeric quantity
    cleanDF[cleanDF.columns[1]] = cleanDF[cleanDF.columns[1]].apply(cleanNumeric)
    print(file)

    # export data extraction from dataframe
    tempDF = extraction(cleanDF, file)

    if tempDF is not None:
        itters.append(tempDF)
    else:
        error += 1

Output/BalanceSheet/1049767-18.csv
{'CIK': '1049767', 'Year': '18', 'Cash & Equivalents': 4035049.0, 'Deposits': 0.0, 'Receivables': 3772906.0, 'Total Assets': 7880707.0, 'Payables': 1091652.0, 'Total Liabilities': 1091652.0, 'Common Stock': 0.0, 'Preferred Stock': 0.0, 'Earnings': 0.0, 'Total Equity': 6789055.0, 'Total Liabilities & Equity': 7880707.0} 

Output/BalanceSheet/1000317-20.csv
{'CIK': '1000317', 'Year': '20', 'Cash & Equivalents': 10701.0, 'Deposits': 0.0, 'Receivables': 156.0, 'Total Assets': 10857.0, 'Payables': 3000.0, 'Total Liabilities': 0.0, 'Common Stock': 100.0, 'Preferred Stock': 0.0, 'Earnings': -71302.0, 'Total Equity': 7857.0, 'Total Liabilities & Equity': 10857.0} 

Output/BalanceSheet/1035474-08.csv
{'CIK': '1035474', 'Year': '08', 'Cash & Equivalents': 38774.0, 'Deposits': 0.0, 'Receivables': 0.0, 'Total Assets': 0.0, 'Payables': 6967.0, 'Total Liabilities': 6967.0, 'Common Stock': 2.0, 'Preferred Stock': 0.0, 'Earnings': 7028.0, 'Total Equity': 32647.0, 'To

Output/BalanceSheet/1005256-12.csv
{'CIK': '1005256', 'Year': '12', 'Cash & Equivalents': 69600.0, 'Deposits': 1885.0, 'Receivables': 17124.0, 'Total Assets': 169229.0, 'Payables': 3955.0, 'Total Liabilities': 3955.0, 'Common Stock': 10000.0, 'Preferred Stock': 0.0, 'Earnings': 137399.0, 'Total Equity': 165274.0, 'Total Liabilities & Equity': 169229.0} 

Output/BalanceSheet/1056199-17.csv
{'CIK': '1056199', 'Year': '17', 'Cash & Equivalents': 5367.0, 'Deposits': 0.0, 'Receivables': 0.0, 'Total Assets': 6484.0, 'Payables': 1600.0, 'Total Liabilities': 1600.0, 'Common Stock': 0.0, 'Preferred Stock': 0.0, 'Earnings': 0.0, 'Total Equity': 4884.0, 'Total Liabilities & Equity': 0.0} 

Output/BalanceSheet/1057018-04.csv
{'CIK': '1057018', 'Year': '04', 'Cash & Equivalents': 0.0, 'Deposits': 0.0, 'Receivables': 3400.0, 'Total Assets': 73493.0, 'Payables': 28726.0, 'Total Liabilities': 56439.0, 'Common Stock': 0.0, 'Preferred Stock': 0.0, 'Earnings': 17054.0, 'Total Equity': 0.0, 'Total Liabili

{'CIK': '1056451', 'Year': '11', 'Cash & Equivalents': 1782422.0, 'Deposits': 33081.0, 'Receivables': 157785.0, 'Total Assets': 0.0, 'Payables': 1249480.0, 'Total Liabilities': 0.0, 'Common Stock': 319667.0, 'Preferred Stock': 0.0, 'Earnings': 876527.0, 'Total Equity': 1200763.0, 'Total Liabilities & Equity': 0.0} 

Output/BalanceSheet/1044640-17.csv
{'CIK': '1044640', 'Year': '17', 'Cash & Equivalents': 130422.0, 'Deposits': 0.0, 'Receivables': 253.0, 'Total Assets': 232751.0, 'Payables': 10183.0, 'Total Liabilities': 10183.0, 'Common Stock': 0.0, 'Preferred Stock': 0.0, 'Earnings': 0.0, 'Total Equity': 222568.0, 'Total Liabilities & Equity': 232751.0} 

Output/BalanceSheet/1028553-12.csv
{'CIK': '1028553', 'Year': '12', 'Cash & Equivalents': 413263.0, 'Deposits': 10065.0, 'Receivables': 0.0, 'Total Assets': 1125410.0, 'Payables': 0.0, 'Total Liabilities': 0.0, 'Common Stock': 1000.0, 'Preferred Stock': 0.0, 'Earnings': 1004410.0, 'Total Equity': 0.0, 'Total Liabilities & Equity': 0.0

Output/BalanceSheet/1050645-04.csv
{'CIK': '1050645', 'Year': '04', 'Cash & Equivalents': 25854.0, 'Deposits': 0.0, 'Receivables': 25142.0, 'Total Assets': 0.0, 'Payables': 1030856.0, 'Total Liabilities': 0.0, 'Common Stock': 11.0, 'Preferred Stock': 0.0, 'Earnings': -1316865.0, 'Total Equity': 0.0, 'Total Liabilities & Equity': 0.0} 

Output/BalanceSheet/1025942-11.csv
{'CIK': '1025942', 'Year': '11', 'Cash & Equivalents': 20604974.0, 'Deposits': 1073170.0, 'Receivables': 4040692.0, 'Total Assets': 27603731.0, 'Payables': 1184108.0, 'Total Liabilities': 8534670.0, 'Common Stock': 0.0, 'Preferred Stock': 0.0, 'Earnings': 0.0, 'Total Equity': 0.0, 'Total Liabilities & Equity': 0.0} 

Output/BalanceSheet/1032194-03.csv
{'CIK': '1032194', 'Year': '03', 'Cash & Equivalents': 16425.0, 'Deposits': 0.0, 'Receivables': 1623.0, 'Total Assets': 0.0, 'Payables': 8721.0, 'Total Liabilities': 9722.0, 'Common Stock': 0.0, 'Preferred Stock': 0.0, 'Earnings': 0.0, 'Total Equity': 0.0, 'Total Liabiliti

In [245]:
# # export database built to .csv file
# pd.concat(itters).to_csv('sample.csv', index=False)

In [263]:
# stack the per-file rows; each frame produced by extraction() carries index 0,
# so renumber the rows to get unique labels in the combined table
pd.concat(itters, ignore_index=True)

Unnamed: 0,CIK,Year,Cash & Equivalents,Deposits,Receivables,Total Assets,Payables,Total Liabilities,Common Stock,Preferred Stock,Earnings,Total Equity,Total Liabilities & Equity,Other Assets,Other Liabilities,Other Equity
0,1049767,18,4035050.0,0.0,3772910.0,7880710.0,1091650.0,1091650.0,0.0,0,0.0,6789060.0,7880710.0,72752.0,0.0,6789060.0
0,1000317,20,10701.0,0.0,156.0,10857.0,3000.0,0.0,100.0,0,-71302.0,7857.0,10857.0,0.0,-3000.0,79059.0
0,1070297,8,994238.0,0.0,601536.0,2360570.0,1103950.0,1119820.0,1000.0,0,0.0,1240740.0,2360570.0,764792.0,15872.0,1239740.0
0,1047972,7,11279.0,0.0,7324.0,20204.0,1372.0,4372.0,0.0,0,5832.0,15832.0,20204.0,1601.0,3000.0,10000.0
0,1044991,10,151680.0,0.0,1311220.0,1618180.0,3419.0,539664.0,0.0,0,0.0,1078510.0,1618180.0,155277.0,536245.0,1078510.0
0,1014790,10,566170.0,0.0,0.0,2106000.0,1067680.0,1067680.0,0.0,0,0.0,1038320.0,2106000.0,1539830.0,0.0,1038320.0
0,1030532,12,29684000.0,0.0,27563800.0,60367000.0,3408130.0,11138600.0,100.0,0,41598400.0,49228400.0,60367000.0,3119150.0,7730460.0,7629900.0
0,1030410,4,13423.0,0.0,6315.0,20957.0,10747.0,11234.0,905.0,0,-50104.0,9723.0,20957.0,1219.0,487.0,58922.0
0,105464,15,7672080.0,0.0,9292800.0,17056800.0,3185850.0,3254900.0,0.0,0,0.0,13801900.0,17056800.0,91862.0,69049.0,13801900.0
0,1007983,10,676652.0,0.0,64067.0,1027990.0,3761.0,49037.0,0.0,0,0.0,978954.0,1027990.0,287272.0,45276.0,978954.0


In [251]:
# inspect the cleaned frame left in `cleanDF` by the batch loop (the file it processed last)
cleanDF

Unnamed: 0,0,1
1,Cash and cash equivalents,38774.0
2,Prepaid expenses,840.0
5,Accounts payable - affiliate,6086.0
6,Income taxes payable,881.0
7,Total liabilities,6967.0
9,"Common stock, par value $.01 per share; author...",2.0
10,Additional paid-in capital,25617.0
11,Retained earnings,7028.0
12,Total stockholder's equity,32647.0


In [253]:
# the same two columns of the raw frame, before dropna and numeric conversion, for comparison
df[df.columns[1:3]]

Unnamed: 0,0,1
0,,
1,Cash and cash equivalents,"$ 38,774"
2,Prepaid expenses,840
3,,"$ 39,614"
4,Liabilities and Stockholder's Equity,
5,Accounts payable - affiliate,"$ 6,086"
6,Income taxes payable,881
7,Total liabilities,6967
8,Stockholder's equity:,
9,"Common stock, par value $.01 per share; author...",2


In [265]:
# fraction of sampled files for which extraction() returned None
error / len(selections)

0.51