In [53]:
import os
import re
import boto3
import json

import pandas as pd
import numpy as np

from sagemaker.session import Session

In [54]:
# S3 bucket name and the key prefix (folder) that holds the balance sheet data
bucket = 'ran-s3-systemic-risk'
data_folder = 'Output/X-17A-5-BS/'

# boto3 S3 client (used to download files) and SageMaker session (used to list files)
s3 = boto3.client("s3")
session = Session()

In [55]:
def bsSplit(array: np.ndarray) -> tuple:
    """
    Split balance sheet line items into asset and liability & equity terms.

    The split point is the first line item that contains the word 'liability' or
    'liabilities' (case-insensitive). That item and everything after it form the
    right-hand side; everything before it forms the left-hand side. Assumes that
    line items are recorded according to standard accounting practices in
    orientation (assets first, liabilities & equity afterwards).

    :param array: (type np.ndarray)
        An array of balance sheet line item names

    :return: (type tuple)
        Return a tuple of arrays; left is the asset items, and right is the
        liability & equity items. When no item mentions liabilities the whole
        input is returned on the left with an empty slice on the right, so the
        result can always be unpacked into two values.
    """

    # iterate through the line items as provided by the array
    for i, item in enumerate(array):
        # str() guards against non-string entries (e.g. numeric cells), which
        # re.search would otherwise reject with a TypeError
        if re.search('liabilities|liability', str(item), flags=re.I) is not None:
            # partition by the enumerated index for asset and liability portions
            return (array[:i], array[i:])

    # no liability term found: return an explicit empty right-hand side instead of
    # falling off the end of the function and implicitly returning None
    return (array, array[:0])

In [59]:
def lineItems(fileNames:np.ndarray) -> tuple:
    """
    Retrieve the unique balance sheet line item names from .csv files stored in s3.

    Each file is downloaded from the module-level `bucket` through the module-level
    `s3` client, read with pandas, and the non-null values of its first column are
    split into asset and liability & equity terms with `bsSplit`. Files that have
    only one column, or whose line items contain no liability term to split on,
    are reported with a printed message and skipped.

    :param fileNames: (type np.ndarray)
        An array of file names for .csv files from s3 to iterate through

    :return: (type tuple)
        Return a tuple of lists; left is the sorted unique asset items, and right
        is the sorted unique liability & equity items. Both lists are empty when
        no file contributes any line item.
    """

    # local scratch file for each download; the name is kept from the original
    # notebook even though the content is read as .csv
    local_path = 'temp.pdf'

    # accumulate only real line items (no placeholders for skipped files)
    assets = []
    liabilities = []

    # iterate through files from s3 bucket
    for file in fileNames:

        try:
            # download temporary file from s3 bucket
            s3.download_file(bucket, file, local_path)
            df = pd.read_csv(local_path)
        finally:
            # always clean up the local copy, even when the download or parse fails
            if os.path.exists(local_path):
                os.remove(local_path)

        # a balance sheet needs at least a name column and a value column
        if df.columns.size <= 1:
            print('{} incomplete dataframe'.format(file))
            continue

        # all line item for balance sheet (first column)
        arr = df[df.columns[0]].dropna().values

        # retrieve the asset and liability & equity terms from the dataframe;
        # a None result or an empty right-hand side means no split point was found
        split = bsSplit(arr)
        if split is None or len(split[1]) == 0:
            print('{} no liability line item found'.format(file))
            continue

        lhs, rhs = split
        assets.extend(lhs)
        liabilities.extend(rhs)

    # de-duplicate and sort, matching the ordering np.unique produced before
    return (sorted(set(assets)), sorted(set(liabilities)))

In [60]:
# list every object key stored under the data folder of the bucket
paths = np.asarray(session.list_s3_files(bucket, data_folder))

# gather the asset and liability & equity line items across the listed files;
# the first listed key is skipped (presumably the folder entry itself - TODO confirm)
lefthand, righthand = lineItems(fileNames=paths[1:])

                                                    0              1
0                                              ASSETS            NaN
1                           Cash and cash equivalents      $ 222,336
2   Cash and securities deposited with clearing or...            NaN
3              in compliance with federal regulations      9,071,138
4     Securities purchased under agreements to resell     34,764,794
5                                 Securities borrowed     51,094,781
6                   Securities received as collateral      3,037,956
7                                        Receivables:            NaN
8                                           Customers     12,373,732
9                         Brokers, dealers and others      2,751,879
10                             Interest and dividends        137,260
11  Financial instruments owned, at fair value ($2...     26,884,157
12                                       Other assets        317,588
13                                

In [48]:
# display the unique asset-side line items returned by lineItems
lefthand

['ASSETS',
 'Brokers, dealers and others',
 'Cash and cash equivalents',
 'Cash and securities deposited with clearing organizations or segregated',
 'Customers',
 'Financial instruments owned, at fair value ($22,226,481 pledged as collateral)',
 'Financial instruments owned, at fair value:',
 'Interest and dividends',
 'Not pledged as collateral',
 'Other assets',
 'Pledged as collateral',
 'Receivables:',
 'Securities borrowed',
 'Securities purchased under agreements to resell',
 'Securities received as collateral',
 'TOTAL ASSETS',
 'in compliance with federal regulations']

In [49]:
# display the unique liability & equity-side line items returned by lineItems
righthand

['Accrued employee compensation and benefits',
 'Adjustable Rate Cumulative Preferred Stock, Series A, held in treasury, at cost (151 shares)',
 'Brokers, dealers and others',
 'Commitments and contingencies (Note 12)',
 'Common stock, $1.00 par value; 1,000 shares authorized and outstanding',
 'Common stock, $1.00 par value; 1,000 shares authorized and outstanding Paid-in capital',
 'Customers',
 'Financial instruments sold, but not yet purchased, at fair value',
 'Interest and dividends',
 "LIABILITIES AND STOCKHOLDER'S EQUITY",
 'Obligation to return securities received as collateral',
 'Other liabilities and accrued expenses',
 'Paid-in capital',
 'Payables:',
 'Preferred stock, $1.00 par value (Adjustable Rate Cumulative Preferred Stock, Series A, $500,000 liquidation preference); 1,000 shares authorized; 300 shares issued',
 'Retained earnings',
 'Securities loaned',
 'Securities sold under agreements to repurchase',
 'Short-term borrowings',
 "Stockholder's Equity",
 'Subordinat

In [51]:
# persist the unique asset line items as a JSON-encoded list
with open('assetLines.txt', 'w') as f:
    json.dump(lefthand, f)

# persist the unique liability & equity line items as a JSON-encoded list
with open('liabilityLines.txt', 'w') as f:
    json.dump(righthand, f)