In [6]:
import os
import re
import boto3
import json

import pandas as pd
import numpy as np

from sagemaker.session import Session

# Extract Asset and Liability Line Items
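**We split each balance sheet's line items into asset and liability & equity groups, using a case-insensitive regex search for the word "assets" to locate the dividing line.**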

In [15]:
def bsSplit(array: np.ndarray) -> tuple:
    """
    Function splits an array of balance sheet line items into an asset portion and a liability & equity
    portion. The split is placed immediately after the LAST line item that contains the word 'assets'
    (case-insensitive), e.g. a trailing "Total assets" row. Assumes that line items are recorded 
    according to standard accounting practices in orientation. 
    ------------------------------------------------------------------------------------
    :param array: (type numpy.ndarray)
        An array of balance sheet line items for a given broker-dealer. Any other sequence (e.g. a list)
        is converted to an object array. Entries that are not strings (numbers, NaN, None) are ignored
        when searching for the split point, but are kept in the returned partitions.
        
    :return: (type tuple or None)
        Return a tuple (lhs, rhs, idx); position 0 is the asset items, position 1 is the liability & 
        equity items and position 2 is the index where the split occurred (i.e. lhs = array[:idx] and 
        rhs = array[idx:]). Returns None if either partition would be empty.
        
    NOTE: We make the assumption that liability line items always fall below asset line items and each 
          balance sheet has both asset and liability line items. If either are missing we avoid balance sheet
    """
    # accept plain sequences as well; arrays passed in are used untouched so their dtype is preserved
    if not isinstance(array, np.ndarray):
        array = np.asarray(array, dtype=object)
    
    idx = 0
    
    # iterate through the line items as provided by the array
    for i, item in enumerate(array):
        # re.search raises TypeError on non-text input, so skip anything that is not a string
        if not isinstance(item, str):
            continue
        
        ## Potential future update - check liability handle versus asset position 
        ## (should never have an asset index after the liability portion - JP Morgan 2012/2013 Error)
        
        # every match moves the split forward (no early exit), so the final value of idx 
        # sits just past the last line item mentioning 'assets'
        if re.search('assets', item, flags=re.I) is not None:
            idx = i + 1    
            
    # partition the array by the enumerated index for asset and liability portions
    lhs = array[:idx]
    rhs = array[idx:]
    
    # if either asset or liability side missing, we return None
    if lhs.size == 0 or rhs.size == 0:
        return None
    
    return (lhs, rhs, idx)

In [8]:
def lineItems(vector:np.ndarray, df:pd.DataFrame):
    """
    Split a balance sheet dataframe into its asset rows and its liability & equity rows
    ------------------------------------------------------------------------------------
    :param vector: (type numpy.ndarray)
        An array of balance sheet line item names, handed to bsSplit to locate the split position
        
    :param df: (type pandas.DataFrame)
        The balance sheet dataframe to be cut. It is sliced positionally (iloc) at the split position
        found in `vector`, so row k of `df` is expected to correspond to element k of `vector`
        
    :return: (type tuple or None)
        Return a tuple of dataframes; left is the asset rows, and right is the liability & equity rows.
        Returns None when bsSplit could not produce a split
    """
    
    # locate the asset / liability & equity boundary from the line item names
    split = bsSplit(vector)
    
    # nothing to do when no boundary was found
    if split is None:
        return None
    
    asset_items, liability_items, cut = split
    
    # echo both groups of line item names under their headers
    for header, items in (('\tAsset Lineitems', asset_items), 
                          ('\tLiability & Equity Lineitems', liability_items)):
        print(header)
        print(items)
    
    # rows before the cut are assets, rows from the cut onward are liability & equity
    return (df.iloc[:cut], df.iloc[cut:])
   

In [5]:
if __name__ == "__main__":
    # Driver: for every cleaned X-17A-5 balance sheet (.csv) in the s3 data folder, split it into an
    # asset file and a liability & equity file and upload both to the s3 output folder. Files whose
    # two split outputs already exist in s3 are skipped.
    
    # initiate s3 bucket and corresponding data folder
    bucket = "ran-s3-systemic-risk"
    data_folder ="Output/X-17A-5-CLEAN/"
    output_folder = 'Output/X-17A-5-SPLIT/'
    temp_folder = 'Temp/'
        
    # Amazon S3 client and Sagemaker session
    s3 = boto3.client('s3')
    session = Session()
    
    # listing of the temp directory (not used further in this cell)
    temp = np.array(session.list_s3_files(bucket, temp_folder))
    
    # discover all of the cleaned files that you want to split
    # NOTE(review): the first listed key is skipped - presumably the folder placeholder; confirm
    paths = np.array(session.list_s3_files(bucket, data_folder))[1:]
    asset_split = np.array(session.list_s3_files(bucket, output_folder + 'Assets/'))
    liability_split = np.array(session.list_s3_files(bucket, output_folder + 'Liability & Equity/'))
    
    # sets give constant-time "already processed?" checks inside the loop
    asset_done = set(asset_split.tolist())
    liability_done = set(liability_split.tolist())
   
    # iterate through files from s3 bucket 
    for file in paths:
        print('\n', file)
        fileName = file.split('/')[-1]                                      # file-name for a given path
        asset_name = output_folder + 'Assets/' + fileName                   # export path to assets
        liability_name = output_folder + 'Liability & Equity/' + fileName   # export path to liability and equity
        
        # check to see presence of split files 
        if (asset_name not in asset_done) or (liability_name not in liability_done):
        
            # download temporary file from s3 bucket
            s3.download_file(bucket, file, 'temp.csv')
            df = pd.read_csv('temp.csv')

            n = df.columns.size   # the number of columns in read dataframe    

            if n > 1: # if there is more than 1 column we continue examination 

                # keep only the rows that carry a line item label (first column). The split index is 
                # computed on the labels and then applied positionally to the dataframe, so both must 
                # have exactly the same rows or the cut lands on the wrong row
                first_col = df.columns[0]
                df_items = df[df[first_col].notna()]

                # all line item for balance sheet (first column)
                arr = df_items[first_col].values     

                # extract line items if possible for both asset and liability terms
                response = lineItems(arr, df_items)

                # if response not None we decompose each
                if response is not None:
                    # unpack the response object to component parts
                    df_asset, df_liability = response

                    # writing data frame to .csv file (we overwrite file name to save space)
                    for frame, key in ((df_asset, asset_name), (df_liability, liability_name)):
                        frame.to_csv(fileName, index=False)
                        with open(fileName, 'rb') as data:
                            s3.put_object(Bucket=bucket, Key=key, Body=data)

                    # remove local file after it has been created
                    os.remove(fileName)

            else:
                print('{} incomplete dataframe'.format(file))
                
        else:
            print("We've already downloaded {}".format(fileName))
        
    # remove local file for storing cleaned data; it only exists if at least one file was 
    # downloaded during this run, so guard against FileNotFoundError
    if os.path.exists('temp.csv'):
        os.remove('temp.csv')
    print('\nAll X-17A-5 files have been downloaded and split between Asset and Liability & Equity terms')


 Output/X-17A-5-CLEAN/1224385-2004-03-01.csv
We've already downloaded 1224385-2004-03-01.csv

 Output/X-17A-5-CLEAN/1224385-2005-03-01.csv
We've already downloaded 1224385-2005-03-01.csv

 Output/X-17A-5-CLEAN/1224385-2006-03-01.csv
We've already downloaded 1224385-2006-03-01.csv

 Output/X-17A-5-CLEAN/1224385-2007-03-01.csv
We've already downloaded 1224385-2007-03-01.csv

 Output/X-17A-5-CLEAN/1224385-2008-02-29.csv
We've already downloaded 1224385-2008-02-29.csv

 Output/X-17A-5-CLEAN/1224385-2009-03-02.csv
We've already downloaded 1224385-2009-03-02.csv

 Output/X-17A-5-CLEAN/1224385-2010-03-12.csv
We've already downloaded 1224385-2010-03-12.csv

 Output/X-17A-5-CLEAN/1224385-2011-03-01.csv
We've already downloaded 1224385-2011-03-01.csv

 Output/X-17A-5-CLEAN/1224385-2012-02-29.csv
We've already downloaded 1224385-2012-02-29.csv

 Output/X-17A-5-CLEAN/1224385-2013-03-01.csv
We've already downloaded 1224385-2013-03-01.csv

 Output/X-17A-5-CLEAN/1224385-2014-03-04.csv
We've already 