In [4]:
import os
import re
import boto3
import json

import pandas as pd
import numpy as np

from sagemaker.session import Session

# Extract Asset and Liability Line Items
**We split each balance sheet's line items into asset and liability & equity sections using a regular-expression search**

In [5]:
def bsSplit(array: np.ndarray) -> tuple:
    """
    Function splits an array of balance sheet line items into an asset portion and a liability & equity
    portion. Assumes that line items are recorded according to standard accounting practices in orientation.
    ------------------------------------------------------------------------------------
    :param array: (type numpy.ndarray)
        An array of balance sheet line items for a given broker-dealer
        
    :return: (type tuple or None)
        A 4-tuple (lhs, rhs, stop_idx1, stop_idx2) where
            lhs       = array[:stop_idx1]           (asset line items)
            rhs       = array[stop_idx1:stop_idx2]  (liability & equity line items)
            stop_idx1 = position where the asset portion ends
            stop_idx2 = position where the liability & equity portion ends
        Returns None when either portion is empty (this includes an empty input array)
        
    NOTE: We make the assumption that liability line items always fall below asset line items and each 
          balance sheet has both asset and liability line items. If either are missing we avoid balance sheet
    """
    # nothing to split; returning here also keeps the "last line item" check below from reading
    # match objects that were never assigned because the loop body never ran
    if array.size == 0:
        return None
    
    # case-insensitive patterns, compiled once instead of once per line item
    asset_pattern = re.compile('assets', flags=re.I)
    liability_pattern = re.compile('liability|liabilities', flags=re.I)
    
    stop_idx1 = 0
    stop_idx2 = array.size        # default to length of line items array
    
    asset_idx = 0        # 1-based position of the latest asset / liability match (0 = never seen)
    liable_idx = 0
    
    val1 = val2 = None   # match results for the most recently inspected line item
    
    # iterate through the line items as provided by the array
    for i, item in enumerate(array):
        # labels coming from a numeric column are not strings, and re.search only accepts text
        text = str(item)
        
        # search string for presence of word 'assets' and liabilities
        val1 = asset_pattern.search(text)
        val2 = liability_pattern.search(text)
        
        # if we find the term "asset" we count this index
        if val1 is not None:
            asset_idx = i + 1
        
        # if we find the term "liability" we count this index
        if val2 is not None:
            liable_idx = i + 1
        
        # in the event we find an asset and liability term, we check to see if the asset index appears before
        # the liability index, this is to prevent the JP Morgan 2012/2013 Textract Error
        if (asset_idx != 0) and (liable_idx != 0) and (asset_idx < liable_idx):
            stop_idx1 = asset_idx
            stop_idx2 = liable_idx
    
    # we should always keep track of the asset term (this is our primary splitter)
    if (asset_idx != 0) and (liable_idx == 0):
        stop_idx1 = asset_idx
    
    # check the very last line item: if it carries neither term, the liability match created an early
    # cut-off (e.g. 42352-2003-01-28), so the right-hand side runs to the end of the array
    if (val1 is None) and (val2 is None):
        stop_idx2 = array.size
    
    # partition the array by the enumerated index for asset and liability portions
    lhs = array[:stop_idx1]
    rhs = array[stop_idx1:stop_idx2]
    
    # if either asset or liability side missing, we return None
    if lhs.size == 0 or rhs.size == 0:
        return None
    
    return (lhs, rhs, stop_idx1, stop_idx2)

In [6]:
def lineItems(vector:np.ndarray, df:pd.DataFrame):
    """
    Split a balance sheet dataframe into its asset rows and its liability & equity rows
    ------------------------------------------------------------------------------------
    :param vector: (type numpy.ndarray)
        An array of balance sheet line item names, passed to bsSplit to locate the split points
        
    :param df: (type pandas.DataFrame)
        The balance sheet dataframe that is sliced by row position at the split points
        
    :return: (type tuple or None)
        Return a tuple of dataframes; left is the asset rows, and right is the liability & equity rows.
        Returns None when bsSplit cannot find both portions
    """
    
    # locate the asset and liability & equity split points from the line item names
    response = bsSplit(vector)
    
    # without a usable split there is nothing to slice
    if response is None:
        return None
    
    _, _, index1, index2 = response      # only the split positions are needed here
    
    # bsSplit counts entries of `vector`. When `vector` is the first column of `df` with missing labels
    # removed, those counts fall short of the true row positions by the number of unlabeled rows above
    # them, so translate each count into "one past the row holding that label". The translation is
    # applied only when `vector` lines up with the labeled rows; otherwise the counts are used as given.
    if df.shape[1] > 0:
        labeled_rows = np.flatnonzero(df.iloc[:, 0].notna().to_numpy())
        
        if (labeled_rows.size == len(vector)) and (labeled_rows.size != len(df)):
            # bsSplit guarantees 1 <= index1 < index2 <= len(vector) for a non-None response
            index1 = int(labeled_rows[index1 - 1]) + 1
            index2 = int(labeled_rows[index2 - 1]) + 1
    
    dfA = df.iloc[:index1]                  # asset dataframe
    dfL = df.iloc[index1:index2]            # liability and equity dataframe
    
    return (dfA, dfL)
   

## Final Main Execution

In [9]:
def _splitAndUpload(s3, bucket: str, paths, output_folder: str, asset_split, liability_split,
                    temp_file: str = 'temp.csv') -> None:
    """
    Split every balance sheet .csv in `paths` into asset and liability & equity tables and upload both
    ------------------------------------------------------------------------------------
    :param s3: boto3 S3 client used for download_file / put_object
    :param bucket: name of the s3 bucket that holds both the inputs and the outputs
    :param paths: iterable of s3 keys for the cleaned balance sheet .csv files
    :param output_folder: s3 prefix that receives the 'Assets/' and 'Liability & Equity/' outputs
    :param asset_split: s3 keys already present under output_folder + 'Assets/'
    :param liability_split: s3 keys already present under output_folder + 'Liability & Equity/'
    :param temp_file: local path each input is downloaded to (overwritten for every file)
    
    :return: None; results are written to s3 and progress is printed
    """
    # sets give constant-time membership checks for the "already split" test
    asset_done = {str(key) for key in asset_split}
    liability_done = {str(key) for key in liability_split}
    
    # iterate through files from s3 bucket 
    for file in paths:
        print('\n', file)
        fileName = file.split('/')[-1]                                      # file-name for a given path
        asset_name = output_folder + 'Assets/' + fileName                   # export path to assets
        liability_name = output_folder + 'Liability & Equity/' + fileName   # export path to liability and equity
        
        # skip files whose two split outputs both exist already
        if (asset_name in asset_done) and (liability_name in liability_done):
            print("We've already downloaded {}".format(fileName))
            continue
        
        # download temporary file from s3 bucket
        s3.download_file(bucket, file, temp_file)
        df = pd.read_csv(temp_file)
        
        # a single-column table has labels but no values, so there is nothing to split
        if df.columns.size <= 1:
            print('{} incomplete dataframe'.format(file))
            continue
        
        # all line item for balance sheet (first column)
        arr = df[df.columns[0]].dropna().values
        
        # extract line items if possible for both asset and liability terms
        response = lineItems(arr, df)
        
        if response is None:
            print('Issue with splitting balance-sheet table into asset and liability')
            continue
        
        # unpack the response object to component parts
        df_asset, df_liability = response
        
        # writing data frame to .csv file (we overwrite file name to save space); the local copy is
        # removed even when an upload fails so no stray file is left in the working directory
        try:
            for frame, key in ((df_asset, asset_name), (df_liability, liability_name)):
                frame.to_csv(fileName, index=False)
                with open(fileName, 'rb') as data:
                    s3.put_object(Bucket=bucket, Key=key, Body=data)
        finally:
            if os.path.exists(fileName):
                os.remove(fileName)


if __name__ == "__main__":
    
    # initiate s3 bucket and corresponding data folder
    bucket = "ran-s3-systemic-risk"
    
    pdf_data_folder ="Output/X-17A-5-CLEAN-PDFS/"
    png_data_folder ="Output/X-17A-5-CLEAN-PNGS/"
    
    pdf_output_folder = 'Output/X-17A-5-SPLIT-PDFS/'
    png_output_folder = 'Output/X-17A-5-SPLIT-PNGS/'
    
    temp_file = 'temp.csv'    # local scratch file that each input is downloaded to
        
    # Amazon S3 client and Sagemaker session
    s3 = boto3.client('s3')
    session = Session()
    
    # discover all of the data that you want to parse (both PDFS and PNGS)
    # NOTE(review): [1:] drops the first listed key, presumably the folder placeholder object -- confirm,
    # otherwise the first real file is skipped
    pdf_paths = np.array(session.list_s3_files(bucket, pdf_data_folder))[1:]
    pdf_asset_split = np.array(session.list_s3_files(bucket, pdf_output_folder + 'Assets/'))
    pdf_liability_split = np.array(session.list_s3_files(bucket, pdf_output_folder + 'Liability & Equity/'))
    
    png_paths = np.array(session.list_s3_files(bucket, png_data_folder))[1:]
    png_asset_split = np.array(session.list_s3_files(bucket, png_output_folder + 'Assets/'))
    png_liability_split = np.array(session.list_s3_files(bucket, png_output_folder + 'Liability & Equity/'))
    
    # --------------------------------------------------------------------------------------------------
    # PDF PROCESSING (LINE-ITEM SPLIT)
    # --------------------------------------------------------------------------------------------------
    _splitAndUpload(s3, bucket, pdf_paths, pdf_output_folder, pdf_asset_split, pdf_liability_split,
                    temp_file=temp_file)
    
    # --------------------------------------------------------------------------------------------------
    # PNG PROCESSING (LINE-ITEM SPLIT)
    # --------------------------------------------------------------------------------------------------
    _splitAndUpload(s3, bucket, png_paths, png_output_folder, png_asset_split, png_liability_split,
                    temp_file=temp_file)
    
    # remove local file for storing cleaned data; it only exists if at least one file was downloaded,
    # which is not the case on a re-run where every split is already in s3
    if os.path.exists(temp_file):
        os.remove(temp_file)
    print('\nAll X-17A-5 files have been downloaded and split between Asset and Liability & Equity terms')


 Output/X-17A-5-CLEAN-PDFS/1101180-2002-02-28.csv

 Output/X-17A-5-CLEAN-PDFS/1101180-2003-02-28.csv

 Output/X-17A-5-CLEAN-PDFS/1101180-2004-02-25.csv

 Output/X-17A-5-CLEAN-PDFS/1101180-2005-03-02.csv

 Output/X-17A-5-CLEAN-PDFS/1101180-2006-03-01.csv

 Output/X-17A-5-CLEAN-PDFS/1101180-2008-02-29.csv

 Output/X-17A-5-CLEAN-PDFS/1101180-2009-05-12.csv

 Output/X-17A-5-CLEAN-PDFS/1101180-2009-07-20.csv

 Output/X-17A-5-CLEAN-PDFS/1101180-2010-03-02.csv

 Output/X-17A-5-CLEAN-PDFS/1101180-2011-03-22.csv

 Output/X-17A-5-CLEAN-PDFS/1146184-2004-03-01.csv

 Output/X-17A-5-CLEAN-PDFS/1146184-2006-03-01.csv

 Output/X-17A-5-CLEAN-PDFS/1146184-2007-02-26.csv

 Output/X-17A-5-CLEAN-PDFS/1146184-2008-02-29.csv

 Output/X-17A-5-CLEAN-PDFS/1146184-2009-03-02.csv

 Output/X-17A-5-CLEAN-PDFS/1146184-2010-02-25.csv

 Output/X-17A-5-CLEAN-PDFS/1146184-2011-02-25.csv

 Output/X-17A-5-CLEAN-PDFS/1146184-2012-02-28.csv

 Output/X-17A-5-CLEAN-PDFS/1146184-2013-02-26.csv

 Output/X-17A-5-CLEAN-PDFS/114


 Output/X-17A-5-CLEAN-PDFS/29648-2004-03-01.csv

 Output/X-17A-5-CLEAN-PDFS/29648-2005-03-01.csv

 Output/X-17A-5-CLEAN-PDFS/29648-2006-03-01.csv

 Output/X-17A-5-CLEAN-PDFS/29648-2007-03-01.csv

 Output/X-17A-5-CLEAN-PDFS/29648-2008-02-29.csv

 Output/X-17A-5-CLEAN-PDFS/29648-2008-04-07.csv

 Output/X-17A-5-CLEAN-PDFS/29648-2008-05-13.csv

 Output/X-17A-5-CLEAN-PDFS/29648-2009-03-02.csv

 Output/X-17A-5-CLEAN-PDFS/29648-2010-03-01.csv

 Output/X-17A-5-CLEAN-PDFS/29648-2011-03-01.csv

 Output/X-17A-5-CLEAN-PDFS/29648-2011-03-30.csv

 Output/X-17A-5-CLEAN-PDFS/29648-2012-02-29.csv

 Output/X-17A-5-CLEAN-PDFS/29648-2013-03-01.csv

 Output/X-17A-5-CLEAN-PDFS/29648-2014-02-28.csv

 Output/X-17A-5-CLEAN-PDFS/29648-2015-03-02.csv

 Output/X-17A-5-CLEAN-PDFS/29648-2016-02-29.csv

 Output/X-17A-5-CLEAN-PDFS/29648-2017-03-01.csv

 Output/X-17A-5-CLEAN-PDFS/29648-2018-03-01.csv

 Output/X-17A-5-CLEAN-PDFS/29648-2019-03-01.csv

 Output/X-17A-5-CLEAN-PDFS/29648-2021-02-26.csv

 Output/X-17A-5-CLE


 Output/X-17A-5-CLEAN-PDFS/753835-2006-03-10.csv

 Output/X-17A-5-CLEAN-PDFS/753835-2007-03-01.csv

 Output/X-17A-5-CLEAN-PDFS/753835-2008-03-07.csv

 Output/X-17A-5-CLEAN-PDFS/753835-2009-03-02.csv

 Output/X-17A-5-CLEAN-PDFS/753835-2010-03-01.csv

 Output/X-17A-5-CLEAN-PDFS/753835-2011-03-01.csv

 Output/X-17A-5-CLEAN-PDFS/753835-2012-02-29.csv

 Output/X-17A-5-CLEAN-PDFS/753835-2013-03-01.csv

 Output/X-17A-5-CLEAN-PDFS/753835-2014-03-04.csv

 Output/X-17A-5-CLEAN-PDFS/753835-2015-03-02.csv

 Output/X-17A-5-CLEAN-PDFS/753835-2016-02-29.csv

 Output/X-17A-5-CLEAN-PDFS/753835-2017-03-01.csv

 Output/X-17A-5-CLEAN-PDFS/753835-2018-03-14.csv

 Output/X-17A-5-CLEAN-PDFS/753835-2019-03-01.csv

 Output/X-17A-5-CLEAN-PDFS/753835-2020-02-26.csv

 Output/X-17A-5-CLEAN-PDFS/754542-2002-03-08.csv

 Output/X-17A-5-CLEAN-PDFS/754542-2003-03-03.csv

 Output/X-17A-5-CLEAN-PDFS/754542-2004-03-01.csv

 Output/X-17A-5-CLEAN-PDFS/754542-2005-03-01.csv

 Output/X-17A-5-CLEAN-PDFS/754542-2006-03-01.csv



 Output/X-17A-5-CLEAN-PDFS/874362-2011-03-01.csv

 Output/X-17A-5-CLEAN-PDFS/874362-2012-02-29.csv

 Output/X-17A-5-CLEAN-PDFS/874362-2013-03-01.csv

 Output/X-17A-5-CLEAN-PDFS/874362-2014-03-05.csv

 Output/X-17A-5-CLEAN-PDFS/874362-2015-02-27.csv

 Output/X-17A-5-CLEAN-PDFS/874362-2016-02-29.csv

 Output/X-17A-5-CLEAN-PDFS/87634-2002-03-15.csv

 Output/X-17A-5-CLEAN-PDFS/87634-2003-02-27.csv

 Output/X-17A-5-CLEAN-PDFS/87634-2004-02-27.csv

 Output/X-17A-5-CLEAN-PDFS/87634-2005-02-28.csv

 Output/X-17A-5-CLEAN-PDFS/87634-2006-02-28.csv

 Output/X-17A-5-CLEAN-PDFS/87634-2006-09-21.csv

 Output/X-17A-5-CLEAN-PDFS/87634-2007-03-01.csv

 Output/X-17A-5-CLEAN-PDFS/87634-2008-03-03.csv

 Output/X-17A-5-CLEAN-PDFS/87634-2009-02-27.csv

 Output/X-17A-5-CLEAN-PDFS/87634-2010-03-01.csv

 Output/X-17A-5-CLEAN-PDFS/87634-2011-02-28.csv

 Output/X-17A-5-CLEAN-PDFS/87634-2012-02-27.csv

 Output/X-17A-5-CLEAN-PDFS/87634-2012-03-05.csv

 Output/X-17A-5-CLEAN-PDFS/87634-2013-02-26.csv

 Output/X-17A


 Output/X-17A-5-CLEAN-PNGS/1146184-2017-02-24.csv

 Output/X-17A-5-CLEAN-PNGS/1146184-2018-02-27.csv

 Output/X-17A-5-CLEAN-PNGS/1146184-2019-02-28.csv

 Output/X-17A-5-CLEAN-PNGS/1146184-2020-02-28.csv

 Output/X-17A-5-CLEAN-PNGS/1146184-2021-02-25.csv

 Output/X-17A-5-CLEAN-PNGS/1215680-2004-03-01.csv

 Output/X-17A-5-CLEAN-PNGS/1215680-2005-03-01.csv

 Output/X-17A-5-CLEAN-PNGS/1215680-2006-03-01.csv

 Output/X-17A-5-CLEAN-PNGS/1215680-2007-03-01.csv

 Output/X-17A-5-CLEAN-PNGS/1215680-2008-02-29.csv

 Output/X-17A-5-CLEAN-PNGS/1215680-2009-03-04.csv

 Output/X-17A-5-CLEAN-PNGS/1215680-2010-02-26.csv

 Output/X-17A-5-CLEAN-PNGS/1215680-2011-03-01.csv

 Output/X-17A-5-CLEAN-PNGS/1215680-2012-02-29.csv

 Output/X-17A-5-CLEAN-PNGS/1215680-2013-03-01.csv

 Output/X-17A-5-CLEAN-PNGS/1215680-2014-02-28.csv

 Output/X-17A-5-CLEAN-PNGS/1215680-2015-02-27.csv

 Output/X-17A-5-CLEAN-PNGS/1215680-2016-06-16.csv

 Output/X-17A-5-CLEAN-PNGS/1215680-2017-03-07.csv

 Output/X-17A-5-CLEAN-PNGS/121


 Output/X-17A-5-CLEAN-PNGS/318336-2004-02-27.csv

 Output/X-17A-5-CLEAN-PNGS/318336-2004-04-07.csv

 Output/X-17A-5-CLEAN-PNGS/318336-2005-03-01.csv

 Output/X-17A-5-CLEAN-PNGS/318336-2006-03-01.csv

 Output/X-17A-5-CLEAN-PNGS/318336-2007-03-01.csv

 Output/X-17A-5-CLEAN-PNGS/318336-2008-03-26.csv

 Output/X-17A-5-CLEAN-PNGS/318336-2009-03-02.csv

 Output/X-17A-5-CLEAN-PNGS/318336-2010-03-03.csv

 Output/X-17A-5-CLEAN-PNGS/318336-2011-03-01.csv

 Output/X-17A-5-CLEAN-PNGS/318336-2012-02-29.csv

 Output/X-17A-5-CLEAN-PNGS/318336-2013-03-01.csv

 Output/X-17A-5-CLEAN-PNGS/318336-2014-03-05.csv

 Output/X-17A-5-CLEAN-PNGS/318336-2015-03-02.csv

 Output/X-17A-5-CLEAN-PNGS/318336-2016-02-29.csv

 Output/X-17A-5-CLEAN-PNGS/318336-2017-03-06.csv

 Output/X-17A-5-CLEAN-PNGS/318336-2018-03-01.csv

 Output/X-17A-5-CLEAN-PNGS/318336-2019-03-08.csv

 Output/X-17A-5-CLEAN-PNGS/318336-2020-02-28.csv

 Output/X-17A-5-CLEAN-PNGS/318336-2021-03-01.csv

 Output/X-17A-5-CLEAN-PNGS/356628-2002-02-25.csv



 Output/X-17A-5-CLEAN-PNGS/754542-2008-02-29.csv

 Output/X-17A-5-CLEAN-PNGS/782124-2002-01-29.csv

 Output/X-17A-5-CLEAN-PNGS/782124-2003-01-30.csv

 Output/X-17A-5-CLEAN-PNGS/782124-2004-01-30.csv

 Output/X-17A-5-CLEAN-PNGS/782124-2004-05-07.csv

 Output/X-17A-5-CLEAN-PNGS/782124-2005-01-31.csv

 Output/X-17A-5-CLEAN-PNGS/782124-2006-01-30.csv

 Output/X-17A-5-CLEAN-PNGS/782124-2007-01-29.csv

 Output/X-17A-5-CLEAN-PNGS/782124-2008-01-31.csv

 Output/X-17A-5-CLEAN-PNGS/782124-2009-03-02.csv

 Output/X-17A-5-CLEAN-PNGS/782124-2010-03-01.csv

 Output/X-17A-5-CLEAN-PNGS/782124-2011-03-02.csv

 Output/X-17A-5-CLEAN-PNGS/782124-2012-02-29.csv

 Output/X-17A-5-CLEAN-PNGS/782124-2013-03-01.csv

 Output/X-17A-5-CLEAN-PNGS/782124-2014-03-05.csv

 Output/X-17A-5-CLEAN-PNGS/782124-2015-02-27.csv

 Output/X-17A-5-CLEAN-PNGS/782124-2016-02-29.csv

 Output/X-17A-5-CLEAN-PNGS/782124-2017-03-01.csv

 Output/X-17A-5-CLEAN-PNGS/782124-2018-02-27.csv

 Output/X-17A-5-CLEAN-PNGS/782124-2018-02-28.csv



 Output/X-17A-5-CLEAN-PNGS/87634-2015-02-25.csv

 Output/X-17A-5-CLEAN-PNGS/87634-2016-02-25.csv

 Output/X-17A-5-CLEAN-PNGS/87634-2017-02-27.csv

 Output/X-17A-5-CLEAN-PNGS/87634-2018-02-27.csv

 Output/X-17A-5-CLEAN-PNGS/87634-2019-02-26.csv

 Output/X-17A-5-CLEAN-PNGS/87634-2020-02-27.csv

 Output/X-17A-5-CLEAN-PNGS/87634-2021-02-26.csv

 Output/X-17A-5-CLEAN-PNGS/877559-2002-03-01.csv

 Output/X-17A-5-CLEAN-PNGS/877559-2003-03-03.csv

 Output/X-17A-5-CLEAN-PNGS/877559-2004-03-01.csv

 Output/X-17A-5-CLEAN-PNGS/877559-2005-03-01.csv

 Output/X-17A-5-CLEAN-PNGS/877559-2006-03-01.csv

 Output/X-17A-5-CLEAN-PNGS/877559-2007-03-01.csv

 Output/X-17A-5-CLEAN-PNGS/877559-2008-02-29.csv

 Output/X-17A-5-CLEAN-PNGS/877559-2009-03-02.csv

 Output/X-17A-5-CLEAN-PNGS/877559-2010-03-01.csv

 Output/X-17A-5-CLEAN-PNGS/877559-2011-03-01.csv

 Output/X-17A-5-CLEAN-PNGS/877559-2012-02-29.csv

 Output/X-17A-5-CLEAN-PNGS/877559-2013-03-01.csv

 Output/X-17A-5-CLEAN-PNGS/877559-2014-03-04.csv

 Outpu

In [8]:
# # running a one-off analysis on a particular dataset
# # s3.download_file('ran-s3-systemic-risk', 'Output/X-17A-5-CLEAN-PNGS/782124-2014-03-05.csv', 'temp.csv')
# # s3.download_file('ran-s3-systemic-risk', 'Output/X-17A-5-CLEAN-PDFS/42352-2003-01-28.csv', 'temp.csv')
# s3.download_file('ran-s3-systemic-risk', 'Output/X-17A-5-CLEAN-PDFS/782124-2013-03-01.csv', 'temp.csv')

# df = pd.read_csv('temp.csv')
# os.remove('temp.csv')

# # all line item for balance sheet (first column)
# arr = df[df.columns[0]].dropna().values     

# # extract line items if possible for both asset and liability terms
# response = lineItems(arr, df)