In [1]:
# Run on first instance to install required libraries
%pip install smart_open minecart textract-trp

Collecting smart_open
  Downloading smart_open-5.1.0-py3-none-any.whl (57 kB)
[K     |████████████████████████████████| 57 kB 3.2 MB/s eta 0:00:011
[?25hCollecting minecart
  Downloading minecart-0.3.0-py3-none-any.whl (23 kB)
Collecting textract-trp
  Downloading textract_trp-0.1.3-py3-none-any.whl (5.8 kB)
Collecting pdfminer3k
  Downloading pdfminer3k-1.3.4-py3-none-any.whl (100 kB)
[K     |████████████████████████████████| 100 kB 6.0 MB/s eta 0:00:01
Installing collected packages: pdfminer3k, textract-trp, smart-open, minecart
Successfully installed minecart-0.3.0 pdfminer3k-1.3.4 smart-open-5.1.0 textract-trp-0.1.3
Note: you may need to restart the kernel to use updated packages.


In [2]:
import time 
import re
import os
import trp
import boto3
import minecart
import json
import logging 

import numpy as np
import pandas as pd

from smart_open import open
from sagemaker.session import Session

# AWS Asynchronous Textract Script (requesting Job)
**Content adapted from the Amazon Textract code samples repository (see [12-pdf-text.py](https://github.com/aws-samples/amazon-textract-code-samples/blob/master/python/12-pdf-text.py)).**

In [3]:
def startJob(s3BucketName:str, objectName:str) -> str:
    """
    Submits a Textract document-analysis job for an object stored in S3

    The request asks for both FORMS (key-value pairs) and TABLES, and the JobId
    found in the service response is returned to the caller.
    """
    # where Textract should read the document from: bucket name plus object key
    document_location = {'S3Object': {'Bucket': s3BucketName, 'Name': objectName}}

    textract_client = boto3.client('textract')
    submission = textract_client.start_document_analysis(
        DocumentLocation=document_location,
        FeatureTypes=['FORMS', 'TABLES'],
    )

    # the job ID is the handle used later to poll for status and fetch results
    return submission["JobId"]

In [4]:
def isJobComplete(jobId:str) -> str:
    """
    Polls a Textract job until it is no longer IN_PROGRESS

    Every status that is read is printed. The first status other than
    "IN_PROGRESS" is returned as a string. There is no timeout: the function
    keeps polling for as long as the service reports IN_PROGRESS.
    """
    # short initial pause before the first status request
    time.sleep(1)

    textract_client = boto3.client('textract')

    while True:
        job_state = textract_client.get_document_analysis(JobId=jobId)["JobStatus"]
        print("Job status: {}".format(job_state))

        # any status other than IN_PROGRESS ends the polling
        if job_state != "IN_PROGRESS":
            return job_state

        # wait 5 seconds between subsequent status requests
        time.sleep(5)

In [5]:
def getJobResults(jobId:str) -> list:
    """
    Collects every paginated response of a Textract job into a list

    The first request is issued without a token; each following request passes
    the NextToken of the previous response until a response carries none.
    """
    textract_client = boto3.client('textract')

    collected = []
    request_args = {'JobId': jobId}

    while True:
        batch = textract_client.get_document_analysis(**request_args)
        collected.append(batch)
        print("Resultset page recieved: {}".format(len(collected)))

        # a missing (or empty) NextToken means this was the last result set
        token = batch.get('NextToken')
        if not token:
            return collected

        request_args = {'JobId': jobId, 'NextToken': token}

In [6]:
def runJob(bucket:str, key:str) -> list:
    """
    Runs a complete AWS Textract job: submit, wait, and collect the results
    ------------------------------------------------------------------------------------------
    Input
        :param bucket: (type str)
            Name of the s3 bucket holding the document
        :param key: (type str)
            Object key of the document inside the bucket

    Output
        :return: type list
            All paginated responses of the job (see getJobResults). The responses are
            returned whatever the final status is, so callers must inspect
            response[0]['JobStatus'] themselves to detect a FAILED job.
    """
    jobId = startJob(bucket, key)
    print("Started job with id: {}".format(jobId))

    # block until the job is no longer IN_PROGRESS; the returned status is a non-empty
    # string, so branching on its truthiness (as was done before) never skipped anything
    # and only left the return value potentially unbound
    isJobComplete(jobId)

    return getJobResults(jobId)

# AWS Extraction Scripts (Key-Value Pairs)
**Adapted from the AWS documentation example for extracting key-value pairs from form documents; the Textract Block objects are stored in maps keyed by block Id (see [Extracting Key-Value Pairs from a Form Document](https://docs.aws.amazon.com/textract/latest/dg/examples-extract-kvp.html)).**

In [7]:
def find_value_block(key_block, value_map):
    """
    Retrieves the VALUE block linked to a KEY block of a Textract FORM result
    ------------------------------------------------------------------------------------------
    Input
        :param key_block: (type dict)
            KEY block; its 'Relationships' list is searched for an entry of Type 'VALUE'
        :param value_map: (type dict)
            Maps block Id -> VALUE block

    Output
        :return: type dict
            The VALUE block referenced by the key. If several Ids are listed the last one
            wins. An empty dictionary is returned when the key has no VALUE relationship
            (or no relationships at all), so that get_text() yields '' for it rather than
            the lookup failing with an unbound variable.
    """
    # default for keys that carry no VALUE relationship
    value_block = {}

    # a KEY block normally carries a VALUE and a CHILD relationship, but either may be missing
    for relationship in key_block.get('Relationships') or []:

        # only the VALUE relationship points at the value block
        if relationship['Type'] == 'VALUE':

            # normally a single Id; an Id absent from value_map still raises KeyError
            for value_id in relationship['Ids']:
                value_block = value_map[value_id]

    return value_block

In [8]:
def get_kv_relationship(key_map, value_map, block_map):
    """
    Builds a text-to-text dictionary from the KEY and VALUE blocks of a FORM result

    For every KEY block the linked VALUE block is looked up, both are converted to
    text, and the pair is stored (e.g. {'Total Assets':'$ 189,232'}). Keys that
    produce the same text overwrite each other; the last one wins.
    """
    pairs = {}

    # the block ids themselves are not needed here, only the KEY blocks
    for key_block in key_map.values():

        # VALUE block that belongs to this KEY block
        linked_value = find_value_block(key_block, value_map)

        # textual form of the key (line item) and of its value
        label = get_text(key_block, block_map)
        pairs[label] = get_text(linked_value, block_map)

    return pairs

In [9]:
def get_text(result, blocks_map):
    """
    Assembles the text of a block from its CHILD blocks

    WORD children contribute their text followed by a space; SELECTION_ELEMENT
    children contribute 'X ' when their status is SELECTED. A block without a
    'Relationships' entry yields an empty string.
    """
    # nothing to follow when the block has no relationships
    if 'Relationships' not in result:
        return ''

    fragments = []

    # e.g. 'Relationships' : [{'Type' : 'CHILD', 'Ids': ['e2b3b12f-ebb7-4f6e-914f-97b315672530']}]
    child_links = (link for link in result['Relationships'] if link['Type'] == 'CHILD')

    for link in child_links:
        for child_id in link['Ids']:

            # resolve the child id to its block dictionary
            child = blocks_map[child_id]
            kind = child['BlockType']

            if kind == 'WORD':
                # every word is followed by a single space (including the last one)
                fragments.append(child['Text'] + ' ')

            elif kind == 'SELECTION_ELEMENT' and child['SelectionStatus'] == 'SELECTED':
                # a ticked option button / check box is rendered as an X
                fragments.append('X ')

    return ''.join(fragments)

# OCR Wrapper Functions
**These functions run an OCR job through AWS Textract and return well-formatted data.**

In [10]:
def trp2df(table:"trp.Table") -> pd.DataFrame:
    """
    Function designed to convert a trp table into a dataframe Complexity -> O(rows x columns)
    ------------------------------------------------------------------------------------------
    Input
        :param table: (type trp.Table)
            A trp table object parsed from a pdf using AWS Textract; only table.rows,
            row.cells and cell.text are read

    Output
        :return: type pandas.DataFrame
            One dataframe row per table row holding the whitespace-stripped cell texts,
            with default integer column labels. Rows with fewer cells than the widest
            row are padded on the right with '' (so every value stays a string), and a
            table without rows gives an empty dataframe.
    """
    # strip the text from every cell, row by row
    texts = [[cell.text.strip() for cell in row.cells] for row in table.rows]

    # a table without rows cannot be sized from its first row
    if not texts:
        return pd.DataFrame()

    # size the matrix by the widest row, not the first one: rows are not guaranteed to have
    # the same number of cells, and indexing by the first row's width either raised an
    # IndexError (shorter rows) or silently dropped cells (longer rows)
    width = max(len(row_texts) for row_texts in texts)
    padded = [row_texts + [''] * (width - len(row_texts)) for row_texts in texts]

    return pd.DataFrame(padded)

In [35]:
def readTable(response:list) -> tuple:
    """
    Function to transform AWS Textract object to a dataframe, by searching for tables
    ------------------------------------------------------------------------------------------
    Input
        :param response: (type list)
            List of AWS Textract response dictionaries for one document (as returned by runJob)

    Output
        :return: type tuple (or None)
            A 3-element tuple (dataframe, page_series, page_nums):
              - dataframe   : concatenation of every table flagged as balance sheet so far
              - page_series : the trp page objects those tables were found on
              - page_nums   : zero-based positions of those pages within the document
            The tuple is returned as soon as a flagged table contains a liability line item.
            None is returned when no such table exists, even if asset-only tables were flagged.
    """
    catDF = []          # flagged tables, in reading order (asset and liability tables are often separate)
    page_series = []    # page objects where a balance sheet table was flagged
    page_nums = []      # position of those pages inside the document (starting at 0)

    # format the Textract response type
    doc = trp.Document(response)

    for page_count, page in enumerate(doc.pages):
        for table in page.tables:

            # convert trp-table into dataframe object
            df = trp2df(table)

            # remove columns that are completely empty, then renumber the remaining ones 0..n-1
            empty_cols = [col for col in df.columns if (df[col] == '').all()]
            df = df.drop(empty_cols, axis=1)
            n = df.columns.size
            df.columns = np.arange(n)

            # a good balance sheet table has 2-3 columns, anything else is treated as a
            # reading error (or e.g. a one-column list) and ignored
            if n < 2 or n > 3:
                continue

            # first column holds the line items (e.g. Cash, Total Assets, Total Liabilities)
            lineItems = df[df.columns[0]]

            # line item starting with "cash", ignoring case (asset check)
            assetCheck = lineItems.str.contains('^Cash', regex=True, flags=re.IGNORECASE)

            # line item starting or ending with "Liabilities" / "Liability", ignoring case (liability check)
            debtCheck1 = lineItems.str.contains('Liabilities$|^Liabilities', regex=True, flags=re.IGNORECASE)
            debtCheck2 = lineItems.str.contains('Liability$|^Liability', regex=True, flags=re.IGNORECASE)

            # a balance sheet should show at least one $ amount in the second column; this check
            # keeps the table of contents from being picked up (raw string: the pattern contains
            # backslash escapes that are not valid string-literal escapes)
            dollarCheck = df[df.columns[1]].str.contains(r'\$[^\]]+', regex=True, flags=re.IGNORECASE)

            hasLineItem = (assetCheck | debtCheck1 | debtCheck2).any()   # any asset or liability term
            hasDollar = dollarCheck.any()                                # presence of '$' sign
            hasDebt = debtCheck1.any() or debtCheck2.any()               # liability term specifically

            # either an asset or a liability term, together with a $ sign, flags the table
            if hasLineItem and hasDollar:
                catDF.append(df)

                # record the page only once, even when it holds several flagged tables
                if page not in page_series:
                    page_series.append(page)
                    page_nums.append(page_count)

                # the liability table closes the balance sheet: return everything gathered so far
                if hasDebt:
                    return (pd.concat(catDF), page_series, page_nums)

    # no table with a liability line item was flagged
    return None
        

In [31]:
def readPNG(pages:list, png_path:str, bucket='ran-s3-systemic-risk') -> pd.DataFrame:
    """
    Function to run Textract on single-page PNG images and extract the balance sheet tables
    ------------------------------------------------------------------------------------------
    Input
        :param pages: (type list)
            Page indices; each one is formatted into a file name '<subfolder>-p<idx>.png'
        :param png_path: (type str)
            s3 key prefix of the PNG folder, ending with '/'. The last folder name is also
            used as file-name prefix, e.g. 'a/b/1146184-2014-02-28/' gives
            'a/b/1146184-2014-02-28/1146184-2014-02-28-p5.png'
        :param bucket: (type str)
            s3 bucket holding the PNG files

    Output
        :return: type pandas.DataFrame (or None)
            Concatenation of every table flagged as balance sheet, returned as soon as a
            flagged table contains a liability line item. None when no such table is found.
            A failure while processing one PNG is logged as a warning and the next PNG is tried.
    """
    subfolder = png_path.split('/')[-2]      # subfolder where PNG files are stored

    # construct PNG keys for the relevant pages
    textract_paths = [png_path + subfolder + '-p{}.png'.format(idx) for idx in pages]

    catDF = []          # flagged tables across all PNGs (concatenated on return)

    for path in textract_paths:

        try:
            res = runJob(bucket, path)

            # a failed Textract job has nothing to parse, move on to the next image
            if res[0]['JobStatus'] == 'FAILED':
                continue

            # format the Textract response type
            doc = trp.Document(res)

            for page in doc.pages:
                for table in page.tables:

                    # convert trp-table into dataframe object
                    df = trp2df(table)

                    # remove columns that are completely empty, then renumber the remaining ones 0..n-1
                    empty_cols = [col for col in df.columns if (df[col] == '').all()]
                    df = df.drop(empty_cols, axis=1)
                    n = df.columns.size
                    df.columns = np.arange(n)

                    # a good balance sheet table has 2-3 columns, anything else is ignored
                    if n < 2 or n > 3:
                        continue

                    # first column holds the line items (e.g. Cash, Total Assets, Total Liabilities)
                    lineItems = df[df.columns[0]]

                    # line item starting with "cash", ignoring case
                    assetCheck = lineItems.str.contains('^Cash', regex=True, flags=re.IGNORECASE)

                    # line item starting or ending with "Liabilities" / "Liability", ignoring case
                    debtCheck1 = lineItems.str.contains('Liabilities$|^Liabilities',
                                                        regex=True, flags=re.IGNORECASE)
                    debtCheck2 = lineItems.str.contains('Liability$|^Liability',
                                                        regex=True, flags=re.IGNORECASE)

                    # at least one $ amount is expected in the second column (raw string: the
                    # pattern contains backslash escapes that are not valid string-literal escapes)
                    dollarCheck = df[df.columns[1]].str.contains(r'\$[^\]]+', regex=True, flags=re.IGNORECASE)

                    hasLineItem = (assetCheck | debtCheck1 | debtCheck2).any()   # any asset or liability term
                    hasDollar = dollarCheck.any()                                # presence of '$' sign
                    hasDebt = debtCheck1.any() or debtCheck2.any()               # liability term specifically

                    # either an asset or a liability term, together with a $ sign, flags the table
                    if hasLineItem and hasDollar:
                        catDF.append(df)

                        # the liability table closes the balance sheet: return everything gathered so far
                        if hasDebt:
                            return pd.concat(catDF)

        # best effort per image: keep going, but leave a trace of what went wrong
        # (Exception rather than a bare except, so KeyboardInterrupt still stops the run)
        except Exception as exc:
            logging.warning('Textract PNG extraction failed for %s: %r', path, exc)

    # default return None
    return None

In [32]:
def readForm(doc_pages:list) -> dict:
    """
    Function to collect the key-value pairs found on the given Textract pages
    ------------------------------------------------------------------------------------------
    Input
        :param doc_pages: (type list)
            TRP page objects; the raw block dictionaries are read from page.blocks

    Output
        :return: type dict
            Dictionary produced by get_kv_relationship, mapping KEY text (line items) to
            VALUE text (e.g. {'Cash and cash equivalents : $ 12,513})
    """
    # id -> block lookups: every block, the KEY blocks, and the VALUE blocks
    block_map, key_map, value_map = {}, {}, {}

    # walk all blocks of all pages in order
    every_block = (block for page in doc_pages for block in page.blocks)

    for block in every_block:

        # every block is remembered so child ids can be resolved to text later
        identifier = block['Id']
        block_map[identifier] = block

        # only KEY_VALUE_SET blocks take part in the key/value maps
        if block['BlockType'] != "KEY_VALUE_SET":
            continue

        # entity type KEY marks a key block, anything else is treated as a value block
        target = key_map if 'KEY' in block['EntityTypes'] else value_map
        target[identifier] = block

    # convert block objects to text dictionary map
    return get_kv_relationship(key_map, value_map, block_map)

In [33]:
def readText(doc_pages:list) -> dict:
    """
    Function to map every LINE of text on the given Textract pages to its confidence
    ------------------------------------------------------------------------------------------
    Input
        :param doc_pages: (type list)
            TRP page objects; the raw block dictionaries are read from page.blocks

    Output
        :return: type dict
            Dictionary mapping the text of each LINE block to its reported confidence
            (e.g. {'Cash and cash equivalents : 99.97891}). When the same text occurs more
            than once, the confidence of the last occurrence is kept.
    """
    # only LINE blocks carry a full line of text, all other block types are skipped
    return {
        block['Text']: block['Confidence']
        for page in doc_pages
        for block in page.blocks
        if block['BlockType'] == "LINE"
    }

## Extract Balance Sheet information

In [34]:
def textractParse(pdf_path:str, png_path:str, bucket:str) -> tuple:
    """
    Function runs the Textract jobs for one filing and gathers the Balance Sheet information
    ------------------------------------------------------------------------------------------
    Input
        :param pdf_path: (type str)
            s3 key of the PDF to analyse
        :param png_path: (type str)
            s3 key prefix of the folder holding the per-page PNG images (see readPNG)
        :param bucket: (type str)
            s3 bucket holding both the PDF and the PNG files

    Output
        :return: type tuple
            A 5-element tuple (df1, df2, forms_data, text_data, error):
              - df1        : balance sheet dataframe read from the PDF
              - df2        : balance sheet dataframe read from the PNG pages, or None
              - forms_data : key-value pairs of the balance sheet pages (see readForm)
              - text_data  : line text to confidence map of those pages (see readText)
              - error      : None on success, otherwise a message (all other entries are None)
    """
    # run the Textract job on the PDF
    res = runJob(bucket, pdf_path)

    # a failed Textract job has nothing to extract
    if res[0]['JobStatus'] == 'FAILED':
        error = 'Could not parse, JOB FAILED'
        return (None, None, None, None, error)

    # search the tables for the balance sheet; readTable gives None when nothing is found
    tb_response = readTable(res)

    if not isinstance(tb_response, tuple):
        error = 'No Balance Sheet found, or parsing error'
        return (None, None, None, None, error)

    # deconstruct the table response tuple into dataframe, page objects and page positions
    df1, page_obj, page_num = tb_response
    print('\nPage number(s) for extraction in PNG are {}\n'.format(page_num))

    # try to extract from the PNGs of the same pages (this can still be None);
    # the bucket is forwarded so the PNGs are read from the bucket given by the caller
    # rather than from readPNG's default bucket
    df2 = readPNG(page_num, png_path, bucket)

    # FORM and TEXT data are taken from the pages flagged as balance sheet
    forms_data = readForm(page_obj)
    text_data = readText(page_obj)

    print('\nTextract-PDF dataframe')
    print(df1)

    print('\nTextract-PNG dataframe')
    print(df2)

    return (df1, df2, forms_data, text_data, None)

## Main File Execution

In [37]:
if __name__ == "__main__":

    # Amazon Textract client and Sagemaker session
    textract = boto3.client('textract')
    s3 = boto3.client('s3')
    session = Session()
    
    # initiate s3 bucket and corresponding data/output folder
    bucket = 'ran-s3-systemic-risk'
    
    data_png_folder = 'Input/X-17A-5-PNG-SUBSETS/'
    data_pdf_folder = 'Input/X-17A-5-PDF-SUBSETS/'
    
    output_png_folder = 'Output/X-17A-5-PNG-RAW/'
    output_pdf_folder = 'Output/X-17A-5-PDF-RAW/'
    
    temp_folder = 'Temp/'
    
    # local folder where the JSON files are written before they are uploaded to s3
    local_temp_folder = '/home/ec2-user/SageMaker/SEC_X17A5/temp/'
    
    # JSON files kept in the s3 temp folder, in the order FORMS, TEXT, ERROR
    json_names = ['X17A5-FORMS.json', 'X17A5-TEXT.json', 'ERROR-TEXTRACT.json']
    
    # csv directory where we store balance sheet information 
    output_png_csvs = np.array(session.list_s3_files(bucket, output_png_folder))
    output_pdf_csvs = np.array(session.list_s3_files(bucket, output_pdf_folder))
    
    # temp directory where JSON files is stored
    temp = np.array(session.list_s3_files(bucket, temp_folder))
    
    # pdf directory where we store the broker-dealer information (the first listed key is dropped)
    pdf_files = np.array(session.list_s3_files(bucket, data_pdf_folder))[1:]
    png_files = np.array(session.list_s3_files(bucket, data_png_folder))[1:]
    png_file_directory = list(set((map(lambda x: '/'.join(x.split('/')[:-1]), png_files))))
    
    # ===========================================================================
    # Load in Temp JSON files if present (FORM, TEXT, ERROR)
    # ===========================================================================
    
    loaded = []
    for position, json_name in enumerate(json_names, start=1):
        s3_key = temp_folder + json_name
        
        if s3_key in temp:
            # download to a scratch file (temp1.json, temp2.json, temp3.json), read it, remove it
            local_copy = 'temp{}.json'.format(position)
            s3.download_file(bucket, s3_key, local_copy)
            
            with open(local_copy, 'r') as f: 
                loaded.append(json.loads(f.read()))
            
            os.remove(local_copy)
        else:
            # nothing stored yet, start from an empty dictionary
            loaded.append({})
    
    # KEY-VALUE data (Textract FORMS), TEXT-Confidence data, and Textract errors, keyed by base file name
    forms_dictionary, text_dictionary, error_dictionary = loaded
    
    # ===========================================================================
    # Perform Textract analysis on PDFs and PNGs
    # ===========================================================================
    
    # e.g. ['Input/X-17A-5-PDF-SUBSETS/42352-2012-02-29-subset.pdf'] otherwise pdf_files (full sample)
    select_sample = ['Input/X-17A-5-PDF-SUBSETS/1146184-2010-02-25-subset.pdf']

    try:
        for pdf_paths in pdf_files:
            
            # baseFile name to name export .csv file e.g. 1224385-2004-03-01.csv
            basefile = pdf_paths.split('/')[-1].split('-subset')[0]
            fileName = basefile + '.csv'
            print('\nPerforming OCR for {}'.format(fileName))
            
            # if file is not found in directory we extract the balance sheet
            # WE LOOK TO AVOID RE-RUNNING OLD TEXTRACT PARSES TO SAVE TIME
            if (output_pdf_folder + fileName not in output_pdf_csvs):
                
                # run Textract OCR job and extract the parsed data 
                png_paths = data_png_folder + basefile + '/'
                df1, df2, forms_data, text_data, error = textractParse(pdf_paths, png_paths, bucket)

                # if no error is reported we save FORMS, TEXT, DataFrame
                if error is None:

                    # store accompanying information for JSONs
                    forms_dictionary[basefile] = forms_data
                    text_dictionary[basefile]  = text_data
                    print(text_data)
                    
                    # writing data frame to .csv file
                    df1.to_csv(fileName, index=False)

                    # save contents to AWS S3 bucket
                    with open(fileName, 'rb') as data:
                        s3.put_object(Bucket=bucket, Key=output_pdf_folder + fileName, Body=data)
                    
                    # writing data frame to .csv file extracted from PNG (same local file name is reused)
                    if df2 is not None:
                        df2.to_csv(fileName, index=False)
                        
                        with open(fileName, 'rb') as data:
                            s3.put_object(Bucket=bucket, Key=output_png_folder + fileName, Body=data)
        
                    # remove local file after it has been created
                    os.remove(fileName)

                    print('-----------------------------------------------------')
                    print('Saved {} file to s3 bucket'.format(fileName))
                
                else:
                    error_dictionary[basefile] = error
                    
            else:
                print('{} has been downloaded'.format(fileName))
    
    finally:
        # ===========================================================================
        # Save JSON files for updated figures (FORM, TEXT, ERROR)
        # ===========================================================================
        
        # Saved even when the loop above stops on an exception: the CSVs uploaded so far make
        # the next run skip those files, so their FORMS/TEXT entries would otherwise be lost.
        snapshots = zip(json_names, (forms_dictionary, text_dictionary, error_dictionary))
        
        for json_name, payload in snapshots:
            local_path = local_temp_folder + json_name
            
            # write the dictionary to a local JSON file (the with-block closes the file)
            with open(local_path, 'w') as file: 
                json.dump(payload, file)
            
            # save contents to AWS S3 bucket
            with open(local_path, 'rb') as data: 
                s3.upload_fileobj(data, bucket, temp_folder + json_name)



Performing OCR for 1101180-2002-02-28.csv
1101180-2002-02-28.csv has been downloaded

Performing OCR for 1101180-2003-02-28.csv
1101180-2003-02-28.csv has been downloaded

Performing OCR for 1101180-2004-02-25.csv
1101180-2004-02-25.csv has been downloaded

Performing OCR for 1101180-2005-03-02.csv
1101180-2005-03-02.csv has been downloaded

Performing OCR for 1101180-2006-03-01.csv
1101180-2006-03-01.csv has been downloaded

Performing OCR for 1101180-2007-03-02.csv
Started job with id: 992e88f537bdde1ae96520051366bc994dc2e3e1a54605cdedece103b7013f3c
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page recieved: 4
Resultset page recieved: 5
Resultset page recieved: 6

Performing OCR for 1101180-2008-02-29.csv
1101180-2008-02-29.csv has been downloaded

Pe

-----------------------------------------------------
Saved 1146184-2013-02-26.csv file to s3 bucket

Performing OCR for 1146184-2014-02-28.csv
Started job with id: 2ed4d3167e8bcd606a766b620191e72718783ac6a482c8a4717c0f0ba76d0903
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page recieved: 4
Resultset page recieved: 5
Resultset page recieved: 6

Page number(s) for extraction in PNG are [5]

Started job with id: 1e2af55a9cbf259f2fd9b19593b0b98c9bd48297bedb2704b048f0f20c4b4d21
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1

Textract-PDF dataframe
                                                    0           1
0                         

Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page recieved: 4
Resultset page recieved: 5
Resultset page recieved: 6
Resultset page recieved: 7

Performing OCR for 1261467-2021-03-01.csv
1261467-2021-03-01.csv has been downloaded

Performing OCR for 1591458-2015-02-20.csv
1591458-2015-02-20.csv has been downloaded

Performing OCR for 1591458-2016-02-18.csv
Started job with id: b7a194974b0ea9994e3f06873c726846d30a3d743746feda9778f0b1e37709d1
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
R

Started job with id: 3a975bada72df0e0c85960bf4a062a55f74e408a1c31a94b9dea0e46db42fd3c
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page recieved: 4
Resultset page recieved: 5
Resultset page recieved: 6
Resultset page recieved: 7

Page number(s) for extraction in PNG are [10]

Started job with id: 2c43623cbe4f1ce4bd543bd75a008351447b03b2feb6dcef5fba1948caa5fb59
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1

Textract-PDF dataframe
                                                    0             1
0                                             Assets:              
1                                                Cash     $ 473,865
2            Cash segregated under federal regulation        28,100
3                                 Securities borrowed    25,140,310
4    Collateral held under securitie

Started job with id: a8c91ee6c9746100e5d320bb222e50ea06b60431b640cbe2b9b04a572d2095c8
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page recieved: 4
Resultset page recieved: 5
Resultset page recieved: 6
Resultset page recieved: 7

Performing OCR for 230611-2011-03-01.csv
230611-2011-03-01.csv has been downloaded

Performing OCR for 230611-2012-02-29.csv
230611-2012-02-29.csv has been downloaded

Performing OCR for 230611-2013-03-01.csv
Started job with id: 689564199e83406ddcb6658a8cfb5237a082f639b7f1213bc7caa38fa336c55b
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS

Job status: IN_PROGRESS
Job status: FAILED
Resultset page recieved: 1

Performing OCR for 356628-1998-02-23.csv
Started job with id: e4fed3ec54c5208104f8e7ba440d06585192877bb411907596879e55e2c2a649
Job status: IN_PROGRESS
Job status: FAILED
Resultset page recieved: 1

Performing OCR for 356628-2002-02-25.csv
356628-2002-02-25.csv has been downloaded

Performing OCR for 356628-2003-02-24.csv
356628-2003-02-24.csv has been downloaded

Performing OCR for 356628-2004-03-01.csv
356628-2004-03-01.csv has been downloaded

Performing OCR for 356628-2005-03-01.csv
356628-2005-03-01.csv has been downloaded

Performing OCR for 356628-2006-03-02.csv
356628-2006-03-02.csv has been downloaded

Performing OCR for 356628-2007-03-01.csv
356628-2007-03-01.csv has been downloaded

Performing OCR for 356628-2008-02-29.csv
356628-2008-02-29.csv has been downloaded

Performing OCR for 356628-2009-03-02.csv
356628-2009-03-02.csv has been downloaded

Performing OCR for 356628-2010-06-11.csv
356628-2010-06-11.

-----------------------------------------------------
Saved 356628-2018-02-23.csv file to s3 bucket

Performing OCR for 356628-2019-02-25.csv
356628-2019-02-25.csv has been downloaded

Performing OCR for 356628-2021-02-25.csv
356628-2021-02-25.csv has been downloaded

Performing OCR for 42352-2002-01-30.csv
42352-2002-01-30.csv has been downloaded

Performing OCR for 42352-2003-01-28.csv
42352-2003-01-28.csv has been downloaded

Performing OCR for 42352-2004-01-27.csv
42352-2004-01-27.csv has been downloaded

Performing OCR for 42352-2005-01-25.csv
42352-2005-01-25.csv has been downloaded

Performing OCR for 42352-2006-01-24.csv
42352-2006-01-24.csv has been downloaded

Performing OCR for 42352-2007-01-23.csv
42352-2007-01-23.csv has been downloaded

Performing OCR for 42352-2008-01-29.csv
42352-2008-01-29.csv has been downloaded

Performing OCR for 42352-2009-01-27.csv
Started job with id: 3ea5c7c165228627ef22830ce8fa327d814f136849503a06155552742d4c3138
Job status: IN_PROGRESS
Job sta

Started job with id: 9477bceea55c426164f764d93b04c5e64879a1d88188e3bfc21d4cb89e7c608c
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page recieved: 4

Performing OCR for 42352-2011-03-01.csv
42352-2011-03-01.csv has been downloaded

Performing OCR for 42352-2012-02-29.csv
42352-2012-02-29.csv has been downloaded

Performing OCR for 42352-2013-03-01.csv
42352-2013-03-01.csv has been downloaded

Performing OCR for 42352-2014-07-31.csv
42352-2014-07-31.csv has been downloaded

Performing OCR for 42352-2015-03-10.csv
42352-2015-03-10.csv has been downloaded

Performing OCR for 42352-2016-02-23.csv
42352-2016-02-23.csv has been downloaded

Performing OCR for 42352-2017-03-01.csv
42352-2017-03-01.csv has been downloaded

Performing OCR for 42352-2018-02-27.csv
42352-2018-02-27.csv has 

Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1

Textract-PDF dataframe
                                                    0            1
0                                        Net earnings    $ 189,344
1   Adjustments to reconcile net earnings to net c...             
2   Cumulative effect of change in accounting prin...        8,540
3                       Depreciation and amortization        3,452
4                                Deferred tax benefit      (9,463)
5   Increase in cash segregated in compliance with...     (28,423)
6   Increase in securities purchased under agreeme...  (7,134,690)
7   Increase in receivables from broker/dealers an...  (2,671,821)
8              Decrease in receivables from customers       59,000
9   Decrease in trading securities owned, at marke...      755,126
10                Increase in memberships in exchange        (311)
11                    Increase in due from affiliat

Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1

Performing OCR for 754542-2002-03-08.csv
754542-2002-03-08.csv has been downloaded

Performing OCR for 754542-2003-03-03.csv
754542-2003-03-03.csv has been downloaded

Performing OCR for 754542-2004-03-01.csv
754542-2004-03-01.csv has been downloaded

Performing OCR for 754542-2005-03-01.csv
754542-2005-03-01.csv has been downloaded

Performing OCR for 754542-2006-03-01.csv
754542-2006-03-01.csv has been downloaded

Performing OCR for 754542-2007-03-01.csv
754542-2007-03-01.csv has been downloaded

Performing OCR for 754542-2008-02-29.csv
754542-2008-02-29.csv has been downloaded

Performing OCR for 754542-2009-03-02.csv
Started job with id: df3fd97d9137f99809a726324254611e62ac56df1e036fe3f79900d63efa1fec
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEED

-----------------------------------------------------
Saved 803012-2012-01-30.csv file to s3 bucket

Performing OCR for 803012-2012-12-26.csv
803012-2012-12-26.csv has been downloaded

Performing OCR for 803012-2013-12-26.csv
803012-2013-12-26.csv has been downloaded

Performing OCR for 803012-2014-12-30.csv
803012-2014-12-30.csv has been downloaded

Performing OCR for 803012-2015-12-22.csv
803012-2015-12-22.csv has been downloaded

Performing OCR for 803012-2016-12-22.csv
803012-2016-12-22.csv has been downloaded

Performing OCR for 803012-2018-03-22.csv
803012-2018-03-22.csv has been downloaded

Performing OCR for 803012-2018-12-19.csv
Started job with id: d0eb569b1a4b38caac1550b4016890bc5a7c22ad0ca8996fdf97f9c42a8e5dba
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page reciev

Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page recieved: 4
Resultset page recieved: 5
Resultset page recieved: 6
Resultset page recieved: 7

Performing OCR for 808379-2008-02-29.csv
808379-2008-02-29.csv has been downloaded

Performing OCR for 808379-2010-03-01.csv
808379-2010-03-01.csv has been downloaded

Performing OCR for 808379-2011-06-14.csv
808379-2011-06-14.csv has been downloaded

Performing OCR for 808379-2012-03-05.csv
808379-2012-03-05.csv has been downloaded

Performing OCR for 808379-2013-03-01.csv
808379-2013-03-01.csv has been downloaded

Performing OCR for 808379-2014-03-04.csv
808379-2014-03-04.csv has been downloaded

Performing OCR for 808379-2015-03-02.csv
808379-2015-03-02.csv has been downloaded

Performing OCR for 808379-2016-02-29.csv
808379-2016-02-

-----------------------------------------------------
Saved 867626-2002-04-29.csv file to s3 bucket

Performing OCR for 867626-2003-04-25.csv
Started job with id: 617d8bb4e581eb32eb5e6d225dce7a9e0df65e7c7b8902c15532cd742459cc4a
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page recieved: 4
Resultset page recieved: 5

Page number(s) for extraction in PNG are [3]

Started job with id: 4e810d64243f61e6361bf9b184ed98c85a1c939e99f6412bd2ff7e1528204f90
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1

Textract-PDF dataframe
                                                    0                1
0                           Cash and cash equivalents        $ 916,177
1   Cash and securities deposited with clearing or.

Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page recieved: 4
Resultset page recieved: 5
Resultset page recieved: 6
Resultset page recieved: 7

Page number(s) for extraction in PNG are [4]

Started job with id: ee113c3919e91609393e73ece81e2532c51cf3f1883c5e039e7d27a213ab8294
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1

Textract-PDF dataframe
                                                    0                 1
0                                              ASSETS                  
1      

Started job with id: 2fb42766a3d57bd06a1f7039de88c766e4cc93f1e11a3d390680e991cdac38cb
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page recieved: 4
Resultset page recieved: 5
Resultset page recieved: 6
Resultset page recieved: 7

Page number(s) for extraction in PNG are [3]

Started job with id: 743b09e654da6fd1be3790356fc55c083783e74c60898a7800675474356d50bb
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1

Textract-PDF dataframe
                                                    0             1
0                                              ASSETS              
1                           Cash and cash equivalents      $ 73,972
2                    Cash deposited under federal and

-----------------------------------------------------
Saved 867626-2013-02-28.csv file to s3 bucket

Performing OCR for 867626-2014-03-04.csv
867626-2014-03-04.csv has been downloaded

Performing OCR for 867626-2015-02-27.csv
Started job with id: 2ea405c4ceb0342d0d3c87a5e09ebca09e0fcc5651da9640e1b13255b4f048b2
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page recieved: 4
Resultset page recieved: 5
Resultset page recieved: 6
Resultset page recieved: 7

Page number(s) for extraction in PNG are [6]

Started job with id: a6c6e57d80f03ad0491a55e9d4d07420ce93d4015251e015fa2d4100e5324ea4
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job stat

-----------------------------------------------------
Saved 867626-2015-02-27.csv file to s3 bucket

Performing OCR for 867626-2016-02-26.csv
867626-2016-02-26.csv has been downloaded

Performing OCR for 867626-2017-02-28.csv
867626-2017-02-28.csv has been downloaded

Performing OCR for 867626-2018-02-27.csv
867626-2018-02-27.csv has been downloaded

Performing OCR for 867626-2019-02-27.csv
867626-2019-02-27.csv has been downloaded

Performing OCR for 867626-2020-02-27.csv
867626-2020-02-27.csv has been downloaded

Performing OCR for 867626-2021-02-26.csv
867626-2021-02-26.csv has been downloaded

Performing OCR for 874362-2002-01-29.csv
874362-2002-01-29.csv has been downloaded

Performing OCR for 874362-2003-01-30.csv
874362-2003-01-30.csv has been downloaded

Performing OCR for 874362-2004-01-30.csv
874362-2004-01-30.csv has been downloaded

Performing OCR for 874362-2005-01-31.csv
874362-2005-01-31.csv has been downloaded

Performing OCR for 874362-2006-01-30.csv
874362-2006-01-30.

-----------------------------------------------------
Saved 87634-2004-02-27.csv file to s3 bucket

Performing OCR for 87634-2005-02-28.csv
87634-2005-02-28.csv has been downloaded

Performing OCR for 87634-2006-02-28.csv
87634-2006-02-28.csv has been downloaded

Performing OCR for 87634-2006-09-21.csv
87634-2006-09-21.csv has been downloaded

Performing OCR for 87634-2007-03-01.csv
87634-2007-03-01.csv has been downloaded

Performing OCR for 87634-2008-03-03.csv
87634-2008-03-03.csv has been downloaded

Performing OCR for 87634-2009-02-27.csv
87634-2009-02-27.csv has been downloaded

Performing OCR for 87634-2010-03-01.csv
87634-2010-03-01.csv has been downloaded

Performing OCR for 87634-2011-02-28.csv
87634-2011-02-28.csv has been downloaded

Performing OCR for 87634-2012-02-27.csv
Started job with id: a10e9af478dae85badaafd72d525119d430f73b2f6c4d6fb66297e5a12b0fb5c
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRES

Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2

Performing OCR for 890203-2011-05-31.csv
890203-2011-05-31.csv has been downloaded

Performing OCR for 890203-2012-05-30.csv
890203-2012-05-30.csv has been downloaded

Performing OCR for 890203-2013-05-29.csv
890203-2013-05-29.csv has been downloaded

Performing OCR for 890203-2014-02-28.csv
890203-2014-02-28.csv has been downloaded

Performing OCR for 890203-2015-03-02.csv
890203-2015-03-02.csv has been downloaded

Performing OCR for 890203-2016-02-26.csv
890203-2016-02-26.csv has been downloaded

Performing OCR for 890203-2017-03-15.csv
890203-2017-03-15.csv has been downloaded

Performing OCR for 890203-2018-03-01.csv
890203-2018-03-01.csv has been downloaded

Performing OCR for 890203-2019-03-07.csv
890203-2019-03-07.csv has been downloaded


{'RBC Dain Rauscher Inc.': 99.86534881591797, 'Statements of Financial Condition': 99.96200561523438, 'December 31, 2001 and 2000': 99.80337524414062, '(In Thousands, except share information)': 99.94025421142578, 'ASSETS': 99.81194305419922, '2001': 99.93791198730469, '2000': 99.85255432128906, 'Cash and cash equivalents': 99.94498443603516, '$': 95.35848999023438, '266,729': 99.89884185791016, '42,278': 99.84497833251953, 'Receivable from customers': 99.91924285888672, '1,056,882': 99.75092315673828, '1,432,317': 99.33039855957031, 'Receivable from brokers, dealers and clearing organizations': 98.96827697753906, '273,600': 98.49810028076172, '198,742': 98.87074279785156, 'Securities purchased under agreements to resell': 99.93888854980469, '163,155': 98.1037826538086, '36,542': 99.49396514892578, 'Trading securities owned, at market value': 99.73739624023438, '392,554': 99.6880874633789, '329,327': 99.57742309570312, 'Equipment and leasehold improvements, at cost,': 99.45893859863281

-----------------------------------------------------
Saved 89562-2005-01-31.csv file to s3 bucket

Performing OCR for 89562-2006-01-30.csv
Started job with id: 80ef1609d74fa89941303d7b783dc80820440d74a6521458aa90fac9567e3166
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page recieved: 4
Resultset page recieved: 5
Resultset page recieved: 6
Resultset page recieved: 7
Resultset page recieved: 8

Page number(s) for extraction in PNG are [5, 6]

Started job with id: 8b07bcb58f9483c04a58a6e025729b4f75fd0e66714beb5ef697b153fb0a2d15
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Started job with id: 8434e03d0b2d556c6e6db623329e6072a2e66f1b5d25d9b2ee3cdb55d5f958f5
Job status: IN_PROGRESS
J

-----------------------------------------------------
Saved 89562-2006-01-30.csv file to s3 bucket

Performing OCR for 89562-2007-01-29.csv
Started job with id: 0fc59d29f593e5d20f6b4bb110129d4a0c4943eb6cf169392d126b9ccc75a8a4
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page recieved: 4
Resultset page recieved: 5
Resultset page recieved: 6
Resultset page recieved: 7
Resultset page recieved: 8

Page number(s) for extraction in PNG are [5, 6]

Started job with id: 5e578567fb15cdd20ab6aca44c558d5a4ed0b013daf5be034e37c6b806275597
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_P

-----------------------------------------------------
Saved 89562-2007-01-29.csv file to s3 bucket

Performing OCR for 89562-2008-01-29.csv
Started job with id: 4a3b027cc148f1db634f255d4d4b2b74d17633a433c30961dea2f8e12370a8cb
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page recieved: 4
Resultset page recieved: 5
Resultset page recieved: 6
Resultset page recieved: 7
Resultset page recieved: 8
Resultset page recieved: 9

Page number(s) for extraction in PNG are [5]

Started job with id: b95da97f5d83e27524db5571255f56fb5d12cf7856397d9e6fc04afbe5215f46
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1

Textract-PDF dataframe
                                                    0                  1
0                   In millions, excep

-----------------------------------------------------
Saved 89562-2008-01-29.csv file to s3 bucket

Performing OCR for 91154-2002-03-01.csv
91154-2002-03-01.csv has been downloaded

Performing OCR for 91154-2003-03-03.csv
91154-2003-03-03.csv has been downloaded

Performing OCR for 91154-2004-02-27.csv
91154-2004-02-27.csv has been downloaded

Performing OCR for 91154-2005-03-01.csv
91154-2005-03-01.csv has been downloaded

Performing OCR for 91154-2006-03-01.csv
Started job with id: c7a50a08ae972f405dd57d28d26c4a5c0d4359cc9bd45286d16f045110dd0010
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page recieved: 4
Resultset page recieved: 5
Resultset page recieved: 6
Resultset page recieved: 7
Resultset page recieved: 8

Performing OCR 

-----------------------------------------------------
Saved 91154-2015-03-02.csv file to s3 bucket

Performing OCR for 91154-2016-03-01.csv
91154-2016-03-01.csv has been downloaded

Performing OCR for 91154-2017-03-02.csv
91154-2017-03-02.csv has been downloaded

Performing OCR for 91154-2018-03-05.csv
91154-2018-03-05.csv has been downloaded

Performing OCR for 91154-2019-03-05.csv
91154-2019-03-05.csv has been downloaded

Performing OCR for 91154-2020-03-02.csv
91154-2020-03-02.csv has been downloaded

Performing OCR for 920417-2015-03-02.csv
920417-2015-03-02.csv has been downloaded

Performing OCR for 920417-2016-02-29.csv
920417-2016-02-29.csv has been downloaded

Performing OCR for 920417-2017-03-01.csv
920417-2017-03-01.csv has been downloaded

Performing OCR for 920417-2018-02-27.csv
920417-2018-02-27.csv has been downloaded

Performing OCR for 920417-2019-02-28.csv
920417-2019-02-28.csv has been downloaded

Performing OCR for 920417-2020-02-28.csv
920417-2020-02-28.csv has bee

-----------------------------------------------------
Saved 922792-2011-03-01.csv file to s3 bucket

Performing OCR for 922792-2012-02-29.csv
922792-2012-02-29.csv has been downloaded

Performing OCR for 922792-2014-03-04.csv
922792-2014-03-04.csv has been downloaded

Performing OCR for 922792-2015-03-02.csv
922792-2015-03-02.csv has been downloaded

Performing OCR for 922792-2016-02-29.csv
922792-2016-02-29.csv has been downloaded

Performing OCR for 922792-2017-03-01.csv
922792-2017-03-01.csv has been downloaded

Performing OCR for 922792-2018-03-01.csv
922792-2018-03-01.csv has been downloaded

Performing OCR for 922792-2019-02-28.csv
922792-2019-02-28.csv has been downloaded

Performing OCR for 922792-2019-03-01.csv
922792-2019-03-01.csv has been downloaded

Performing OCR for 922792-2020-02-28.csv
922792-2020-02-28.csv has been downloaded

Performing OCR for 922792-2021-02-26.csv
922792-2021-02-26.csv has been downloaded


In [36]:
# # single reading for testing purposes and debugging Textract results e.g. 853784-2002-03-01
# textractParse('Input/X-17A-5-PDF-SUBSETS/89562-2005-01-31-subset.pdf', 
#               'Input/X-17A-5-PNG-SUBSETS/89562-2005-01-31/', 'ran-s3-systemic-risk')

Started job with id: 4fbf117fc4b905d0524fccd453dd5c70949261df4147dc4bcebbfac78ea26b76
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Jo

(                                                    0         1
 0                                              Assets          
 1                           Cash and cash equivalents     $ 924
 2   Cash and securities segregated and on deposit ...     1,633
 3   Securities and other inventory positions owned...    80,575
 4                   Securities received as collateral     1,202
 5                          Collateralized agreements:          
 6     Securities purchased under agreements to resell    64,168
 7                                 Securities borrowed    94,818
 8                                        Receivables:          
 9         Brokers, dealers and clearing organizations     5,066
 10                                          Customers     8,934
 11                                         Affiliates    15,748
 12                                             Others       147
 13  Property, equipment and leasehold improvements...       168
 14                      