In [1]:
# Run on first instance to install required libraries
%pip install smart_open
%pip install minecart
%pip install textract-trp

Collecting smart_open
  Downloading smart_open-5.0.0-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 2.8 MB/s eta 0:00:011
[?25hInstalling collected packages: smart-open
Successfully installed smart-open-5.0.0
Note: you may need to restart the kernel to use updated packages.
Collecting minecart
  Downloading minecart-0.3.0-py3-none-any.whl (23 kB)
Collecting pdfminer3k
  Downloading pdfminer3k-1.3.4-py3-none-any.whl (100 kB)
[K     |████████████████████████████████| 100 kB 5.1 MB/s ta 0:00:011
Installing collected packages: pdfminer3k, minecart
Successfully installed minecart-0.3.0 pdfminer3k-1.3.4
Note: you may need to restart the kernel to use updated packages.
Collecting textract-trp
  Downloading textract_trp-0.1.3-py3-none-any.whl (5.8 kB)
Installing collected packages: textract-trp
Successfully installed textract-trp-0.1.3
Note: you may need to restart the kernel to use updated packages.


In [2]:
import time 
import re
import os
import trp
import boto3
import minecart
import json
import logging 

import numpy as np
import pandas as pd

from smart_open import open
from sagemaker.session import Session

# AWS Asynchronous Textract Script (requesting Job)
**Content adapted from the Amazon AWS Textract code-samples repository (see the linked [URL](https://github.com/aws-samples/amazon-textract-code-samples/blob/master/python/12-pdf-text.py))**

In [3]:
def startJob(s3BucketName:str, objectName:str) -> str:
    """
    Submits an asynchronous Textract document-analysis request for an S3 object
    and returns the identifier of the queued job
    """
    # location of the document Textract should read (bucket + object key)
    document_location = {
        'S3Object': {
            'Bucket': s3BucketName,
            'Name': objectName,
        }
    }

    textract_client = boto3.client('textract')

    # request both FORMS (key-values) and TABLES from the OCR
    job = textract_client.start_document_analysis(
        DocumentLocation=document_location,
        FeatureTypes=['FORMS', 'TABLES'],
    )

    # the job ID is what the polling / result functions need
    return job["JobId"]

In [4]:
def isJobComplete(jobId:str) -> str:
    """
    Polls the Textract job until it is no longer IN_PROGRESS and returns the last status seen
    """
    # short pause before the first status request
    time.sleep(1)

    textract_client = boto3.client('textract')

    while True:
        # ask the service for the current job status and report it
        current = textract_client.get_document_analysis(JobId=jobId)["JobStatus"]
        print("Job status: {}".format(current))

        # any status other than IN_PROGRESS ends the polling
        if current != "IN_PROGRESS":
            return current

        # job still running: wait 5 seconds before asking again
        time.sleep(5)

In [5]:
def getJobResults(jobId:str) -> list:
    """
    Collects every result-set page of a Textract job by following NextToken pointers
    """
    textract_client = boto3.client('textract')

    collected = []                 # one response dictionary per result-set page
    request = {'JobId': jobId}     # NextToken is added once the service hands one back

    while True:
        batch = textract_client.get_document_analysis(**request)
        collected.append(batch)
        print("Resultset page recieved: {}".format(len(collected)))

        # a missing (or empty) NextToken means there is nothing left to fetch
        token = batch.get('NextToken')
        if not token:
            break
        request['NextToken'] = token

    # all page responses, in the order they were received
    return collected

In [6]:
def runJob(bucket:str, key:str) -> list:
    """
    Function designed to call an AWS Textract job (implements helper functions above)
    ------------------------------------------------------------------------------------------
    Input
        :param bucket: (type str)
            Name of the s3 bucket holding the document
        :param key: (type str)
            Object key of the document inside the bucket

    Output
        :return: type list
            The list of response dictionaries produced by getJobResults. The responses are
            returned whatever the final job status is; callers inspect
            response[0]['JobStatus'] to detect a FAILED job.
    """
    jobId = startJob(bucket, key)
    print("Started job with id: {}".format(jobId))

    # isJobComplete blocks until the job leaves IN_PROGRESS. Its return value is the status
    # string (always truthy), so it is used purely as a wait and not as a condition --
    # branching on it would leave the result unbound if it were ever falsy.
    isJobComplete(jobId)

    return getJobResults(jobId)

# AWS Extraction Scripts (Key-Value Pairs)
**This content was adapted from the AWS documentation example that extracts key-value pairs from form documents, using Block objects stored in a map (see [URL](https://docs.aws.amazon.com/textract/latest/dg/examples-extract-kvp.html)).**

In [7]:
def find_value_block(key_block, value_map):
    """
    Retrieving value block from AWS textract job, this contains the value text
    ------------------------------------------------------------------------------------------
    Input
        :param key_block: (type dict)
            A KEY block; its 'Relationships' entry (if any) is a list of dictionaries
            with 'Type' and 'Ids' fields
        :param value_map: (type dict)
            Maps block ids to VALUE blocks

    Output
        :return: type dict
            The VALUE block referenced by the key (the last id wins if several are listed).
            An empty dictionary is returned when the key block has no 'Relationships' entry
            or no relationship of type VALUE, so that get_text yields '' for it.

    Raises
        KeyError if a referenced value id is not present in value_map
    """
    # default for keys without a value; an empty block has no text
    value_block = {}

    # a key block should carry a VALUE and a CHILD relationship, but either may be absent
    for relationship in key_block.get('Relationships') or []:

        # only the VALUE relationship points at the value block
        if relationship['Type'] == 'VALUE':

            # normally a single id is stored in the list
            for value_id in relationship['Ids']:
                value_block = value_map[value_id]

    return value_block

In [8]:
def get_kv_relationship(key_map, value_map, block_map):
    """
    Builds the text-to-text mapping of KEY and VALUE blocks from a FORM OCR Textract job
    """
    # line item text -> accounting value text (e.g. {'Total Assets':'$ 189,232'})
    pairs = {}

    # the block ids themselves are not needed, only the key blocks
    for key_block in key_map.values():

        # locate the value block that this key points at
        matched_value = find_value_block(key_block, value_map)

        # resolve both blocks to their text and record the pair
        label = get_text(key_block, block_map)
        pairs[label] = get_text(matched_value, block_map)

    return pairs

In [9]:
def get_text(result, blocks_map):
    """
    Assembles the text of a block from its CHILD blocks (words and selected check marks)
    """
    # a block without relationships has no child words, hence no text
    if 'Relationships' not in result:
        return ''

    pieces = []

    # e.g. 'Relationships' : [{'Type' : 'CHILD', 'Ids': ['e2b3b12f-ebb7-4f6e-914f-97b315672530']}]
    for relationship in result['Relationships']:

        # only CHILD relationships reference the blocks that carry text
        if relationship['Type'] != 'CHILD':
            continue

        for child_id in relationship['Ids']:

            # look up the child block in the block map
            child = blocks_map[child_id]
            kind = child['BlockType']

            if kind == 'WORD':
                # every word is followed by a single space
                pieces.append(child['Text'] + ' ')

            elif kind == 'SELECTION_ELEMENT' and child['SelectionStatus'] == 'SELECTED':
                # a ticked option button/mark is rendered as an X
                pieces.append('X ')

    # concatenated text (keeps the trailing space after the last piece)
    return ''.join(pieces)

# OCR Wrapper Functions
**These scripts perform an OCR job with AWS Textract and return well-formatted data**

In [10]:
def trp2df(table:'trp.Table') -> pd.DataFrame:
    """
    Function designed to convert a trp table into a dataframe
    ------------------------------------------------------------------------------------------
    Input
        :param table: (type trp.Table)
            A trp table object parsed from a pdf using AWS Textract; only table.rows,
            row.cells and cell.text are read

    Output
        :return: type pandas.DataFrame
            A DataFrame holding the stripped text of every cell. The number of columns is
            the number of cells in the first row: shorter rows are padded with '' and longer
            rows are cut to that width. A table without rows gives an empty DataFrame.
    """
    # NOTE: the annotation is a forward-reference string so that defining this function
    # does not evaluate trp.Table
    rows = table.rows

    # a table with no rows has no first row to measure, return an empty frame
    if len(rows) == 0:
        return pd.DataFrame()

    width = len(rows[0].cells)      # number of columns in table (taken from first row)
    matrix = []                     # (N x width) matrix of cell text

    for row in rows:

        # move column-wise to get the stripped text, never reading past the table width
        texts = [cell.text.strip() for cell in row.cells[:width]]

        # rows with fewer cells than the first row are padded with empty strings
        texts.extend([''] * (width - len(texts)))
        matrix.append(texts)

    return pd.DataFrame(matrix)

In [11]:
def readTable(response:list) -> tuple:
    """
    Function to transform AWS Textract object to a dataframe, by searching for tables
    ------------------------------------------------------------------------------------------
    Input
        :param response: (type list)
            An AWS Textract response object corresponding to pages of a given document

    Output
        :return: type tuple (or None)
            A 3-item tuple (dataframe, page objects, page numbers): the concatenated dataframe
            of all tables flagged as balance sheet, the trp page objects those tables were
            found on, and the zero-based positions of those pages in the document.
            None is returned when a table with more than 3 columns is met, or when the
            document is exhausted without finding a table that mentions liabilities.
    """

    catDF = []          # in the event multiple tables detected (concat them)
    page_series = []    # keep track of page objects where balance sheet was flagged
    page_nums = []      # zero-based page positions matching page_series

    # format the Textract response type
    doc = trp.Document(response)

    # iterate through document pages, counting them from zero
    for page_count, page in enumerate(doc.pages):

        # iterate through page tables
        for table in page.tables:

            # convert trp-table into dataframe object
            df = trp2df(table)

            # remove columns that are completely empty
            empty_cols = [col for col in df.columns if (df[col] == '').all()]
            df = df.drop(empty_cols, axis=1)

            # number of columns in dataframe
            n = df.columns.size

            # reset the column names to 0..n-1
            df.columns = np.arange(n)

            ##############################################################
            #                           NOTES
            #          a good dataframe should have 2-3 columns
            ##############################################################

            # more than 3 columns most likely means an issue in parsing
            # NOTE(review): this abandons the whole document (every remaining table and page),
            # not just this table -- confirm that is intended rather than skipping the table
            if n > 3:
                return None

            # fewer than 2 columns cannot hold line items and values, skip the table
            if n < 2:
                continue

            ##############################
            # Balance Sheet Assumptions
            ##############################

            # this is the column with all line items (e.g. Cash, Total Assets, Total Liabilities)
            line_items = df[df.columns[0]]

            # "cash" at the beginning of a string, ignoring case (asset check)
            assetCheck = line_items.str.contains('^Cash', regex=True, flags=re.IGNORECASE)

            # "Liabilities" / "Liability" at the start or end of a string, ignoring case (liability check)
            debtCheck1 = line_items.str.contains('Liabilities$|^Liabilities', regex=True, flags=re.IGNORECASE)
            debtCheck2 = line_items.str.contains('Liability$|^Liability', regex=True, flags=re.IGNORECASE)

            # presence of a $ sign in the second column; used to avoid reading the table of
            # contents, which was flagged in prior reads (raw string: \$ and \] are regex
            # escapes, not string escapes)
            dollarCheck = df[df.columns[1]].str.contains(r'\$[^\]]+', regex=True, flags=re.IGNORECASE)

            ##############################
            # Balance Sheet Determination
            ##############################

            has_line_item = (assetCheck | debtCheck1 | debtCheck2).any()   # asset or liability term found
            has_dollar = dollarCheck.any()                                 # '$' sign found
            has_liability = debtCheck1.any() or debtCheck2.any()           # liability term found

            # an asset or liability term together with a $ sign marks a balance sheet table
            if has_line_item and has_dollar:

                # we append tables since asset and liability tables are often separate
                # there is no loss of generality if asset and liability terms are in one table
                catDF.append(df)

                # record each flagged page only once
                if page not in page_series:
                    page_series.append(page)
                    page_nums.append(page_count)

                # the liability table closes the balance sheet: return everything gathered so far
                if has_liability:
                    return (pd.concat(catDF), page_series, page_nums)

    # no liability table was found anywhere in the document
    return None
        

In [40]:
def readPNG(pages:list, png_path:str, bucket='ran-s3-systemic-risk') -> tuple:
    """
    Function to run AWS Textract on single-page PNG files and return the balance sheet tables
    ------------------------------------------------------------------------------------------
    Input
        :param pages: (type list)
            Page identifiers used to build the PNG file names
        :param png_path: (type str)
            s3 folder (ending with '/') holding the PNG files. Files are expected to be named
            '<png_path><subfolder>-p<page>.png', where <subfolder> is the last folder of png_path
        :param bucket: (type str)
            Name of the s3 bucket the PNG files are read from

    Output
        :return: type pandas.DataFrame (or None)
            The concatenation of all tables flagged as balance sheet, returned as soon as a
            table mentioning liabilities is found. None is returned when a table with more
            than 3 columns is met, or when no liability table is found in any PNG.
    """
    subfolder = png_path.split('/')[-2]      # subfolder where PNG files are stored

    # construct PNG directories with relevant pages
    textract_paths = [png_path + subfolder + '-p{}.png'.format(idx) for idx in pages]

    catDF = []          # in the event multiple pages we concat them

    for path in textract_paths:

        try:
            # Textract responses for this PNG
            res = runJob(bucket, path)

            # a failed Textract job has nothing to extract, move to the next PNG
            if res[0]['JobStatus'] == 'FAILED':
                continue

            # format the Textract response type
            doc = trp.Document(res)

            # iterate through document pages
            for page in doc.pages:

                # iterate through page tables
                for table in page.tables:

                    # convert trp-table into dataframe object
                    df = trp2df(table)

                    # remove columns that are completely empty
                    empty_cols = [col for col in df.columns if (df[col] == '').all()]
                    df = df.drop(empty_cols, axis=1)

                    # number of columns in dataframe
                    n = df.columns.size

                    # reset the column names to 0..n-1
                    df.columns = np.arange(n)

                    ##############################################################
                    #                           NOTES
                    #          a good dataframe should have 2-3 columns
                    ##############################################################

                    # more than 3 columns most likely means an issue in parsing
                    # NOTE(review): this abandons every remaining table and PNG, not just
                    # this table -- confirm that is intended rather than skipping the table
                    if n > 3:
                        return None

                    # fewer than 2 columns cannot hold line items and values, skip the table
                    if n < 2:
                        continue

                    ##############################
                    # Balance Sheet Assumptions
                    ##############################

                    # this is the column with all line items (e.g. Cash, Total Assets, Total Liabilities)
                    line_items = df[df.columns[0]]

                    # "cash" at the beginning of a string, ignoring case
                    assetCheck = line_items.str.contains('^Cash', regex=True, flags=re.IGNORECASE)

                    # "Liabilities" / "Liability" at the start or end of a string, ignoring case
                    debtCheck1 = line_items.str.contains('Liabilities$|^Liabilities',
                                                         regex=True, flags=re.IGNORECASE)
                    debtCheck2 = line_items.str.contains('Liability$|^Liability',
                                                         regex=True, flags=re.IGNORECASE)

                    # presence of a $ sign in the second column; used to avoid reading the table
                    # of contents (raw string: \$ and \] are regex escapes, not string escapes)
                    dollarCheck = df[df.columns[1]].str.contains(r'\$[^\]]+', regex=True, flags=re.IGNORECASE)

                    ##############################
                    # Balance Sheet Determination
                    ##############################

                    has_line_item = (assetCheck | debtCheck1 | debtCheck2).any()   # asset or liability term found
                    has_dollar = dollarCheck.any()                                 # '$' sign found
                    has_liability = debtCheck1.any() or debtCheck2.any()           # liability term found

                    # an asset or liability term together with a $ sign marks a balance sheet table
                    if has_line_item and has_dollar:

                        # we append tables since asset and liability tables are often separate
                        catDF.append(df)

                        # the liability table closes the balance sheet: return everything gathered
                        if has_liability:
                            return pd.concat(catDF)

        # best-effort: a PNG that cannot be parsed is skipped, but the failure is reported
        # instead of being silently discarded (KeyboardInterrupt / SystemExit now propagate)
        except Exception as exc:
            logging.warning('Textract PNG extraction failed for %s: %r', path, exc)

    # default return None
    return None

In [28]:
def readForm(doc_pages:list) -> dict:
    """
    Function to transform AWS Textract object to a dictionary, by searching for key value pairs
    ------------------------------------------------------------------------------------------
    Input
        :param doc_pages: (type list)
            TRP page(s) of an AWS Textract response; each page exposes its raw blocks
            through a `blocks` attribute

    Output
        :return: type dict
            A python dictionary that maps KEYS (line items) to VALUES (corresponding records),
            as produced by get_kv_relationship (e.g. {'Cash and cash equivalents' : '$ 12,513'})
    """
    # KEY blocks, VALUE blocks and every block seen, each indexed by block id
    key_map, value_map, block_map = {}, {}, {}

    for page in doc_pages:
        for block in page.blocks:

            # every block is remembered so that its text can be resolved later
            block_id = block['Id']
            block_map[block_id] = block

            # only key-value sets are split into KEY and VALUE maps
            if block['BlockType'] != "KEY_VALUE_SET":
                continue

            # a block labelled with the KEY entity type is a key, anything else is a value
            target = key_map if 'KEY' in block['EntityTypes'] else value_map
            target[block_id] = block

    # convert block objects to text dictionary map
    return get_kv_relationship(key_map, value_map, block_map)

In [29]:
def readText(doc_pages:list) -> dict:
    """
    Function to transform AWS Textract object to a dictionary of text values and confidence
    ------------------------------------------------------------------------------------------
    Input
        :param doc_pages: (type list)
            TRP page(s) of an AWS Textract response; each page exposes its raw blocks
            through a `blocks` attribute

    Output
        :return: type dict
            A python dictionary that maps the TEXT of every LINE block to the confidence
            reported for it (e.g. {'Cash and cash equivalents' : 99.97891}); when the same
            text occurs more than once the last occurrence is kept
    """
    # walk every block of every page, keeping only the LINE blocks
    return {
        block['Text']: block['Confidence']
        for page in doc_pages
        for block in page.blocks
        if block['BlockType'] == "LINE"
    }

## Extract Balance Sheet information

In [30]:
def textractParse(pdf_path:str, png_path:str, bucket:str) -> tuple:
    """
    Function runs a Textract job on a PDF and extracts its Balance Sheet information
    ------------------------------------------------------------------------------------------
    Input
        :param pdf_path: (type str)
            s3 object key of the PDF to be parsed
        :param png_path: (type str)
            s3 folder holding the single-page PNG files of the same document
        :param bucket: (type str)
            Name of the s3 bucket holding both the PDF and the PNG files

    Output
        :return: type tuple
            A 5-item tuple (df1, df2, forms_data, text_data, error).
            On success: df1 is the dataframe read from the PDF, df2 the dataframe read from
            the PNG(s) (may be None), forms_data the KEY-VALUE dictionary, text_data the
            TEXT-confidence dictionary, and error is None.
            On failure the first four items are None and error is a message string.
    """
    # Textract responses for the PDF
    res = runJob(bucket, pdf_path)

    # a failed Textract job has nothing to extract
    if res[0]['JobStatus'] == 'FAILED':
        return (None, None, None, None, 'Could not parse, JOB FAILED')

    # perform OCR and return balance sheet with corresponding page object(s)
    tb_response = readTable(res)

    # readTable returns None when no balance sheet is found or a table could not be parsed
    if not isinstance(tb_response, tuple):
        return (None, None, None, None, 'No Balance Sheet found, or parsing error')

    # deconstruct the table response tuple into dataframe and page object parts
    df1, page_obj, page_num = tb_response
    print('\nPage number(s) for extraction in PNG are {}\n'.format(page_num))

    # try to extract from a PNG (we can still return a None here); the bucket is forwarded
    # so that the PNGs are read from the same bucket as the PDF rather than readPNG's default
    df2 = readPNG(page_num, png_path, bucket)

    # provided balance sheet page objects we select FORM and TEXT data
    forms_data = readForm(page_obj)
    text_data = readText(page_obj)

    print('\nTextract-PDF dataframe')
    print(df1)

    print('\nTextract-PNG dataframe')
    print(df2)

    return (df1, df2, forms_data, text_data, None)

## Main File Execution

In [23]:
if __name__ == "__main__":

    # Amazon Textract client and Sagemaker session
    textract = boto3.client('textract')
    s3 = boto3.client('s3')
    session = Session()
    
    # initiate s3 bucket and corresponding data/output folder
    bucket = 'ran-s3-systemic-risk'
    
    data_png_folder = 'Input/X-17A-5-PNG-SUBSETS/'
    data_pdf_folder = 'Input/X-17A-5-PDF-SUBSETS/'
    
    output_png_folder = 'Output/X-17A-5-PNG-RAW/'
    output_pdf_folder = 'Output/X-17A-5-PDF-RAW/'
    
    temp_folder = 'Temp/'
    
    # When True every selected file is parsed again even if its .csv already exists in the
    # output folder. This keeps the behaviour of the previous `... or True` condition;
    # set to False to skip files that have already been parsed and save Textract time.
    force_rerun = True
    
    # csv directory where we store balance sheet information 
    output_png_csvs = np.array(session.list_s3_files(bucket, output_png_folder))
    output_pdf_csvs = np.array(session.list_s3_files(bucket, output_pdf_folder))
    
    # temp directory where JSON files is stored
    temp = np.array(session.list_s3_files(bucket, temp_folder))
    
    # pdf directory where we store the broker-dealer information 
    # (the first listed entry is dropped -- presumably the folder placeholder; TODO confirm)
    pdf_files = np.array(session.list_s3_files(bucket, data_pdf_folder))[1:]
    png_files = np.array(session.list_s3_files(bucket, data_png_folder))[1:]
    png_file_directory = list(set((map(lambda x: '/'.join(x.split('/')[:-1]), png_files))))
    
    
    # ===========================================================================
    # Load in Temp JSON files if present (FORM, TEXT, ERROR)
    # ===========================================================================
    
    def _load_temp_json(key, local_name):
        """Return the dictionary stored in the s3 JSON file `key`, or {} if it is not listed in Temp."""
        if key not in temp:
            return {}
        
        # retrieving downloaded file from s3 bucket
        s3.download_file(bucket, key, local_name)
        try:
            with open(local_name, 'r') as f:
                return json.loads(f.read())
        finally:
            # remove the local copy even when reading fails
            os.remove(local_name)
    
    # KEY-VALUE dictionary (i.e Textract FORMS)
    forms_dictionary = _load_temp_json('Temp/X17A5-FORMS.json', 'temp1.json')
    
    # TEXT-Confidence dictionary
    text_dictionary = _load_temp_json('Temp/X17A5-TEXT.json', 'temp2.json')
    
    # errors derived from Textract
    error_dictionary = _load_temp_json('Temp/ERROR-TEXTRACT.json', 'temp3.json')
    
    
    # ===========================================================================
    # Perform Textract analysis on PDFs and PNGs
    # ===========================================================================
    
    # e.g. ['Input/X-17A-5-PDF-SUBSETS/42352-2012-02-29-subset.pdf'] otherwise pdf_files (full sample)
    
    select_sample = ['Input/X-17A-5-PDF-SUBSETS/782124-2014-03-05-subset.pdf', 
                     'Input/X-17A-5-PDF-SUBSETS/853784-2003-02-28-subset.pdf',
                     'Input/X-17A-5-PDF-SUBSETS/853784-2004-03-01-subset.pdf',
                     'Input/X-17A-5-PDF-SUBSETS/853784-2005-02-28-subset.pdf']

    for pdf_paths in select_sample:
        
        # baseFile name to name export .csv file e.g. 1224385-2004-03-01.csv
        basefile = pdf_paths.split('/')[-1].split('-subset')[0]
        fileName = basefile + '.csv'
        print('\nPerforming OCR for {}'.format(fileName))
        
        # if file is not found in directory (or a re-run is forced) we extract the balance sheet
        # WE LOOK TO AVOID RE-RUNNING OLD TEXTRACT PARSES TO SAVE TIME
        if force_rerun or (output_pdf_folder + fileName not in output_pdf_csvs):
            
            # run Textract OCR job and extract the parsed data 
            png_paths = data_png_folder + basefile + '/'
            df1, df2, forms_data, text_data, error = textractParse(pdf_paths, png_paths, bucket)

            # if no error is reported we save FORMS, TEXT, DataFrame
            if error is None:

                # store accompanying information for JSONs
                forms_dictionary[basefile] = forms_data
                text_dictionary[basefile]  = text_data
                print(text_data)
                
                # writing data frame to .csv file
                df1.to_csv(fileName, index=False)

                # save contents to AWS S3 bucket
                with open(fileName, 'rb') as data:
                    s3.put_object(Bucket=bucket, Key=output_pdf_folder + fileName, Body=data)
                
                # writing data frame to .csv file extracted from PNG
                # (the same local file name is reused, the PDF version is already uploaded)
                if df2 is not None:
                    df2.to_csv(fileName, index=False)
                    
                    with open(fileName, 'rb') as data:
                        s3.put_object(Bucket=bucket, Key=output_png_folder + fileName, Body=data)
    
                # remove local file after it has been created
                os.remove(fileName)

                print('-----------------------------------------------------')
                print('Saved {} file to s3 bucket'.format(fileName))
            
            else:
                error_dictionary[basefile] = error
                
        else:
            print('{} has been downloaded'.format(fileName))
    
    
    # ===========================================================================
    # Save JSON files for updated figures (FORM, TEXT, ERROR)
    # ===========================================================================
    
    def _save_temp_json(payload, local_name, key):
        """Write `payload` to a local JSON file, upload it to s3 under `key`, then remove the local file."""
        # the with-statement closes the file, no explicit close() is needed
        with open(local_name, 'w') as file:
            json.dump(payload, file)
        
        # save contents to AWS S3 bucket
        with open(local_name, 'rb') as data:
            s3.upload_fileobj(data, bucket, key)
        
        # remove local file for JSON
        os.remove(local_name)
    
    # JSON file for FORMS
    _save_temp_json(forms_dictionary, 'X17A5-FORMS.json', 'Temp/X17A5-FORMS.json')
    
    # JSON file for TEXT
    _save_temp_json(text_dictionary, 'X17A5-TEXT.json', 'Temp/X17A5-TEXT.json')
    
    # JSON file for ERRORS
    _save_temp_json(error_dictionary, 'ERROR-TEXTRACT.json', 'Temp/ERROR-TEXTRACT.json')
        


Performing OCR for 782124-2014-03-05.csv
Started job with id: 1cee7d490799b332da5c97f7bbb5fefe87c96b40c71666bb79706672be50f79a
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page recieved: 4
Resultset page recieved: 5
Resultset page recieved: 6
Resultset page recieved: 7
Resultset page recieved: 8
Resultset page recieved: 9
Resultset page recieved: 10
Started job with id: 298809202ce194137671098d2d02334af15633aa47490662480e3b0b37f4d494
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1

Textract-PDF dataframe
                                                    0  \
0                                              Assets   
1                                                Cash   
2   Cash and securities segregate

-----------------------------------------------------
Saved 782124-2014-03-05.csv file to s3 bucket

Performing OCR for 853784-2003-02-28.csv
Started job with id: 6f7d6f20036544686c55cc7d3825af8ed3c9e7a49464a4d5069bf07fdfe123f4
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job 

Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page recieved: 4
Resultset page recieved: 5
Resultset page recieved: 6
Resultset page recieved: 7
Resultset page recieved: 8
Started job with id: 4bee2f158049470ec69292dcf0eec269a12eb1f11028f505b769def187fdf3d7
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1

Textract-PDF dataframe
                                                    0             1
0                LIABILITIES AND STOCKHOLDER'S EQUITY              
1                                        Liabilities:              
2                               Short term borrowings     $ 167,507
3      Securities sold under agreements to repurchase     5,763,412
4   Securities sold, not yet purchased, at market ...       443,056
5  

-----------------------------------------------------
Saved 853784-2005-02-28.csv file to s3 bucket


In [41]:
# # single reading for testing purposes and debugging Textract results
# # e.g. 853784-2002-03-01
# textractParse('Input/X-17A-5-PDF-SUBSETS/853784-2003-02-28-subset.pdf', 
#               'Input/X-17A-5-PNG-SUBSETS/853784-2003-02-28/', 'ran-s3-systemic-risk')