In [1]:
# Run once on a fresh instance to install the required libraries (restart the kernel afterwards)
%pip install smart_open minecart textract-trp

Note: you may need to restart the kernel to use updated packages.


In [2]:
import time 
import re
import os
import trp
import boto3
import minecart
import json
import logging 

import numpy as np
import pandas as pd

from smart_open import open
from sagemaker.session import Session

# AWS Asynchronous Textract Script (requesting Job)
**Content adapted from the Amazon AWS Textract code-samples repository (see the [source script](https://github.com/aws-samples/amazon-textract-code-samples/blob/master/python/12-pdf-text.py)).** 

In [3]:
def startJob(s3BucketName:str, objectName:str) -> str:
    """
    Submits a document-analysis job to AWS Textract for a file stored in S3
    ------------------------------------------------------------------------------------------
    Input
        :param s3BucketName: (type str)
            Name of the S3 bucket that holds the document
        :param objectName: (type str)
            Key (file name) of the document inside the bucket

    Output
        :return: type str
            The "JobId" field of the Textract start response
    """
    textract_client = boto3.client('textract')

    # the document is referenced in place on S3
    document_location = {'S3Object': {'Bucket': s3BucketName, 'Name': objectName}}

    # request both FORMS (key-values) and TABLES from the OCR
    job = textract_client.start_document_analysis(
        DocumentLocation=document_location,
        FeatureTypes=['FORMS', 'TABLES'],
    )

    return job["JobId"]

In [4]:
def isJobComplete(jobId:str) -> str:
    """
    Polls a Textract job until it is no longer "IN_PROGRESS"
    ------------------------------------------------------------------------------------------
    Input
        :param jobId: (type str)
            Identifier returned when the Textract job was started

    Output
        :return: type str
            The first "JobStatus" value observed that is not "IN_PROGRESS"
    """
    # short pause before the first status request
    time.sleep(1)
    textract_client = boto3.client('textract')

    while True:
        job_status = textract_client.get_document_analysis(JobId=jobId)["JobStatus"]
        print("Job status: {}".format(job_status))

        # any status other than IN_PROGRESS ends the polling loop
        if job_status != "IN_PROGRESS":
            return job_status

        # wait 5 seconds between consecutive status requests
        time.sleep(5)

In [5]:
def getJobResults(jobId:str) -> list:
    """
    Collects every result page of a Textract job by following "NextToken" pointers
    ------------------------------------------------------------------------------------------
    Input
        :param jobId: (type str)
            Identifier of the Textract job whose results are fetched

    Output
        :return: type list
            The raw get_document_analysis responses, one per result page, in retrieval order
    """
    textract_client = boto3.client('textract')
    collected = []
    token = None

    while True:
        # the first request carries no token; later requests continue from the previous page
        request = {'JobId': jobId}
        if token:
            request['NextToken'] = token

        page_response = textract_client.get_document_analysis(**request)
        collected.append(page_response)
        print("Resultset page recieved: {}".format(len(collected)))

        # a missing (or empty) NextToken means there are no further pages
        token = page_response.get('NextToken')
        if not token:
            break

    return collected

In [6]:
def runJob(bucket:str, key:str) -> list:
    """
    Runs a complete AWS Textract job: start it, wait for it to finish, fetch its results
    ------------------------------------------------------------------------------------------
    Input
        :param bucket: (type str)
            Name of the S3 bucket that holds the document
        :param key: (type str)
            Key (file name) of the document inside the bucket

    Output
        :return: type list
            The paginated responses returned by getJobResults. The results are returned
            whatever the final job status is, so callers should inspect
            response[0]['JobStatus'] themselves.
    """
    jobId = startJob(bucket, key)
    print("Started job with id: {}".format(jobId))

    # isJobComplete blocks until the job is no longer IN_PROGRESS. It returns the final status
    # string, which is always non-empty and therefore cannot be used as a success flag, so the
    # results are fetched unconditionally.
    isJobComplete(jobId)

    return getJobResults(jobId)

# AWS Extraction Scripts (Key-Value Pairs)
**Adapted from the AWS documentation example that extracts key-value pairs from form documents using Block objects stored in a map (see the [AWS example](https://docs.aws.amazon.com/textract/latest/dg/examples-extract-kvp.html)).**

In [7]:
def find_value_block(key_block, value_map):
    """
    Retrieving value block from AWS textract job, this contains the value text
    ------------------------------------------------------------------------------------------
    Input
        :param key_block: (type dict)
            A KEY block; its 'Relationships' entry (if any) lists related block ids by type
        :param value_map: (type dict)
            Maps block id -> VALUE block

    Output
        :return: type dict
            The VALUE block referenced by the key block. If several ids are listed the last
            one wins. An empty dictionary is returned when the key block has no
            'Relationships' entry or no relationship of type VALUE, so that callers always
            receive a block-like mapping.
    """
    # empty block used when the key has no associated value
    value_block = {}

    # a key block may carry no relationships at all
    for relationship in key_block.get('Relationships') or []:

        # only the VALUE relationship points at the value block
        if relationship['Type'] == 'VALUE':

            # normally a single id is expected; the last id seen is kept
            for value_id in relationship['Ids']:
                value_block = value_map[value_id]

    return value_block

In [8]:
def get_kv_relationship(key_map, value_map, block_map):
    """
    Builds a text -> text dictionary from the KEY and VALUE blocks of a Textract FORM result
    ------------------------------------------------------------------------------------------
    Input
        :param key_map: (type dict)
            Maps block id -> KEY block
        :param value_map: (type dict)
            Maps block id -> VALUE block
        :param block_map: (type dict)
            Maps block id -> block, used to resolve the child blocks that hold the text

    Output
        :return: type dict
            Maps the text of each key to the text of its value
            (e.g. {'Total Assets':'$ 189,232'}); a repeated key text keeps the last value seen
    """
    pairs = {}

    # the block ids themselves are not needed, only the key blocks
    for key_block in key_map.values():

        # locate the value block that belongs to this key
        matched_value = find_value_block(key_block, value_map)

        # resolve both blocks to their text
        label = get_text(key_block, block_map)
        pairs[label] = get_text(matched_value, block_map)

    return pairs

In [9]:
def get_text(result, blocks_map):
    """
    Assembles the text of a block from its CHILD blocks
    ------------------------------------------------------------------------------------------
    Input
        :param result: (type dict)
            Block whose text is wanted, e.g.
            {'Relationships' : [{'Type' : 'CHILD', 'Ids': ['e2b3b12f-ebb7-4f6e-914f-97b315672530']}]}
        :param blocks_map: (type dict)
            Maps block id -> block

    Output
        :return: type str
            Each WORD child contributes its text followed by a space, each selected
            SELECTION_ELEMENT child contributes 'X '; an empty string if nothing applies
    """
    # without relationships there is nothing to read
    if 'Relationships' not in result:
        return ''

    fragments = []

    for relationship in result['Relationships']:

        # only CHILD relationships point at the text-bearing blocks
        if relationship['Type'] != 'CHILD':
            continue

        for child_id in relationship['Ids']:
            child = blocks_map[child_id]
            kind = child['BlockType']

            if kind == 'WORD':
                # words are separated (and terminated) by a single space
                fragments.append(child['Text'] + ' ')

            elif kind == 'SELECTION_ELEMENT':
                # a ticked option button/mark is denoted with an X
                if child['SelectionStatus'] == 'SELECTED':
                    fragments.append('X ')

    return ''.join(fragments)

# OCR Wrapper Functions
**These functions run an OCR job through AWS Textract and return well-formatted data.**

In [10]:
def trp2df(table:trp.Table) -> pd.DataFrame:
    """
    Function designed to convert a trp table into a dataframe Complexity -> O(N x M)
    ------------------------------------------------------------------------------------------
    Input
        :param table: (type trp.Table)
            A trp table object parsed from a pdf using AWS Textract  
    
    Output
        :return: type pandas.DataFrame
            A DataFrame of stripped cell text with one row per table row. The number of
            columns is taken from the first row; shorter rows are padded with '' and longer
            rows are truncated. A table without rows yields an empty DataFrame.
    """
    rows = table.rows

    # a table without rows has no first row to size the matrix from
    if not rows:
        return pd.DataFrame()

    # number of columns in table (defined by the first row)
    M = len(rows[0].cells)
    arr = []

    # iterate through each row within the provided table
    for row in rows:

        # strip the text from the cell references, moving column-wise
        texts = [cell.text.strip() for cell in row.cells[:M]]

        # pad rows that hold fewer cells than the first row so the matrix stays (N x M)
        texts.extend([''] * (M - len(texts)))
        arr.append(texts)

    return pd.DataFrame(arr)

In [11]:
def readTable(response:list) -> tuple:
    """
    Function to transform AWS Textract object to a dataframe, by searching for tables
    ------------------------------------------------------------------------------------------
    Input
        :param response: (type list)
            An AWS Textract response object corresponding to pages of a given document 
    
    Output
        :return: type tuple (or None)
            A 3-element tuple (dataframe, pages, page numbers): the concatenated balance sheet
            tables, the trp page objects on which they were found, and the zero-based indices
            of those pages within the document. The tuple is returned as soon as a table with
            a liability line item has been accepted.
            None is returned when a table with more than 3 non-empty columns is encountered,
            or when the document is exhausted without accepting a liability table.
    """
    
    catDF = []          # in the event multiple tables are detected we concat them
    page_series = []    # keep track of page objects where balance sheet was flagged
    page_nums = []      # zero-based indices of the pages stored in page_series
    
    # format the Textract response type 
    doc = trp.Document(response)
    
    # iterate through document pages, tracking the zero-based page index
    for page_count, page in enumerate(doc.pages):
        
        # iterate through page tables
        for table in page.tables: 
            
            # convert trp-table into dataframe object
            df = trp2df(table)
            
            # remove columns that are completely empty
            empty_cols = [col for col in df.columns if (df[col] == '').all()]
            df = df.drop(empty_cols, axis=1)
  
            # number of columns in dataframe
            n = df.columns.size
            
            # reset the column names to 0..n-1
            df.columns = np.arange(n)
            
            ##############################################################
            #                           NOTES
            #          a good dataframe should have 2-3 columns
            ##############################################################
            
            # more than 3 columns most likely means an issue in parsing
            # NOTE(review): this aborts the whole document, not only this table - confirm intended
            if n > 3:
                return None
            
            # fewer than 2 columns cannot hold line items and values, skip the table
            if n < 2:
                continue
                
            ##############################
            # Balance Sheet Assumptions
            ##############################
            
            # first column holds the line items (e.g. Cash, Total Assets, Total Liabilities)
            lineItems = df[df.columns[0]]

            # check for the word "cash" at the beginning, ignoring case sensitivity (asset check)
            assetCheck = lineItems.str.contains('^Cash', regex=True, flags=re.IGNORECASE)

            # check for "Liabilities"/"Liability" at the start or end, ignoring case sensitivity
            debtCheck1 = lineItems.str.contains('Liabilities$|^Liabilities', regex=True, flags=re.IGNORECASE)
            debtCheck2 = lineItems.str.contains('Liability$|^Liability', regex=True, flags=re.IGNORECASE)
            
            # check for the presence of a $ sign followed by at least one character in the second column
            # this check is used to avoid reading the table of contents, which was flagged in prior reads
            # (raw string: '\$' and '\]' are regex escapes, not Python string escapes)
            dollarCheck = df[df.columns[1]].str.contains(r'\$[^\]]+', regex=True, flags=re.IGNORECASE)
            
            ##############################
            # Balance Sheet Determination
            ##############################
            
            has_line_item = (assetCheck | debtCheck1 | debtCheck2).any()   # asset or liability term found
            has_dollar = dollarCheck.any()                                 # '$' sign found
            has_liability = (debtCheck1 | debtCheck2).any()                # liability term found
            
            # if either asset term or liability term is found, with a $ sign we append the dataframe
            if has_line_item and has_dollar:
                
                # we append tables since asset and liability tables are often separate
                # there is no loss of generality if asset and liability terms are in one table
                catDF.append(df)                
                
                # keep track of pages that have been deemed as balance sheet (recorded once each)
                if page not in page_series:
                    page_series.append(page)
                    page_nums.append(page_count)
                    
                # a liability table completes the balance sheet, concat everything collected
                if has_liability:
                    return (pd.concat(catDF), page_series, page_nums)
    
    # NOTE(review): tables collected without a liability table are discarded here - confirm intended
    return None
        

In [12]:
def readPNG(pages:list, png_path:str, bucket='ran-s3-systemic-risk') -> pd.DataFrame:
    """
    Function to run AWS Textract on PNG page images and extract the balance sheet tables
    ------------------------------------------------------------------------------------------
    Input
        :param pages: (type list)
            Page indices; each index idx selects the image
            png_path + <subfolder> + '-p{idx}.png', where <subfolder> is the last
            directory name in png_path
        :param png_path: (type str)
            S3 folder (ending with '/') where the PNG files are stored
        :param bucket: (type str)
            Name of the S3 bucket that holds the PNG files
    
    Output
        :return: type pandas.DataFrame (or None)
            The concatenated balance sheet tables, returned as soon as a table with a
            liability line item has been accepted. None is returned when a table with more
            than 3 non-empty columns is encountered, or when all images are exhausted without
            accepting a liability table.
    """
    subfolder = png_path.split('/')[-2]      # subfolder where PNG files are stored
    
    # construct PNG directories with relevant pages
    textract_paths = [png_path + subfolder + '-p{}.png'.format(idx) for idx in pages]
    
    catDF = []          # in the event of multiple tables/pages we concat them
    
    # path iterates through each png image matching the page numbers found in PDFs
    for path in textract_paths:
        
        try:
            res = runJob(bucket, path)
            
            # a failed Textract job has nothing to extract, move to the next image
            if res[0]['JobStatus'] == 'FAILED':
                continue

            # format the Textract response type 
            doc = trp.Document(res)

            # iterate through document pages
            for page in doc.pages:
                
                # iterate through page tables
                for table in page.tables: 

                    # convert trp-table into dataframe object
                    df = trp2df(table)
                    
                    # remove columns that are completely empty
                    empty_cols = [col for col in df.columns if (df[col] == '').all()]
                    df = df.drop(empty_cols, axis=1)

                    # number of columns in dataframe
                    n = df.columns.size

                    # reset the column names to 0..n-1
                    df.columns = np.arange(n)
                    
                    ##############################################################
                    #                           NOTES
                    #          a good dataframe should have 2-3 columns
                    ##############################################################

                    # more than 3 columns most likely means an issue in parsing
                    # NOTE(review): this aborts the whole call, not only this table - confirm intended
                    if n > 3:
                        return None

                    # fewer than 2 columns cannot hold line items and values, skip the table
                    if n < 2:
                        continue

                    ##############################
                    # Balance Sheet Assumptions
                    ##############################

                    # first column holds the line items (e.g. Cash, Total Assets, Total Liabilities)
                    lineItems = df[df.columns[0]]

                    # check for the word "cash" at the beginning, ignoring case sensitivity 
                    assetCheck = lineItems.str.contains('^Cash', regex=True, flags=re.IGNORECASE)

                    # check for "Liabilities"/"Liability" at the start or end, ignoring case sensitivity 
                    debtCheck1 = lineItems.str.contains('Liabilities$|^Liabilities', 
                                                        regex=True, flags=re.IGNORECASE)
                    debtCheck2 = lineItems.str.contains('Liability$|^Liability', 
                                                        regex=True, flags=re.IGNORECASE)

                    # check for a $ sign followed by at least one character in the second column
                    # this check is used to avoid reading the table of contents, which was flagged in prior reads
                    # (raw string: '\$' and '\]' are regex escapes, not Python string escapes)
                    dollarCheck = df[df.columns[1]].str.contains(r'\$[^\]]+', regex=True, flags=re.IGNORECASE)

                    ##############################
                    # Balance Sheet Determination
                    ##############################

                    has_line_item = (assetCheck | debtCheck1 | debtCheck2).any()   # asset or liability term found
                    has_dollar = dollarCheck.any()                                 # '$' sign found
                    has_liability = (debtCheck1 | debtCheck2).any()                # liability term found

                    # if either asset term or liability term is found, with a $ sign we append the dataframe
                    if has_line_item and has_dollar:

                        # we append tables since asset and liability tables are often separate
                        # there is no loss of generality if asset and liability terms are in one table
                        catDF.append(df)                

                        # a liability table completes the balance sheet, concat everything collected
                        if has_liability:
                            return pd.concat(catDF)
        
        # best effort: a Textract/parsing error on one image must not stop the remaining images,
        # but it is reported instead of being silently discarded (KeyboardInterrupt still propagates)
        except Exception as exc:
            logging.warning('Textract PNG extraction failed for %s: %s', path, exc)
    
    # NOTE(review): tables collected without a liability table are discarded here - confirm intended
    return None

In [13]:
def readForm(doc_pages:list) -> dict:
    """
    Extracts the KEY -> VALUE text pairs (Textract FORMS) found on the given pages
    ------------------------------------------------------------------------------------------
    Input
        :param doc_pages: (type list)
            TRP page(s) of an AWS Textract response; every page exposes its raw blocks
            through the "blocks" attribute
    
    Output
        :return: type dict
            A python dictionary that maps KEYS (line items) with VALUES (corresponding records),
            as produced by get_kv_relationship (e.g. {'Cash and cash equivalents : $ 12,513})
    """
    # lookup tables: KEY blocks, VALUE blocks and every block, all indexed by block id
    key_map, value_map, block_map = {}, {}, {}

    for page in doc_pages:
        for block in page.blocks:

            # every block is registered so its text can be resolved later
            block_id = block['Id']
            block_map[block_id] = block

            # only key-value sets are split into KEY and VALUE maps
            if block['BlockType'] != "KEY_VALUE_SET":
                continue

            # a block tagged with the KEY entity type is a key, anything else is a value
            target = key_map if 'KEY' in block['EntityTypes'] else value_map
            target[block_id] = block

    # convert block objects to text dictionary map
    return get_kv_relationship(key_map, value_map, block_map)

In [14]:
def readText(doc_pages:list) -> dict:
    """
    Maps every LINE of text on the given pages to the confidence reported for it
    ------------------------------------------------------------------------------------------
    Input
        :param doc_pages: (type list)
            TRP page(s) of an AWS Textract response; every page exposes its raw blocks
            through the "blocks" attribute
    
    Output
        :return: type dict
            A python dictionary that maps TEXT (line items) with the corresponding confidence
            (e.g. {'Cash and cash equivalents : 99.97891}); when the same text occurs more
            than once the confidence of the last occurrence is kept
    """
    # walk pages in order, then blocks in order, keeping only blocks of type LINE
    return {
        block['Text']: block['Confidence']
        for page in doc_pages
        for block in page.blocks
        if block['BlockType'] == "LINE"
    }

## Extract Balance Sheet information

In [15]:
def textractParse(pdf_path:str, png_path:str, bucket:str) -> tuple:
    """
    Function runs a Textract job on a PDF (and on the matching PNG pages) and returns the
    extracted Balance Sheet information
    ------------------------------------------------------------------------------------------
    Input
        :param pdf_path: (type str)
            S3 key of the PDF to analyse
        :param png_path: (type str)
            S3 folder (ending with '/') holding the PNG page images of the same filing
        :param bucket: (type str)
            Name of the S3 bucket holding both the PDF and the PNG files

    Output
        :return: type tuple
            A 5-element tuple (df1, df2, forms_data, text_data, error)
              - df1        : balance sheet dataframe read from the PDF
              - df2        : balance sheet dataframe read from the PNG page(s), may be None
              - forms_data : KEY -> VALUE dictionary for the balance sheet page(s)
              - text_data  : TEXT -> confidence dictionary for the balance sheet page(s)
              - error      : None on success, otherwise a message (all other entries are None)
    """
    # run the Textract job on the PDF
    res = runJob(bucket, pdf_path)
    
    # a failed Textract job cannot be parsed
    if res[0]['JobStatus'] == 'FAILED':
        return (None, None, None, None, 'Could not parse, JOB FAILED')

    # perform OCR and return balance sheet with corresponding page object(s)
    tb_response = readTable(res)           
    
    # readTable returns None when no balance sheet is found or the table could not be parsed
    if not isinstance(tb_response, tuple):
        return (None, None, None, None, 'No Balance Sheet found, or parsing error')
        
    # deconstruct the table response tuple into dataframe and page object parts
    df1, page_obj, page_num = tb_response
    print('\nPage number(s) for extraction in PNG are {}\n'.format(page_num))
    
    # try to extract from a PNG stored in the same bucket (we can still get a None here)
    df2 = readPNG(page_num, png_path, bucket=bucket)
    
    # provided balance sheet page objects we select FORM and TEXT data
    forms_data = readForm(page_obj)      
    text_data = readText(page_obj)        
    
    print('\nTextract-PDF dataframe')
    print(df1)
    
    print('\nTextract-PNG dataframe')
    print(df2)
    
    return (df1, df2, forms_data, text_data, None)

## Main File Execution

In [17]:
def _load_temp_json(s3_client, bucket_name, key, available_keys, local_name):
    """
    Download a JSON checkpoint from S3 and return its parsed contents.

    An empty dictionary is returned when ``key`` is not among ``available_keys``.
    The local scratch copy ``local_name`` is removed once it has been read.
    """
    if key not in available_keys:
        return {}

    s3_client.download_file(bucket_name, key, local_name)
    try:
        with open(local_name, 'r') as f:
            return json.load(f)
    finally:
        # the local copy is only a scratch file
        os.remove(local_name)


def _save_temp_json(s3_client, bucket_name, payload, local_name, key):
    """
    Write ``payload`` to the local JSON file ``local_name``, upload it to S3 under ``key``
    and remove the local copy.
    """
    with open(local_name, 'w') as f:
        json.dump(payload, f)

    try:
        with open(local_name, 'rb') as data:
            s3_client.upload_fileobj(data, bucket_name, key)
    finally:
        # the local copy is only a scratch file
        os.remove(local_name)


if __name__ == "__main__":

    # Amazon Textract client and Sagemaker session
    textract = boto3.client('textract')
    s3 = boto3.client('s3')
    session = Session()
    
    # initiate s3 bucket and corresponding data/output folder
    bucket = 'ran-s3-systemic-risk'
    
    data_png_folder = 'Input/X-17A-5-PNG-SUBSETS/'
    data_pdf_folder = 'Input/X-17A-5-PDF-SUBSETS/'
    
    output_png_folder = 'Output/X-17A-5-PNG-RAW/'
    output_pdf_folder = 'Output/X-17A-5-PDF-RAW/'
    
    temp_folder = 'Temp/'
    
    # csv directory where we store balance sheet information 
    output_png_csvs = np.array(session.list_s3_files(bucket, output_png_folder))
    output_pdf_csvs = np.array(session.list_s3_files(bucket, output_pdf_folder))
    
    # temp directory where JSON files are stored
    temp = np.array(session.list_s3_files(bucket, temp_folder))
    
    # pdf/png directories where we store the broker-dealer information 
    # NOTE(review): [1:] presumably skips the folder placeholder key listed first - confirm
    pdf_files = np.array(session.list_s3_files(bucket, data_pdf_folder))[1:]
    png_files = np.array(session.list_s3_files(bucket, data_png_folder))[1:]
    png_file_directory = list(set((map(lambda x: '/'.join(x.split('/')[:-1]), png_files))))
    
    # ===========================================================================
    # Load in Temp JSON files if present (FORM, TEXT, ERROR)
    # ===========================================================================
    
    # KEY-VALUE dictionary (i.e Textract FORMS)
    forms_dictionary = _load_temp_json(s3, bucket, 'Temp/X17A5-FORMS.json', temp, 'temp1.json')
    
    # TEXT-Confidence dictionary
    text_dictionary = _load_temp_json(s3, bucket, 'Temp/X17A5-TEXT.json', temp, 'temp2.json')
    
    # errors derived from Textract
    error_dictionary = _load_temp_json(s3, bucket, 'Temp/ERROR-TEXTRACT.json', temp, 'temp3.json')
    
    # ===========================================================================
    # Perform Textract analysis on PDFs and PNGs
    # ===========================================================================
    
    # optional subset of filings; iterate over select_sample instead of pdf_files to use it
    # e.g. ['Input/X-17A-5-PDF-SUBSETS/42352-2012-02-29-subset.pdf'] otherwise pdf_files (full sample)
    select_sample = ['Input/X-17A-5-PDF-SUBSETS/1146184-2010-02-25-subset.pdf',
                     'Input/X-17A-5-PDF-SUBSETS/1616344-2021-02-25-subset.pdf',
                     'Input/X-17A-5-PDF-SUBSETS/1146184-2011-02-25-subset.pdf',
                     'Input/X-17A-5-PDF-SUBSETS/1591458-2021-02-26-subset.pdf']

    # when True every filing is parsed again even if its .csv already exists in the output folder
    # set to False to skip old Textract parses and save time
    force_rerun = True

    for pdf_paths in pdf_files:
        
        # baseFile name to name export .csv file e.g. 1224385-2004-03-01.csv
        basefile = pdf_paths.split('/')[-1].split('-subset')[0]
        fileName = basefile + '.csv'
        print('\nPerforming OCR for {}'.format(fileName))
        
        # skip filings that have already been parsed, unless a re-run is forced
        already_parsed = output_pdf_folder + fileName in output_pdf_csvs
        if already_parsed and not force_rerun:
            print('{} has been downloaded'.format(fileName))
            continue
            
        # run Textract OCR job and extract the parsed data 
        png_paths = data_png_folder + basefile + '/'
        df1, df2, forms_data, text_data, error = textractParse(pdf_paths, png_paths, bucket)

        # a reported error is recorded and nothing is saved for this filing
        if error is not None:
            error_dictionary[basefile] = error
            continue

        # store accompanying information for JSONs
        forms_dictionary[basefile] = forms_data
        text_dictionary[basefile]  = text_data
        print(text_data)
        
        try:
            # writing data frame to .csv file
            df1.to_csv(fileName, index=False)

            # save contents to AWS S3 bucket
            with open(fileName, 'rb') as data:
                s3.put_object(Bucket=bucket, Key=output_pdf_folder + fileName, Body=data)
            
            # writing data frame extracted from PNG to .csv file (the local file is reused)
            if df2 is not None:
                df2.to_csv(fileName, index=False)
                
                with open(fileName, 'rb') as data:
                    s3.put_object(Bucket=bucket, Key=output_png_folder + fileName, Body=data)
        finally:
            # remove local file after it has been created, even if an upload failed
            if os.path.exists(fileName):
                os.remove(fileName)

        print('-----------------------------------------------------')
        print('Saved {} file to s3 bucket'.format(fileName))
    
    # ===========================================================================
    # Save JSON files for updated figures (FORM, TEXT, ERROR)
    # ===========================================================================
    
    # FORMS (KEY-VALUE pairs)
    _save_temp_json(s3, bucket, forms_dictionary, 'X17A5-FORMS.json', 'Temp/X17A5-FORMS.json')
    
    # TEXT (TEXT-Confidence pairs)
    _save_temp_json(s3, bucket, text_dictionary, 'X17A5-TEXT.json', 'Temp/X17A5-TEXT.json')
    
    # ERRORS reported by Textract parsing
    _save_temp_json(s3, bucket, error_dictionary, 'ERROR-TEXTRACT.json', 'Temp/ERROR-TEXTRACT.json')
        


Performing OCR for 1146184-2010-02-25.csv
Started job with id: 5c348ad63afc3c8a54d1bc633880a612743edf0b43168a7b965eb6ee3be2983c
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page recieved: 4
Resultset page recieved: 5
Resultset page recieved: 6
Resultset page recieved: 7

Page number(s) for extraction in PNG are [4]

Started job with id: 77b42522c6726e17e4cbd2f90f41817498b17adb41f421298df94cca2a3cad20
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1

Textract-PDF dataframe
                                                    0             1
0                                 Securities borro

{'CITADEL CLEARING LLC': 99.774169921875, 'Statement of Financial Condition': 99.77372741699219, '(Expressed in U.S. dollars in thousands)': 99.93649291992188, 'ASSETS': 99.43419647216797, 'As of December 31, 2020': 99.84294128417969, 'Assets:': 99.95867919921875, 'Cash': 99.85984802246094, '$': 99.85413360595703, '634,908': 99.36953735351562, 'Cash segregated under federal regulation': 99.96155548095703, '27,100': 99.35118103027344, 'Collateral held under securities loan agreements': 99.89599609375, '8,021,562': 99.48047637939453, 'Securities borrowed': 99.8678207397461, '6,768,574': 99.2337417602539, 'Receivable from clearing organizations': 99.95647430419922, '91,668': 99.58922576904297, 'Other assets': 99.96562957763672, '9,966': 99.79956817626953, 'Receivable from affiliated customer': 99.9398422241211, '2,151': 99.7018814086914, 'Total assets': 99.92121887207031, '15,555,929': 99.23884582519531, "LIABILITIES AND MEMBER'S CAPITAL": 99.71047973632812, 'Liabilities:': 98.85961151123

Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1

Textract-PDF dataframe
                                                    0                 1
0                                                Cash      $ 24,015,210
1   Securities purchased under agreements to resel...    15,297,395,866
2                                 Securities borrowed        40,806,249
3   Receivables from brokers and clearing organiza...       126,718,377
4                     Securities owned, at fair value            33,769
5                                   Other receivables            33,946
6                       Prepaid expenses and deposits           135,795
7   Property and equipment, net of accumulated dep...           156,142
8                                        Total assets  $ 15,489,295,354
9                     Liabilities and Member's Equity          

-----------------------------------------------------
Saved 1261467-2006-03-01.csv file to s3 bucket

Performing OCR for 1215680-2016-06-16.csv
Started job with id: de6861c02e3154221cb994feaae1732fd0fbb10a1e9642a1d1eb63297c313e93
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page recieved: 4
Resultset page recieved: 5
Resultset page recieved: 6

Page number(s) for extraction in PNG are [0]

Started job with id: afa46975ad01c57d9e45d097782b9ea7df6b082a6505dead3a7d487b74d2d6fd
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1

Textract-PDF dataframe
                                            

Started job with id: 970c83908de162a13723b2acb9ce01c276c6dfdbbbe21265ed0ac800703cef1b
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page recieved: 4

Page number(s) for extraction in PNG are [4]

Started job with id: 2dc2a15ad054d1fde00e9ed0c2923d36615a45c42553deaba65f49efee5a784c
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1

Textract-PDF dataframe
                                       0          1
0              Cash and cash equivalents  $ 224,907
1                       Prepaid expenses     43,135
2                               Deposits      1,732
3                           Total assets  $ 269,774
4         LIABILITIES AND MEMBERS EQUITY           
5                            LIABILITIES           
6                       Accounts payable   $ 1

Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page recieved: 4
Resultset page recieved: 5
Resultset page recieved: 6
Resultset page recieved: 7

Page number(s) for extraction in PNG are [3]

Started job with id: 0f85164eaf9a0400d942c7a70c4cf229d57b3e95f90628e913afe50a5fac0097
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1

Textract-PDF dataframe
                                                    0             1
0                                                Cash   $ 5,485,459
1   Cash and investments segregated on deposit for...    10,809,251
2   Receivables from brokers, dealers, and clearin...     3,050,607
3                          Receivables from customers       813,069
4                          

-----------------------------------------------------
Saved 1215680-2019-03-01.csv file to s3 bucket

Performing OCR for 1146184-2012-02-28.csv
Started job with id: c1dc10da2f2b79b5e9d902f35a5a958b5826458d1081cd1394501fdda606c7f5
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page recieved: 4
Resultset page recieved: 5
Resultset page recieved: 6
Resultset page recieved: 7
Resultset page recieved: 8

Page number(s) for extraction in PNG are [2]

Started job with id: ee7b0f6342315ae30db879336667d352f7665ed250f8d4f76d5163740f00041d
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1

Textract-PDF dataframe
                                                    0             1
0                                         

Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1

Textract-PDF dataframe
                                                    0                1
0                                                Cash        $ 112,661
1   Receivable and deposits with clearing organiza...        6,492,440
2   Financial instruments owned, at market value (...      153,421,139
3     Securities purchased under agreements to resell    5,609,789,066
4                                   Fixed assets, net            3,244
5                                        Other assets          293,794
6                                        Total assets  $ 5,770,112,344
7                     Liabilities and Member's Equity                 
8                                         Liabilities                 
9      Securities sold under agreements to repurchase  $ 5,617,167,013
10                   Payable to clearing organization       10,785,477
11                                   Accrued expens

-----------------------------------------------------
Saved 1146184-2006-03-01.csv file to s3 bucket

Performing OCR for 1215680-2005-03-01.csv
Started job with id: 651f738133384f80370ca32a8f4c26fd45daa9b37487e33d5611f0969126dfb3
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page recieved: 4

Page number(s) for extraction in PNG are [5]

Started job with id: bf72e9e90d9742cfd8d05b67a0d4a2abb389c6f87b8d54a6652b6bbd44f86a48
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1

Textract-PDF dataframe
                                                    0                1
0                           Cash and cash equivalents      $ 3,633,603
1     Securities purchased u

Started job with id: e180f124d4703fea87ae9f6b0ee2444076f7c96e5741e45c5238239e1c85e55c
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page recieved: 4
Resultset page recieved: 5
Resultset page recieved: 6

Page number(s) for extraction in PNG are [4]

Started job with id: 19ac73bc1c6e4bfc92dedb7fed331f4d2c08103bdbdfdf60a0f1cfbfa2cb4f5d
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1

Textract-PDF dataframe
                    

Started job with id: 33831c0d90265612ea0cba28c9c803fa098d9cc3b2a46bbbd4f910043254e14e
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page recieved: 4
Resultset page recieved: 5
Resultset page recieved: 6
Resultset page recieved: 7
Resultset page recieved: 8

Page number(s) for extraction in PNG are [3]

Started job with id: 1c30079f51b0d770410f2a8aab6f8903faad84c222e2920b7f0dae46ad26efca
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1

Textract-PDF dataframe
                                                    0            1
0                                                Cash    $ 715,745


Resultset page recieved: 4
Resultset page recieved: 5
Resultset page recieved: 6

Page number(s) for extraction in PNG are [5]

Started job with id: 5b0d85c0a32928f90d1c7bc5511f22ce22d84ed43616490a36945fd598d0ec4f
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1

Textract-PDF dataframe
                                                    0                 1
0                           Cash and cash equivalents       $ 4,052,271
1                       Reverse repurchase agreements    14,032,433,910
2   Receivables from brokers and clearing organiza...        67,950,301
3                       Prepaid expenses and deposits            29,713
4   Property and equipment, net of accumulated dep...            36,396
5                                        Total assets  $ 14,104,502,591
6                     Liabilities and Member's Equity                  
7                                         LIABILITIES                  
8                

-----------------------------------------------------
Saved 1146184-2004-03-01.csv file to s3 bucket

Performing OCR for 1261467-2014-03-04.csv
Started job with id: 82bb6735bbdb812725ca5f3007ea0f0b7bc38335b495830734ee6537c9d9db65
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page recieved: 4
Resultset page recieved: 5
Resultset page recieved: 6
Resultset page recieved: 7

Page number(s) for extraction in PNG are [4]

Started job with id: 35cd0e04d30634b43a672865cac57e9ec1868dbe3051a77af53693d9b121a69b
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1

Textract-PDF dat

Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page recieved: 4
Resultset page recieved: 5

Page number(s) for extraction in PNG are [7]

Started job with id: b2c4aa94aa656979d377bd0134982adac10aec65b30a9465eb896886896b2ed6
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1

Textract-PDF dataframe
                                                    0                1
0                                                Cash        $ 387,428
1   Receivable and deposits with clearing organiza...        1,450,477
2     Securities purchased under agreements to resell    2,025,027,514
3                

-----------------------------------------------------
Saved 1591458-2019-02-28.csv file to s3 bucket

Performing OCR for 1146184-2019-02-28.csv
Started job with id: 7615ad2fe1040d198ca43b0d27db57ab7093ce1c06435c2a2a189bbe4988cc89
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page recieved: 4
Resultset page recieved: 5
Resultset page recieved: 6
Resultset page recieved: 7
Resultset page recieved: 8
Resultset page recieved: 9

Page number(s) for extraction in PNG are [7]

Started job with id: 6d5ab69e7285b07ae16247e8422888e430c115d040a6e642e05df94a2af87c4d
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1

Textract-P

-----------------------------------------------------
Saved 1261467-2005-03-08.csv file to s3 bucket

Performing OCR for 1215680-2020-03-02.csv
Started job with id: a7e3e1d9cb60a1b7426088fe8396e768aa1d1538cf3bfbc6d0a3a8998edb680c
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page recieved: 4
Resultset page recieved: 5

Page number(s) for extraction in PNG are [0]

Started job with id: 5ac4ad1436f5b7cb523c555264a67416408f5b920b5dfa04c48dc87ea4878e95
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Re

-----------------------------------------------------
Saved 1591458-2020-02-28.csv file to s3 bucket

Performing OCR for 1146184-2009-03-02.csv
Started job with id: 74d93bbc5040b58e913b6ace67f4d62d27990cebcc568c11738580da94037ac2
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page recieved: 4
Resultset page recieved: 5
Resultset page recieved: 6

Page number(s) for extraction in PNG are [6]

Started job with id: 18d488218ce72b34f22dddf33b6fd1e3a4acd504693a241690e9bf0d75038fe9
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1

Textract-PDF dataframe
                                                    0             1
0                       

-----------------------------------------------------
Saved 1261467-2011-03-01.csv file to s3 bucket

Performing OCR for 1146184-2008-02-29.csv
Started job with id: 5a5d4e3598d52faa9cd9d89cf98a5eb64660d9217f0e1a2d505a000f46334663
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page recieved: 4
Resultset page recieved: 5
Resultset page recieved: 6
Resultset page recieved: 7

Page number(s) for extraction in PNG are [3]

Started job with id: 5cb0f391224dd0d946c9b5e49930f9d0ad5ce9c2b4417f175a983671da96db4a
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1

Textract-PDF dataframe
                                                    0            1
0                           Cash and cash equi

-----------------------------------------------------
Saved 1146184-2020-02-28.csv file to s3 bucket

Performing OCR for 1215680-2007-03-01.csv
Started job with id: 2e9f98d76dd7c978269913a1a1bcf8ce2aa2743b5d712a99063985639272cc7b
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page recieved: 4

Page number(s) for extraction in PNG are [5]

Started job with id: e486c74877c20a432526044bdd88119797722a761b0ff8efc29b4885f55ab6fc
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1

Textract-PDF dataframe
                                                  0                1
0                                              Cash        $ 744,210
1       

-----------------------------------------------------
Saved 1215680-2013-03-01.csv file to s3 bucket

Performing OCR for 1146184-2018-02-27.csv
Started job with id: c0bd3905fc2e5fbec091b2fadadce7389a7fd9c18d698b585c882819298eff7e
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page recieved: 4
Resultset page recieved: 5
Resultset page recieved: 6
Resultset page recieved: 7
Resultset page recieved: 8

Page number(s) for extraction in PNG are [7]

Started job with id: e709d57f329ce955cf792a8aa802d32bf2d061dfbd7558612fa1211b27cb8aa2
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_

-----------------------------------------------------
Saved 1146184-2021-02-25.csv file to s3 bucket

Performing OCR for 1215680-2004-03-01.csv
Started job with id: 441c7a3e037c65f71f0ff836a40fae8487f967c90214bdf862760025896f942b
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3

Page number(s) for extraction in PNG are [5]

Started job with id: 72efa81477a64e1e28428f7f632f3c2da64eba3637d4a5dfdf9e30a1c334b97d
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1

Text

-----------------------------------------------------
Saved 1215680-2015-02-27.csv file to s3 bucket

Performing OCR for 26617-2002-05-30.csv
Started job with id: 8721d985b1c1d53edc984106dd78735ae3c3afc3d2b460dad975969be0ab049a
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page recieved: 4
Resultset page recieved: 5
Resultset page recieved: 6

Page number(s) for extraction in PNG are [3]

Started job with id: 970d48fb1175a26e93ef529a021b1ce57777587cdf0ec925169becf0d057c2c8
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1

Textract-PDF dataframe
                                              

Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1

Textract-PDF dataframe
                                                    0             1
0                                                Cash     $ 319,892
1            Cash segregated under federal regulation        27,100
2                                 Securities borrowed    27,369,795
3                     Securities owned, at fair value     9,494,325
4    Collateral held under securities loan agreements     8,019,006
5   Receivable from clearing organizations and cus...       137,085
6                 Receivable from brokers and dealers         4,295
7                           Receivable for order flow        24,947
8   Exchange memberships and trading rights (fair ...        15,519
9      

-----------------------------------------------------
Saved 1261467-2012-02-29.csv file to s3 bucket

Performing OCR for 1215680-2011-03-01.csv
Started job with id: 537980f3400f7ab878d48d69b2f3c2a512d930c739b7c8529282bbbd4e115fb3
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page recieved: 4

Page number(s) for extraction in PNG are [2]

Started job with id: e1cbe9bbb70b385d3f2d2111fc5e4bb0dd1e9e15af066a974c0d0896766dde8e
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1

Textract-PDF dataframe
                                                    0                1
0                                                Cash        $ 859,762
1   Receivable and deposits with clearing organiza...       39,085,513
2   Financial instruments owned, at market value (...

Started job with id: 2144ed31e96113285f12e4815d36e5c0a4ef599986dfa1dd26e72314dadba5bb
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page recieved: 4
Resultset page recieved: 5
Resultset page recieved: 6
Resultset page recieved: 7

Page number(s) for extraction in PNG are [7]

Started job with id: 9f761b818e3b23c1f88c138e4ee60f0d9fe4e7a8057743f3875fda690616662f
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1

Textract-PDF dataframe
                 

-----------------------------------------------------
Saved 1215680-2018-03-01.csv file to s3 bucket

Performing OCR for 1215680-2017-03-07.csv
Started job with id: e6f804630fdbf932d0c584fe403559427e12462d71c0746747ce1776d8d806f0
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page recieved: 4
Resultset page recieved: 5

Page number(s) for extraction in PNG are [0]

Started job with id: 986c2b3e42bb216a3b9d346c3eac04b8ae5bc549ebe0599397e6cca74e4235cf
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1

Textract-PDF dataframe
                                                    0                 1
0                                              

-----------------------------------------------------
Saved 1215680-2021-03-02.csv file to s3 bucket

Performing OCR for 1261467-2017-03-01.csv
Started job with id: a3fb42c8750e722b8a82054da7abb6cb8ae197db0e85f52ec377ba19bae5833e
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page recieved: 4
Resultset page recieved: 5
Resultset page recieved: 6
Resultset page recieved: 7

Page number(s) for extraction in PNG are [3]

Started job with id: ace25748b81ce59fdacf9b8d03e660f5edb66cb987a7e40b188b06a25f510c28
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PRO

Resultset page recieved: 4
Resultset page recieved: 5
Resultset page recieved: 6

Page number(s) for extraction in PNG are [3]

Started job with id: ac587208169e24f5461e1ec47c098d32b832eb80cced116910bae0e3f27051c2
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1

Textract-PDF dataframe
                                                    0             1
0            Cash segregated under federal regulation        28,100
1                                 Securities borrowed    29,221,122
2    Collateral held under securities loan agreements     3,733,129
3   Receivable from brokers, dealers, and clearing...       131,461
4                Receivable from affiliated customers         1,750
5                                        Other assets         1,357
6                                        Total assets  $ 33,570,642
7                    LIABILITIES AND MEMBER'S CAPITAL              
8                                        Liabilities:

-----------------------------------------------------
Saved 1215680-2006-03-01.csv file to s3 bucket

Performing OCR for 1215680-2012-02-29.csv
Started job with id: 993c1d406eedb8c2f27b7a1592c3d00f39927a2964aa56913b1b4dd493f345d3
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page recieved: 4

Page number(s) for extraction in PNG are [3]

Started job with id: 57c466c09388b914fe05239ba040592311128fe279abd755aeeb3deef6eb3703
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1

Textract-PDF dataframe
                                                    0                   1
0                                                Cash         $ 7,509,614
1   Receivable and deposits with clearing organiza...         245,158,555
2   Financial instruments owned, at market value (...          10,249,620
3     Securities p

-----------------------------------------------------
Saved 1146184-2017-02-24.csv file to s3 bucket

Performing OCR for 1261467-2015-03-02.csv
Started job with id: 59142cfc63881598d9cb4b457f661bae70d32d873557fa9d419c8dcde997499d
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page recieved: 4
Resultset page recieved: 5
Resultset page recieved: 6
Resultset page recieved: 7

Page number(s) for extraction in PNG are [4]

Started job with id: 09950483b79bfd4ffdb0eab554f4be781a708f433a75987192bfc5ba11e10e79
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PRO

Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page recieved: 4
Resultset page recieved: 5
Resultset page recieved: 6
Resultset page recieved: 7
Resultset page recieved: 8

Page number(s) for extraction in PNG are [0]

Started job with id: 301eb5f997dd4ce7b9306ab8c31d1e521785d24be8a15d7b4bb33e4e1c11db21
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1

Textract-PDF dataframe
                                                    0              1
0                                                Cash      $ 767,340
1             Funds segregated for regulatory purpose     10,596,292
2   Receivables from brokers, dealers, and clearin...      1,411,153
3                          Receivables from customers      1,525,314
4                          Collateralized agreements:              

Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1

Textract-PDF dataframe
                                                    0            1  \
0                                                Cash                
1             Cash segregated for regulatory purposes                
2   Receivables from brokers, dealers and clearing...                
3                          Receivables from customers                
4                      Receivables from non-customers                
5     Securities purchased under agreements to resell                
6                                 Securities borrowed                
7                     Securities owned, at fair value  $ 1,317,438   
8   Securities owned and pledged as collateral, at...    1,632,947   
9               Total securities ow

Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1

Textract-PDF dataframe
                                                    0             1
0                                                Cash   $ 5,169,774
1            Funds segregated for regulatory purposes    10,880,721
2   Receivables from brokers, dealers, and clearin...     2,697,745
3                          Receivables from customers     2,266,857
4                          Collateralized agreements:              
5      Securities purchased under agreement to resell       832,926
6                                 Securities borrowed     6,118,764
7   Securities received as collateral - at fair value     3,245,520
8         Financial instruments owned - at fair value     4,650,220
9   Financial instruments owned and pledged as col...       894,627
10  Total financial instruments owned - at fair value     5,544,847
11  Fixed Assets (net of accumulated depreciation ...    69 251

-----------------------------------------------------
Saved 1215680-2009-03-04.csv file to s3 bucket

Performing OCR for 26617-2004-05-27.csv
Started job with id: 8db7b2a369348ca6d56f06759304ecd6dd90a1e703a4700c19de14b898c83124
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page recieved: 4
Resultset page recieved: 5
Resultset page recieved: 6
Resultset page recieved: 7

Page number(s) for extraction in PNG are [4]

Started job with id: 043457b1a427b037e5669fb23c1f78a537fa60de349ba5302bad8b7fea288ba4
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGR

-----------------------------------------------------
Saved 26617-2004-05-27.csv file to s3 bucket

Performing OCR for 1616344-2020-02-27.csv
Started job with id: e9f4ee541ea31a69a42c77d49b9c7a1b5d8c6aa3db2180da64d7ef233a01ba27
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page recieved: 4
Resultset page recieved: 5
Resultset page recieved: 6
Resultset page recieved: 7

Page number(s) for extraction in PNG are [7]

Started job with id: d3957b2b1d07b5414f9bd06b4393f412f90ab63ebd9ba356606b8dae0f307a9f
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1

Textract-PDF dataframe
                                                    0             1
0                                                Cash     $ 701,467
1                                                                  
2      

-----------------------------------------------------
Saved 1261467-2018-04-13.csv file to s3 bucket

Performing OCR for 1261467-2008-02-29.csv
Started job with id: cdd51368f3ea875adf4756948605d0618c2df7e526f2b30a60280e6c314cc221
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page recieved: 4
Resultset page recieved: 5
Resultset page recieved: 6

Page number(s) for extraction in PNG are [5]

Started job with id: 55d9eedf72682acd788bdefe0d3f330dbd6d67d1f1f0d223908de872f63d0a

-----------------------------------------------------
Saved 1261467-2008-02-29.csv file to s3 bucket

Performing OCR for 1146184-2016-02-24.csv
Started job with id: 5d890f1884c8bba35d6bf7ba62b3f2131b1fbea826a0ca84f7766a4d1e1032a8
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page recieved: 4
Resultset page recieved: 5
Resultset page recieved: 6
Resultset page recieved: 7

Page number(s) for extraction in PNG are [3]

Started job with id: 7a985912acd87a63a077eb4130cc20c29117b0e4fc18620d4324549a30b69ee9
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1

Textract-PDF dataframe
                                                    0            1
0                                             

-----------------------------------------------------
Saved 1215680-2014-02-28.csv file to s3 bucket

Performing OCR for 1616344-2019-02-28.csv
Started job with id: 286d83cd443ddd39d6d0d84540243ea1a98f6cda4489c1fdefff2086b8702d8c
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page recieved: 4
Resultset page recieved: 5
Resultset page recieved: 6
Resultset page recieved: 7

Page number(s) for extraction in PNG are [8]

Started job with id: 9a8ddd806dee38f6dce1ceb38df4f2fe43e1d202ae1dc40b6ece5ac3348991e2
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1

Textract-PDF dataframe
                 

In [18]:
# Single-document run for testing and debugging Textract results (e.g. 853784-2002-03-01); uncomment the call below to use it.
# textractParse('Input/X-17A-5-PDF-SUBSETS/1146184-2004-03-01-subset.pdf', 
#               'Input/X-17A-5-PNG-SUBSETS/1146184-2004-03-01/', 'ran-s3-systemic-risk')