In [1]:
%%bash
pip install --upgrade pip
pip install smart_open minecart
pip install textract-trp

Collecting pip
  Using cached pip-21.0.1-py3-none-any.whl (1.5 MB)
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 20.3.3
    Uninstalling pip-20.3.3:
      Successfully uninstalled pip-20.3.3
Successfully installed pip-21.0.1
Collecting smart_open
  Downloading smart_open-4.1.2-py3-none-any.whl (111 kB)
Collecting minecart
  Downloading minecart-0.3.0-py3-none-any.whl (23 kB)
Collecting pdfminer3k
  Downloading pdfminer3k-1.3.4-py3-none-any.whl (100 kB)
Installing collected packages: pdfminer3k, smart-open, minecart
Successfully installed minecart-0.3.0 pdfminer3k-1.3.4 smart-open-4.1.2
Collecting textract-trp
  Downloading textract_trp-0.1.3-py3-none-any.whl (5.8 kB)
Installing collected packages: textract-trp
Successfully installed textract-trp-0.1.3


In [2]:
import time 
import re
import os
import trp
import boto3
import minecart
import logging 

import numpy as np
import pandas as pd

from smart_open import open
from sagemaker.session import Session

In [3]:
# initiate s3 bucket and corresponding data folder
bucket = "ran-s3-systemic-risk"
data_folder ="Input/X-17A-5-Subsets/"

# script to perform OCR (using Textract) for X-17A-5 subsets
out_folder = 'Output/X-17A-5-BS/'

# Amazon Textract client and Sagemaker session
textract = boto3.client('textract')
s3 = boto3.client('s3')
session = Session()

# AWS Asynchronous Textract Script (requesting Job)
**Content modified from Amazon AWS Textract repository (refer to [URL](https://github.com/aws-samples/amazon-textract-code-samples/blob/master/python/12-pdf-text.py) below)** 

In [4]:
def startJob(s3BucketName:str, objectName:str) -> str:
    """
    Starts a Textract job on AWS server 
    """
    response = None
    client = boto3.client('textract')
    
    # issue response to AWS to start Textract job for table analysis 
    response = client.start_document_analysis(
        DocumentLocation={
            'S3Object': {
                'Bucket': s3BucketName,
                'Name': objectName
            }
        },
        FeatureTypes=['TABLES']    # selecting tables from the OCR
    )
    
    # return response job ID for service
    return response["JobId"]

In [5]:
def isJobComplete(jobId:str) -> str:
    """
    Tracks the completion status of the Textract job when qued
    """
    time.sleep(1)
    client = boto3.client('textract')
    response = client.get_document_analysis(JobId=jobId)
    status = response["JobStatus"]
    print("Job status: {}".format(status))
    
    # check current status of AWS job (ask server every 5 seconds for data)
    while(status == "IN_PROGRESS"):
        time.sleep(5)                   # lag before reporting status
        response = client.get_document_analysis(JobId=jobId)
        status = response["JobStatus"]
        print("Job status: {}".format(status))
    
    return status

In [6]:
def getJobResults(jobId:str) -> list:
    """
    Returns the contents of the Textract job, after completion status met
    """
    pages = []          # initialize list object to track pages

    client = boto3.client('textract')
    response = client.get_document_analysis(JobId=jobId)
    
    pages.append(response)
    print("Resultset page recieved: {}".format(len(pages)))
    
    nextToken = None
    if('NextToken' in response):
        nextToken = response['NextToken']
    
    # iterate through the pages and append to response figure
    while(nextToken):
        response = client.get_document_analysis(JobId=jobId, NextToken=nextToken)
        pages.append(response)
        print("Resultset page recieved: {}".format(len(pages)))
        nextToken = None
        if('NextToken' in response):
            nextToken = response['NextToken']

    return pages

In [7]:
def runJob(bucket:str, key:str):
    """
    Function designed to call an AWS Textract 
    """
    # S3 storage for files on AWS site   
    jobId = startJob(bucket, key)   # intialize Textract job 
    print("Started job with id: {}".format(jobId))

    # if job is complete from AWS return response object 
    if(isJobComplete(jobId)):
        response = getJobResults(jobId)
        
    return response

# OCR Wrapper Functions
**The scripts perform an OCR job from AWS Textract, converting tabular data into dataframes**

In [8]:
def trp2df(table:trp.Table) -> pd.DataFrame:
    """
    Function designed to convert a trp table into a dataframe
    :param table: a trp table object parsed from a pdf  
    :return: a DataFrame object housing a textracted trp table
    
    Complexity -> O(n^2) approx.
    """
    N = len(table.rows)               # number of rows in table
    M = len(table.rows[0].cells)      # number of columns in table
    arr = [0] * N
    
    # iterate through each row within the provided table
    for row in np.arange(N):
        
        # strip the text from the cell references to construct (N X M) matrix
        arr[row] = [table.rows[row].cells[col].text.strip() for col in np.arange(M)]
        
    return pd.DataFrame(arr)

In [18]:
def readPDF(response:list) -> pd.DataFrame:
    """
    Function to transform AWS Textract object to a pdf
    :param response: AWS Textract response object
    """
    # in the event multiple tables detected on one page (concat them)
    catDF = []
    
    # format the Textract response type 
    doc = trp.Document(response)
    
    # iterate through document pages
    for i, page in enumerate(doc.pages):
        
        # itterate through page tables
        for j, table in enumerate(page.tables): 
            
            # convert trp-table into dataframe object
            df = trp2df(table)
            
            # remove columns that are completly empty
            empty_cols = [col for col in df.columns if (df[col] == '').all()]
            df = df.drop(empty_cols, axis=1)
            
            # number of columns in dataframe
            n = df.columns.size
            
            # reset the column names (avoid the column names)
            df.columns = np.arange(n)
            print(df)
            ##############################################################
            #                           NOTES
            #          a good dataframe should have 2-3 columns
            #      anything more or less is a reading error we ignore
            ##############################################################
            
            # if the dataframe has more than 3 columns then we most likley have an issue in parsing
            if n > 3:
                return None
            
            elif n > 1:
                
                ##############################
                # Balance Sheet Assummptions
                ##############################
                
                lineIndex = df.columns[0]

                # check for the word "cash" in a string at the begining, ignoring case sensitivity (asset check)
                assetCheck = df[lineIndex].str.contains('^Cash', regex=True, flags=re.IGNORECASE)

                # check for the word "Liabilities" in a string at the end, ignoring case sensitivity (liability check)
                debtCheck1 = df[lineIndex].str.contains('Liabilities$|^Liabilities', regex=True, flags=re.IGNORECASE)
                debtCheck2 = df[lineIndex].str.contains('Liability$|^Liability', regex=True, flags=re.IGNORECASE)
                
                # check for the presence of $ sign, we assume the balance sheet items should have presence of $ sign
                dollarCheck = df[df.columns[1]].str.contains('\$[^\]]+', regex=True, flags=re.IGNORECASE)
                
                ##############################
                # Balance Sheet Determination
                ##############################
                
                # check if the key words have been found 
                check1 = df[assetCheck | debtCheck1 | debtCheck2].empty      # check for terms, and $ presence
                check2 = df[dollarCheck == True].empty                       # check for presence of '$' sign  
                check3 = df[debtCheck1 == True].empty                        # debt check for Liabilities
                check4 = df[debtCheck2 == True].empty                        # debt check for Liability 
                
                # if either asset term or liability term is found, with a $ sign we append the dataframe
                if not check1 and not check2:
                    catDF.append(df)      # we append since sometimes asset and liablility tables are seperated 
                    
                    if not check3 or not check4:
                        # if liability table was found on the first iteration we simply concat data frames and return 
                        return pd.concat(catDF)
                

In [11]:
# discover all of the pdfs that you want to parse
paths = np.array(session.list_s3_files(bucket, data_folder))[1:]

In [12]:
# temporary data frame object for balance sheet information
res1 = runJob("ran-s3-systemic-risk", 'Input/X-17A-5-Subsets/91154-2003-subset.pdf')

Started job with id: 1044f06eee8a76ffb72b129e5474e5783b260f0681a750b286f8d89db7df5935
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page recieved: 4
Resultset page recieved: 5
Resultset page recieved: 6
Resultset page recieved: 7


In [19]:
readPDF(res1)

                  0                                                  1
0       SELECTED, X                                   (a) Facing page.
1                 X              (b) Statement of Financial Condition.
2         SELECTED,                           (c) Statement of Income.
3         SELECTED,                       (d) Statement of Cash Flows.
4         SELECTED,  (e) Statement of Changes in Stockholder's Equity.
5         SELECTED,  (f) Statement of Changes in Liabilities Subord...
6         SELECTED,  (g) Computation of Net Capital for Brokers and...
7       SELECTED, X  (h) Computation for Determination of Reserve R...
8         SELECTED,  (i) Information Relating to the Possession or ...
9                 -  (j) A Reconciliation, including appropriate ex...
10      SELECTED, X  (k) A Reconciliation between the audited and u...
11        SELECTED,                        (1) An Oath or Affirmation.
12                         (m) A copy of the SIPC Supplemental Report.
13  NO

Unnamed: 0,0,1,2
0,Assets:,,
1,Cash and cash equivalents,,$ 581
2,Cash and securities segregated and on deposit ...,,1998
3,Collateralized short-term financing agreements:,,
4,Securities purchased under agreements to resell,"$61,767",
5,Deposits paid for securities borrowed,42750,
6,,,104517
7,Financial instruments owned and contractual co...,,
8,(Approximately $16 billion were pledged to var...,,
9,U.S. government and government agency securities,35143,


In [107]:
# format the Textract response type 
doc = trp.Document(res1)

# iterate through document pages
for i, page in enumerate(doc.pages):
    # itterate through page tables
    for j, table in enumerate(page.tables): 
        print(i, j)
        # convert trp-table into dataframe object
        df = trp2df(table)
        
        # remove columns that are completly empty
        empty_cols = [col for col in df.columns if (df[col] == '').all()]
        df = df.drop(empty_cols, axis=1)

        # number of columns in dataframe
        n = df.columns.size
        
        # if the dataframe has more than 3 columns then we most likley have an issue in parsing
        if n > 3:
            print('Fail')

        elif n > 1:
            ##############################
            # Balance Sheet Assummptions
            ##############################

            lineIndex = df.columns[0]

            # check for the word "cash" in a string at the begining, ignoring case sensitivity (asset check)
            assetCheck = df[lineIndex].str.contains('^Cash', regex=True, flags=re.IGNORECASE)

            # check for the word "Liabilities" in a string at the end, ignoring case sensitivity (liability check)
            debtCheck1 = df[lineIndex].str.contains('Liabilities$|^Liabilities', regex=True, flags=re.IGNORECASE)
            debtCheck2 = df[lineIndex].str.contains('Liability$|^Liability', regex=True, flags=re.IGNORECASE)

            # check for the presence of $ sign, we assume the balance sheet items should have presence of $ sign
            dollarCheck = df[df.columns[1]].str.contains('\$[^\]]+', regex=True, flags=re.IGNORECASE)

            ##############################
            # Balance Sheet Determination
            ##############################

            # check if the key words have been found 
            check1 = df[assetCheck | debtCheck1 | debtCheck2].empty      # check for presence of any term 
            check2 = df[dollarCheck == True].empty                       # check for presence of '$' sign  
            check3 = df[debtCheck1 == True].empty                        # debt check for Liabilities
            check4 = df[debtCheck2 == True].empty                        # debt check for Liability
            
            print(check1, check2, check3, check4)
            # if either asset term or liability term is found, with a $ sign we append the dataframe
            if not check1 and not check2:
                print(df)      # we append since sometimes asset and liablility tables are seperated 
#                 print('Page {}, Table {}'.format(i, j))
#                 if not check2 or not check3:
#                     # if liability table was found on the first iteration we simply concat data frames and return 
#                     return pd.concat(catDF)

0 0
Fail
1 0
True True True True
2 0
True True True True
3 0
False False True True
                                                    0          1
0                           Cash and cash equivalents   $ 12,704
1   Cash and securities segregated for regulatory ...     69,669
2   Receivables from brokers, dealers and clearing...      9,842
3       Receivables from customers and counterparties     15,780
4                          Collateralized agreements:           
5   Securities borrowed (includes $61,182 at fair ...    209,997
6   Financial instruments purchased under agreemen...     55,958
7          Financial instruments owned, at fair value     76,822
8   Financial instruments owned and pledged as col...     20,122
9    Total financial instruments owned, at fair value     96,944
10                                                              
11                                       Other assets      5,596
12                                       Total assets  $ 476,490
3 1
Fal