In [3]:
%%bash
# Install the third-party packages used with this notebook.
# Versions are pinned to the releases recorded in this cell's last captured
# install log, so that a fresh environment resolves the same packages.
pip install --upgrade pip
pip install smart_open==4.1.2 minecart==0.3.0
pip install textract-trp==0.1.3
pip install jupyterthemes==0.20.0

Collecting pip
  Using cached pip-21.0-py3-none-any.whl (1.5 MB)
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 20.3
    Uninstalling pip-20.3:
      Successfully uninstalled pip-20.3
Successfully installed pip-21.0
Collecting smart_open
  Downloading smart_open-4.1.2-py3-none-any.whl (111 kB)
Collecting minecart
  Downloading minecart-0.3.0-py3-none-any.whl (23 kB)
Collecting pdfminer3k
  Downloading pdfminer3k-1.3.4-py3-none-any.whl (100 kB)
Installing collected packages: pdfminer3k, smart-open, minecart
Successfully installed minecart-0.3.0 pdfminer3k-1.3.4 smart-open-4.1.2
Collecting textract-trp
  Downloading textract_trp-0.1.3-py3-none-any.whl (5.8 kB)
Installing collected packages: textract-trp
Successfully installed textract-trp-0.1.3
Collecting jupyterthemes
  Downloading jupyterthemes-0.20.0-py2.py3-none-any.whl (7.0 MB)
Collecting lesscpy>=0.11.2
  Downloading lesscpy-0.14.0-py2.py3-none-any.whl (46 kB)
Installing collecte

In [13]:
import time 
import re
import os
import trp
import boto3
import minecart
import logging 

import numpy as np
import pandas as pd

from smart_open import open
from sagemaker.session import Session

In [14]:
# S3 bucket used by this notebook and the prefix holding the input pdfs
bucket = 'ran-s3-systemic-risk'
data_folder = 'Input/X-17A-5-Subsets/'

# S3 prefix under which the extracted balance sheet .csv files are stored
out_folder = "Output/X-17A-5-BS/"

# AWS handles: Textract client, S3 client and SageMaker session
textract = boto3.client("textract")
s3 = boto3.client("s3")
session = Session()

# AWS Asynchronous Textract Script (requesting Job)
**Content adapted from the Amazon Textract code samples repository (see [12-pdf-text.py](https://github.com/aws-samples/amazon-textract-code-samples/blob/master/python/12-pdf-text.py)).**

In [7]:
def startJob(s3BucketName:str, objectName:str) -> str:
    """
    Submits a Textract table-analysis job for a single S3 object
    :param s3BucketName: name of the S3 bucket holding the document
    :param objectName: key of the document within that bucket
    :return: the "JobId" field of the Textract response
    """
    textractClient = boto3.client('textract')

    # location of the document Textract should read
    documentLocation = {'S3Object': {'Bucket': s3BucketName, 'Name': objectName}}

    # request table extraction only, then hand back the job identifier
    submission = textractClient.start_document_analysis(
        DocumentLocation=documentLocation,
        FeatureTypes=['TABLES'],
    )
    return submission["JobId"]

In [8]:
def isJobComplete(jobId:str) -> str:
    """
    Polls Textract until the queued job is no longer "IN_PROGRESS"
    :param jobId: identifier of the Textract job to watch
    :return: the first "JobStatus" value reported that is not "IN_PROGRESS"
    """
    time.sleep(1)                       # short pause before the first poll
    textractClient = boto3.client('textract')

    while True:
        currentStatus = textractClient.get_document_analysis(JobId=jobId)["JobStatus"]
        print("Job status: {}".format(currentStatus))

        # any status other than IN_PROGRESS ends the wait
        if currentStatus != "IN_PROGRESS":
            return currentStatus

        time.sleep(5)                   # wait 5 seconds between later polls

In [9]:
def getJobResults(jobId:str) -> list:
    """
    Collects every result page of a Textract job
    :param jobId: identifier of the Textract job to read
    :return: list of response objects, one per result page, in the order received
    """
    textractClient = boto3.client('textract')
    pages = []

    # the first request carries no token; later ones follow 'NextToken'
    response = textractClient.get_document_analysis(JobId=jobId)

    while True:
        pages.append(response)
        print("Resultset page recieved: {}".format(len(pages)))

        # a missing (or empty) token means this was the last page
        nextToken = response['NextToken'] if 'NextToken' in response else None
        if not nextToken:
            return pages

        response = textractClient.get_document_analysis(JobId=jobId, NextToken=nextToken)

In [10]:
def runJob(bucket:str, key:str):
    """
    Runs a Textract job on one S3 object and returns its result pages
    :param bucket: name of the S3 bucket holding the document
    :param key: key of the document within that bucket
    :return: the list of response pages produced by getJobResults; the job may
             have failed, so callers should inspect the 'JobStatus' field
    """
    jobId = startJob(bucket, key)   # initialize Textract job
    print("Started job with id: {}".format(jobId))

    # wait until the job is no longer in progress; isJobComplete returns a
    # status string rather than a boolean, so it is not used as a condition
    isJobComplete(jobId)

    # always fetch the pages so the return value is defined for every status
    return getJobResults(jobId)

# OCR Wrapper Functions
**These functions take the response of an AWS Textract OCR job and convert its tabular data into DataFrames.**

In [15]:
def trp2df(table:trp.Table) -> pd.DataFrame:
    """
    Function designed to convert a trp table into a dataframe
    :param table: a trp table object parsed from a pdf  
    :return: a DataFrame object housing a textracted trp table; one row per
             table row, one column per cell position, whitespace stripped
    
    Rows shorter than the widest row are padded with empty strings, and a
    table without rows yields an empty DataFrame.
    
    Complexity -> O(N x M) for N rows and M columns
    """
    # read every row through its own cell list, so the result does not depend
    # on the first row existing or on all rows having the same width
    rows = [[cell.text.strip() for cell in row.cells] for row in table.rows]
    
    # the widest row sets the column count (0 when the table has no rows)
    width = max((len(row) for row in rows), default=0)
    
    # pad short rows with '' to construct a rectangular (N X M) matrix
    arr = [row + [''] * (width - len(row)) for row in rows]
    
    return pd.DataFrame(arr)

In [16]:
def readPDF(response:list) -> pd.DataFrame:
    """
    Function to gather the balance sheet tables of an AWS Textract response
    :param response: AWS Textract response object
    :return: the tables mentioning cash or liabilities, concatenated in document
             order up to and including the first table that mentions liabilities;
             None when no table mentioning liabilities is reached
    """
    # tables kept so far (assets may sit in a table before the liabilities)
    matched = []
    
    for page in trp.Document(response).pages:
        for table in page.tables: 
            
            # dataframe form of the trp-table, without its all-blank columns
            frame = trp2df(table)
            blankCols = [col for col in frame.columns if (frame[col] == '').all()]
            frame = frame.drop(blankCols, axis=1)
            
            # nothing left to inspect in this table
            if frame.values.size == 0:
                continue
            
            # the first remaining column is searched for the key words
            labels = frame[frame.columns[0]]
            
            # "cash" at the start of a label, ignoring case (asset check)
            hasCash = labels.str.contains('^Cash', regex=True, flags=re.IGNORECASE)
            
            # "liabilities" / "liability" at the start or end of a label, ignoring case
            hasLiabilities = labels.str.contains('Liabilities$|^Liabilities', 
                                                 regex=True, flags=re.IGNORECASE)
            hasLiability = labels.str.contains('Liability$|^Liability', 
                                               regex=True, flags=re.IGNORECASE)
            
            # skip tables in which none of the key words were found
            if frame[hasCash | hasLiabilities | hasLiability].empty:
                continue
            matched.append(frame)
            
            # a liabilities table ends the search; stopping here keeps split
            # tables (assets separate from liabilities) together in one result
            if hasLiabilities.any() or hasLiability.any():
                return pd.concat(matched)
    
    return None
                

## Extract Balance Sheet information

In [21]:
def textractParse(path:str, index:int, csvDirectory:np.ndarray, bucket:str = "ran-s3-systemic-risk", 
                  out_folder:str = 'Output/X-17A-5-BS/'):
    """
    Function runs a Textract job and saves Balance Sheet information to .csv file in s3 folder 
    :param path: S3 key of the pdf to parse, e.g. a key whose file name starts with 782124-2002
    :param index: position of the file in the caller's iteration (not used here)
    :param csvDirectory: S3 keys of the .csv files that already exist
    :param bucket: S3 bucket that is read from and written to
    :param out_folder: S3 prefix under which the .csv file is saved
    :return: the Textract response pages from runJob, or None when the .csv
             already exists and no job was run
    """
    
    # baseFile name to name export .csv file e.g. 782124-2002.csv
    baseFile = '-'.join(path.split('/')[-1].split('-')[:2])
    fileName = baseFile + '.csv'
    print('\nPerforming OCR for {}'.format(baseFile))

    # file already exported: no job is run, so there is no response to return
    if out_folder + fileName in csvDirectory:
        print('{} has been downloaded'.format(fileName))
        return None

    # run the job against the bucket supplied by the caller
    res = runJob(bucket, path)
    
    # a failed Textract job has nothing to extract
    if res[0]['JobStatus'] == 'FAILED':
        print('Could not parse {}, JOB FAILED'.format(baseFile))
        return res
    
    # temporary data frame object for balance sheet information
    tempDF = readPDF(res)
    print(tempDF)
    
    # readPDF returns None when it finds no balance sheet
    if not isinstance(tempDF, pd.DataFrame):
        print('No Balance Sheet found in {}'.format(baseFile))
        return res
    
    try:
        # writing data frame to a local .csv file
        tempDF.to_csv(fileName, index=False)

        # save contents to AWS S3 bucket
        with open(fileName, 'rb') as data:
            s3.put_object(Bucket=bucket, Key=out_folder + fileName, Body=data)
    finally:
        # remove the local file even when the write or upload fails
        if os.path.exists(fileName):
            os.remove(fileName)
    
    print('-----------------------------------------------------')
    print('Saved {} file to s3 bucket'.format(baseFile + '.csv'))
    
    return res

In [22]:
# keys of the .csv files already present in the output folder
csvs = np.array(session.list_s3_files(bucket, out_folder))

# keys listed under the input folder, skipping the first entry
# (presumably the folder's own key -- TODO confirm)
paths = np.array(session.list_s3_files(bucket, data_folder))[1:]

# parse the X-17A-5 subsets stored in s3 (currently only the first listed file)
for i, key in enumerate(paths[:1]):
    responseVal = textractParse(key, i, csvs)


Performing OCR for 782124-2002
Started job with id: 8fed04d011c8245e30d4ae583c26a9b2b85edfe74bd15ac4d1dddebbb29d7b96
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page recieved: 4
Resultset page recieved: 5
Resultset page recieved: 6
Resultset page recieved: 7
                                                    0              1
0                                              ASSETS               
1                           Cash and cash equivalents      $ 222,336
2   Cash and securities deposited with clearing or...   