In [55]:
%%bash
# upgrade pip itself before installing the notebook's extra packages
pip install --upgrade pip
# smart_open and minecart are both imported by the import cell of this notebook
pip install smart_open minecart
# presumably the distribution that provides the `trp` module imported below -- confirm
pip install textract-trp
# jupyterthemes supplies set_nb_theme, used by the styling cell
pip install jupyterthemes

Requirement already up-to-date: pip in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (20.2.4)


In [56]:
# create a darker background template (better for my eyes)
# purely cosmetic: applies the 'chesterish' jupyterthemes style to this notebook
from jupyterthemes.stylefx import set_nb_theme
set_nb_theme('chesterish')

In [57]:
import time 
import re
import os
import trp
import random 
import boto3
import minecart
import numpy as np
import pandas as pd

from smart_open import open
from sagemaker.session import Session
from io import BytesIO

%matplotlib inline

In [58]:
# initiate s3 bucket and corresponding data folder
# (the commented pair below is an earlier bucket/prefix setting kept for reference)
# bucket = "ran-s3-systemic-risk"
# data_folder ="Input/X-17A-5/"

bucket = "ran-s3-systemic-risk"
data_folder ="FOCUS-OCR/SubsetTest/"

# balance_sheet_idx = 3  # the page of the report you care about

# Amazon Textract client and Sagemaker session
# NOTE(review): the job helper functions below create their own Textract client,
# so `textract` is not referenced by any other visible cell
textract = boto3.client('textract')
# `s3` is used by the export loop to upload the generated .csv files
s3 = boto3.client('s3')
# `session` is used to list the object keys stored under a bucket prefix
session = Session()

# discover all of the pdfs that you want to parse
# NOTE(review): `paths` is not referenced by any other visible cell
paths = np.array(session.list_s3_files(bucket, data_folder))

In [59]:
# single hard-coded subset key; the commented alternative would instead draw
# one key at random from subsetFolder
# NOTE(review): testKey is not referenced by any other visible cell
# testKey = np.random.choice(subsetFolder, 1)
testKey = ['Input/SubSets/932233-02_subset.pdf']

# AWS Asynchronous Textract Script (requesting Job)
**Content adapted from the Amazon AWS Textract code-samples repository (see the [source script](https://github.com/aws-samples/amazon-textract-code-samples/blob/master/python/12-pdf-text.py)).**

In [60]:
def startJob(s3BucketName:str, objectName:str) -> str:
    """
    Submits a Textract table-analysis job for a single document stored in S3

    :param s3BucketName: name of the S3 bucket that holds the document
    :param objectName: key of the document inside that bucket
    :return: the "JobId" value of the Textract response for the submitted job
    """
    textractClient = boto3.client('textract')

    # where Textract should read the document from
    documentLocation = {'S3Object': {'Bucket': s3BucketName, 'Name': objectName}}

    # only the TABLES feature is requested for the analysis
    submission = textractClient.start_document_analysis(
        DocumentLocation=documentLocation,
        FeatureTypes=['TABLES'],
    )

    return submission["JobId"]

In [61]:
def isJobComplete(jobId:str) -> str:
    """
    Polls a queued Textract job until it is no longer in progress

    The job is queried every 5 seconds (the first query also waits 5 seconds)
    and each observed status is printed.

    :param jobId: identifier returned when the Textract job was started
    :return: the first "JobStatus" value that is not "IN_PROGRESS"
    """
    textractClient = boto3.client('textract')

    while True:
        # lag before asking the server for the current status
        time.sleep(5)
        jobStatus = textractClient.get_document_analysis(JobId=jobId)["JobStatus"]
        print("Job status: {}".format(jobStatus))

        # any status other than IN_PROGRESS ends the polling
        if jobStatus != "IN_PROGRESS":
            return jobStatus

In [62]:
def getJobResults(jobId:str) -> list:
    """
    Collects every result page of a Textract job

    :param jobId: identifier returned when the Textract job was started
    :return: list of the raw get_document_analysis responses, in the order received
    """
    textractClient = boto3.client('textract')
    collected = []                   # one entry per response page
    requestArgs = {'JobId': jobId}   # the first request carries no NextToken

    while True:
        batch = textractClient.get_document_analysis(**requestArgs)
        collected.append(batch)
        print("Resultset page recieved: {}".format(len(collected)))

        # a missing (or empty) NextToken means there is nothing left to fetch
        continuation = batch.get('NextToken')
        if not continuation:
            break
        requestArgs['NextToken'] = continuation

    return collected

In [63]:
def runJob(bucket:str, key:str) -> list:
    """
    Runs a complete Textract table-analysis job for one document in S3

    Starts the job, waits until it is no longer in progress and, when the job
    produced output, downloads every result page.

    :param bucket: name of the S3 bucket that holds the document
    :param key: key of the document inside that bucket
    :return: list of raw Textract response pages; an empty list when the job
             did not finish with a usable status
    """
    # S3 storage for files on AWS site
    jobId = startJob(bucket, key)   # initialize Textract job
    print("Started job with id: {}".format(jobId))

    # isJobComplete returns the final status string, which is always truthy,
    # so the status has to be compared explicitly before fetching results
    status = isJobComplete(jobId)
    if status not in ("SUCCEEDED", "PARTIAL_SUCCESS"):
        print("Job {} ended with status {}; no results retrieved".format(jobId, status))
        return []

    return getJobResults(jobId)

# OCR Wrapper Functions
**These functions take the response of an AWS Textract OCR job and convert its tabular data into DataFrames.**

In [64]:
def trp2df(table:trp.Table) -> pd.DataFrame:
    """
    Function designed to convert a trp table into a dataframe
    :param table: a trp table object parsed from a pdf; only `table.rows`,
                  each row's `cells` and each cell's `text` are read
    :return: a DataFrame object housing a textracted trp table, one row per
             table row and one column per cell position; an empty DataFrame
             when the table has no rows

    Complexity -> O(N * M) for N rows and M columns
    """
    # strip the text from the cell references, row by row
    rows = [[cell.text.strip() for cell in row.cells] for row in table.rows]

    # a table without rows has no first row to take the width from
    if not rows:
        return pd.DataFrame()

    # rows may differ in length; pad the short ones with empty strings so the
    # result is a rectangular (N X M) matrix and no cell text is dropped
    width = max(len(cells) for cells in rows)
    padded = [cells + [''] * (width - len(cells)) for cells in rows]

    return pd.DataFrame(padded)

In [65]:
def readPDF(response:list, key:str=None) -> pd.DataFrame:
    """
    Function to transform an AWS Textract response into a balance sheet DataFrame
    :param response: AWS Textract response object (the list of result pages)
    :param key: optional name of the source document; only used in the message
                printed when no balance sheet is found
    :return: DataFrame made of every detected table whose first non-empty
             column mentions 'cash', 'equity' or 'liabilities' (case
             insensitive), stacked in document order with a fresh row index;
             None when no such table exists
    """
    # key-words whose presence marks a table as balance sheet data
    keywords = ('cash', 'equity', 'liabilities')
    pattern = '|'.join(keywords)

    # in the event multiple tables are detected (concat them)
    catDF = []

    # format the Textract response type
    doc = trp.Document(response)

    # iterate through document pages
    for page in doc.pages:
        # iterate through page tables
        for table in page.tables:
            # convert trp-table into dataframe object
            df = trp2df(table)

            # remove columns that are completely empty (column 0 = line items)
            empty_cols = [col for col in df.columns if (df[col] == '').all()]
            df = df.drop(empty_cols, axis=1)

            # a table with no rows or no remaining columns has no line items
            if df.empty:
                continue

            # check the presence of key-words ignoring case sensitivity;
            # na=False treats missing cells as "no match"
            lineItems = df[df.columns[0]]
            matches = lineItems.str.contains(pattern, regex=True,
                                             flags=re.IGNORECASE, na=False)

            # we want to preserve only the balance sheet data
            if matches.any():
                catDF.append(df)

    # if no balance sheet was delivered, then return None
    if not catDF:
        source = key if key is not None else 'the supplied document'
        print("Balance sheet wasn't found for {}".format(source))
        return None

    # stack the matching tables into one frame
    return pd.concat(catDF, ignore_index=True)
    

## Extract Balance Sheet information

In [52]:
# list the object keys stored under the 'Input/SubSets/' prefix of the bucket
# NOTE(review): [1:] drops the first listed key -- presumably the folder placeholder object; confirm
# NOTE(review): this cell carries execution count In [52], lower than the helper cells above it;
# the export loop below depends on subsetFolder, so it must be run first on a fresh kernel
subsetFolder = np.array(session.list_s3_files(bucket, 'Input/SubSets/'))[1:]

In [53]:
subsetFolder

array(['Input/SubSets/1000148-02_subset.pdf',
       'Input/SubSets/1000148-03_subset.pdf',
       'Input/SubSets/1000148-04_subset.pdf', ...,
       'Input/SubSets/99947-17_subset.pdf',
       'Input/SubSets/99947-18_subset.pdf',
       'Input/SubSets/99947-19_subset.pdf'], dtype='<U35')

In [66]:
# script to perform OCR (using Textract) for X-17A-5 subsets
outFolder = 'Output/BalanceSheet/'
csvDirectory = np.array(session.list_s3_files(bucket, outFolder))
existingKeys = set(csvDirectory.tolist())   # keys already exported to s3
errorLog = []                               # base names that produced no .csv

# iterate through X-17A-5 subsets stored in s3
# NOTE(review): only the first two subsets are processed ([:2])
for key in subsetFolder[:2]:
    # baseFile name (CIK)-{Year}
    baseFile = key.split('/')[-1].split('_')[0]
    fileName = baseFile+'.csv'
    outKey = outFolder+fileName

    # skip documents whose .csv already exists in the output folder
    if outKey in existingKeys:
        print('{} has been downloaded'.format(fileName))
        continue

    # raw Textract result pages for the document (same bucket as configured above)
    res = runJob(bucket, key)

    # nothing came back from Textract (None or an empty list)
    if not res:
        print('{} could not be parsed'.format(baseFile))
        errorLog.append(baseFile)
        continue

    # balance sheet information, or None when no balance sheet was detected
    tempDF = readPDF(res)

    if not isinstance(tempDF, pd.DataFrame):
        print('{} no Balance Sheet found'.format(baseFile))
        errorLog.append(baseFile)
        continue

    # save contents to AWS S3 bucket straight from memory, so no local
    # temporary file is left behind if the upload fails
    s3.put_object(Bucket=bucket, Key=outKey, Body=tempDF.to_csv().encode('utf-8'))
    print('\tSaved {} file to s3 bucket'.format(fileName))

print('==========================\nX-17A-5 OCR is completed')

Started job with id: ec23c583bc4d0715949fc1c36cf09ca5f723fcff984e248d92bfadc609b7bc14
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
	Saved 1000148-02.csv file to s3 bucket
Started job with id: 44f2a20d6b62763c4e2d1676aaddadf35aafe0bd4dce795c9443e86d29c7537b
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
	Saved 1000148-03.csv file to s3 bucket
X-17A-5 OCR is completed


# Helper Functions

In [124]:
def cleanNumeric(value:str) -> float:
    """
    This function converts a string to a numeric quantity, handles weird string format
    :param: value, string value with hidden numeric quantity
    :return: floating point value built from the digits and decimal points of
             `value`; 0.0 when `value` contains no digit at all
    :raises ValueError: when the retained characters do not form a number
                        (e.g. more than one decimal point)

    Complexity -> O(n)

    e.g.
        In[0]: $ 19,225     ->   Out[0]: 19225.0
        In[0]: $ 19,225.76  ->   Out[0]: 19225.76
        In[0]: 19,225 USD   ->   Out[0]: 19225.0
    """
    # retain only numeric related elements; everything else (currency signs,
    # spaces, comma separators, trailing text) is dropped rather than replaced,
    # so surrounding characters cannot change the magnitude of the number
    # NOTE(review): sign markers such as '-' or parentheses are discarded too
    kept = "".join(ch for ch in value if ch == '.' or ch.isdecimal())

    # if no digit survived (empty or purely textual input), simply return zero
    if not any(ch.isdecimal() for ch in kept):
        return 0.0

    return float(kept)

In [13]:
def isfloat(value) -> bool:
    """
    This function checks whether an input can be cast to a float
    :param value: any object, typically a string
    :return: True when float(value) succeeds, False otherwise
    """
    try:
        # float() rather than int(), so decimal strings such as "1.5" qualify
        float(value)
        return True
    except (ValueError, TypeError):
        # ValueError: text that is not a number; TypeError: e.g. None
        return False

In [151]:
# temporary data frame object for balance sheet information
# run a Textract job for one subset and keep what runJob returns in `res`
# (the Textract result pages, not a data frame); testCase(res) below consumes it
res = runJob("ran-s3-systemic-risk", 'Input/SubSets/7470-04_subset.pdf')

Started job with id: 39c6ca69ad4347d128a932a6914c278f410da7d7085366b999d4868a33b75b1c
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page recieved: 4


In [164]:
def testCase(res):
    """
    Scratch check of the balance sheet key-word detection on one Textract response

    Every table whose first non-empty column has a row starting with 'Cash' or
    ending with 'Liabilities' (case insensitive) is printed.

    :param res: Textract response pages, as accepted by trp.Document
    :return: 0 as soon as a printed table contains a row ending with
             'Liabilities'; None when no such table is encountered
    """
    # iterate through document pages and the tables on each page
    for page in trp.Document(res).pages:
        for table in page.tables:
            # convert trp-table into dataframe object
            frame = trp2df(table)

            # remove columns that are completely empty (column 0 = line items)
            blankCols = [c for c in frame.columns if (frame[c] == '').all()]
            frame = frame.drop(blankCols, axis=1)
            firstCol = frame[frame.columns[0]]

            # key-word flags per row, ignoring case sensitivity
            startsWithCash = firstCol.str.contains('^Cash', regex=True, flags=re.IGNORECASE)
            endsWithLiabilities = firstCol.str.contains('Liabilities$', regex=True, flags=re.IGNORECASE)

            # neither key-word present: not a table of interest
            if frame[startsWithCash | endsWithLiabilities].empty:
                continue

            print(frame)

            # stop at the first printed table that holds a liabilities row
            if not endsWithLiabilities[endsWithLiabilities == True].empty:
                return 0

In [165]:
testCase(res)

                                                    0                 1  \
0                                         Liabilities  A.I. Liabilities   
1                    24. Notes and mortgages payable:                     
2                                        A. Unsecured            $ 1210   
3                                          B. Secured         v 1211 25   
4   25. Liabilities subordinated to claims of gene...                     
5                             1. from outsiders $ 970                     
6       2. Includes equity subordination (15c3-1 (d))                     
7                                            of $ 980                     
8          B. Securities borrowings, at market value;                     
9                                from outsiders $ 990                     
10  C. Pursuant to secured demand note collateral ...                     
11                           1. from outsiders $ 1000                     
12  2. Includes equity su

0

In [114]:
# subset keys listed as bare string expressions -- presumably candidates for manual
# spot checks; nothing is assigned, and only the last expression is echoed as output
'Input/SubSets/7470-04_subset.pdf'
'Input/SubSets/1002201-02_subset.pdf'
'Input/SubSets/1001362-11_subset.pdf'
'Input/SubSets/1000148-02_subset.pdf'

'Input/SubSets/1001362-11_subset.pdf'