In [37]:
%%bash
# refresh pip itself before installing the extra packages this notebook needs
pip install --upgrade pip
# smart_open and minecart are both imported by the import cell below
pip install smart_open minecart
# presumably the package that supplies the `trp` module imported below -- TODO confirm
pip install textract-trp

Requirement already up-to-date: pip in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (20.2.4)


In [76]:
import time 
import os
import trp
import random 
import boto3
import minecart
import numpy as np
import pandas as pd

from smart_open import open
from sagemaker.session import Session
from io import BytesIO

%matplotlib inline

In [119]:
# S3 bucket and the folder (key prefix) holding the PDFs to parse
# alternative input location kept for reference:
# bucket = "ran-s3-systemic-risk"
# data_folder ="Input/X-17A-5/"

bucket = "ran-s3-systemic-risk"
data_folder ="FOCUS-OCR/SubsetTest/"

# balance_sheet_idx = 3  # the page of the report you care about

# Amazon Textract client, S3 client and Sagemaker session
textract = boto3.client('textract')
s3 = boto3.client('s3')
session = Session()

# discover all of the pdfs that you want to parse; the raw listing can also
# contain the folder placeholder key itself, so keep only keys ending in .pdf
paths = np.array([path for path in session.list_s3_files(bucket, data_folder)
                  if path.lower().endswith('.pdf')])

# number of PDFs to sample for the test run
subset = 1

# sample without replacement so the same PDF is never submitted twice,
# capping the sample size at the number of PDFs actually available
test_key = np.random.choice(paths, min(subset, len(paths)), replace=False)

In [40]:
# show the key(s) drawn at random from the S3 listing in the configuration cell
test_key

array(['FOCUS-OCR/SubsetTest/CREDIT SUISSE SECURITIES (USA) LLC03.pdf'],
      dtype='<U79')

In [41]:
# content modified from Amazon AWS Textract repository (refer to URL below)
# https://github.com/aws-samples/amazon-textract-code-samples/blob/master/python/12-pdf-text.py

def startJob(s3BucketName:str, objectName:str) -> str:
    """
    Submit a Textract document-analysis job (table extraction) for one S3 object

    :param s3BucketName: name of the S3 bucket that holds the document
    :param objectName: key of the document inside that bucket
    :return: the JobId that Textract reports for the submitted job
    """
    # location of the document, in the shape Textract expects
    documentLocation = {'S3Object': {'Bucket': s3BucketName, 'Name': objectName}}

    # ask Textract to analyse the document for tables only
    textractClient = boto3.client('textract')
    submission = textractClient.start_document_analysis(
        DocumentLocation=documentLocation,
        FeatureTypes=['TABLES'],
    )

    # the job id is what the polling and result functions need later
    return submission["JobId"]

def isJobComplete(jobId:str) -> str:
    """
    Poll a Textract job until it is no longer IN_PROGRESS

    Waits 5 seconds before every status request and prints each status as it
    is read, followed by the final status once polling stops.

    :param jobId: identifier returned when the job was started
    :return: the last JobStatus string reported for the job; note that this
             is a non-empty string (hence truthy) whatever the outcome, so
             callers should compare it against the status they expect
    """
    textractClient = boto3.client('textract')

    while True:
        # lag before asking the server for the current status
        time.sleep(5)
        jobStatus = textractClient.get_document_analysis(JobId=jobId)["JobStatus"]
        print("Job status: {}".format(jobStatus))

        # any status other than IN_PROGRESS ends the polling
        if jobStatus != "IN_PROGRESS":
            break

    print('Final status is {}'.format(jobStatus))
    return jobStatus

def getJobResults(jobId:str) -> list:
    """
    Collect every result page of a Textract job

    Requests the job's analysis output and keeps following the 'NextToken'
    of each response until a response carries no further token, printing the
    running page count after each request.

    :param jobId: identifier of the Textract job to read
    :return: list of the raw response dictionaries, in the order received
    """
    textractClient = boto3.client('textract')

    collected = []                    # one entry per response received
    request = {'JobId': jobId}        # first request carries no token

    while True:
        batch = textractClient.get_document_analysis(**request)
        collected.append(batch)
        print("Resultset page recieved: {}".format(len(collected)))

        # a missing (or empty) token means there is nothing left to fetch
        token = batch.get('NextToken')
        if not token:
            break
        request['NextToken'] = token

    return collected


In [7]:
# Start a Textract job for every sampled PDF stored in the S3 bucket
for key in test_key:
    jobId = startJob(bucket, key)
    print("Started job with id: {}".format(jobId))

    # isJobComplete returns the final status string, which is truthy even for
    # "FAILED"; compare explicitly so results are only read from usable jobs
    status = isJobComplete(jobId)
    if status in ("SUCCEEDED", "PARTIAL_SUCCESS"):
        # NOTE: `response` keeps only the pages of the last usable job
        response = getJobResults(jobId)
    else:
        print("Textract job {} for {} ended with status {}".format(jobId, key, status))

Started job with id: cb9cccd88f569fab29f3276ef9ba667a65d42d855e2835d0594e2bf5de0d51a2
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1


In [9]:
# wrap the `response` collected by the Textract job cell in a trp Document,
# whose pages/tables/rows/cells are walked by the cells further down
# (relies on that job cell having been run first)
doc = trp.Document(response)

In [42]:
def trp2df(table:trp.Table) -> pd.DataFrame:
    """
    Function designed to convert a trp table into a dataframe
    :param: a trp table object parsed from a pdf
    :return: a DataFrame object housing a textracted trp table; one DataFrame
             row per table row, one column per cell, cell text stripped of
             surrounding whitespace

    A table without rows yields an empty DataFrame, and rows that carry fewer
    cells than the widest row are padded with missing values by pandas.

    Complexity -> O(N * M) for N rows of up to M cells
    """
    # read the cells each row actually has, rather than sizing every row by
    # the first one (which failed on empty tables and on shorter rows, and
    # silently dropped the extra cells of longer rows)
    arr = [[cell.text.strip() for cell in row.cells] for row in table.rows]

    return pd.DataFrame(arr)

In [43]:
def readPDF(bucket:str, key:str) -> pd.DataFrame:
    """
    Function to call AWS textract and perform OCR

    Runs a Textract table-analysis job on one PDF and returns the first table
    that looks like a balance sheet, i.e. whose first column holds a cell
    reading "cash" or "cash and cash equivalents" (case and surrounding
    whitespace are ignored).

    :param bucket: name of the S3 bucket holding the PDF
    :param key: S3 key of the PDF inside the bucket
    :return: DataFrame of the matching table, or None when the job did not
             finish successfully, parsing failed, or no table matched
    """
    # first-column labels that mark a table as the balance sheet
    keywords = {'cash', 'cash and cash equivalents'}

    # S3 storage for files on AWS site
    jobId = startJob(bucket, key)   # initialize Textract job
    print("Started job with id: {}".format(jobId))

    # isJobComplete returns the final status string, which is always truthy,
    # so the status has to be compared explicitly before reading results
    status = isJobComplete(jobId)
    if status not in ("SUCCEEDED", "PARTIAL_SUCCESS"):
        print('Process Failed: Textract job ended with status {}'.format(status))
        print("Balance sheet wasn't found for {}".format(key))
        return None

    try:
        # format the Textract response type
        doc = trp.Document(getJobResults(jobId))

        # iterate through document pages and the tables on each page
        for page in doc.pages:
            for table in page.tables:
                # convert trp-table into dataframe object
                df = trp2df(table)

                # a table without rows or columns cannot hold the keyword
                if df.empty:
                    continue

                # normalise the first column once, then look for a keyword;
                # we want to preserve only the balance sheet data
                labels = df.iloc[:, 0].astype(str).str.strip().str.lower()
                if labels.isin(keywords).any():
                    return df
    except Exception as err:
        # keep the best-effort behaviour, but report what actually went wrong
        print('Process Failed: {}'.format(err))

    print("Balance sheet wasn't found for {}".format(key))
    return None
    

In [44]:
def cleanNumeric(value:str) -> float:
    """
    This function converts a string to a numeric quantity, handles weird string format
    :param: value, string value with hidden numeric quantity
    :return: floating point value; 0.0 when the string holds no digits

    Currency symbols, thousands separators, whitespace and any other
    non-numeric characters are dropped (not replaced), so they can no longer
    shift the digits. A leading minus sign or an opening parenthesis
    (accounting notation) marks the value as negative.

    :raises ValueError: if the remaining digits do not form one number,
                        e.g. when the text contains two decimal points

    Complexity -> O(n)

    e.g.
        In[0]: $ 19,225     ->   Out[0]: 19225.0
        In[0]: $ 19,225.76  ->   Out[0]: 19225.76
        In[0]: (1,234)      ->   Out[0]: -1234.0
        In[0]: (empty)      ->   Out[0]: 0.0
    """
    digits = '0123456789'

    # keep only the characters that can carry numeric meaning
    kept = ''.join(elm for elm in value if elm in digits + '.-()')

    # "-123" and "(123)" both denote a negative quantity
    negative = kept.startswith(('-', '('))

    # the number itself is made of digits and the decimal point only
    number = ''.join(elm for elm in kept if elm in digits + '.')

    # nothing numeric left (empty cell, dash placeholder, plain text, ...)
    if not any(elm in digits for elm in number):
        return 0.0

    result = float(number)
    return -result if negative else result

In [124]:
# stripped cell texts, one inner list per table row across the whole document
arr = []

# walk the document page by page, table by table, row by row
all_rows = (row for page in doc.pages for table in page.tables for row in table.rows)

for row in all_rows:
    # echo the raw row so the parsed content can be checked by eye
    print(row)
    # strip the text from the cell references
    arr.append([cell.text.strip() for cell in row.cells])

[CASH ][][$ 7,166,101 ]
[MONEY MARKET FUND ][][123,000,000 ]
[RECEIVABLES: ][][]
[Customers ][$ 93,872,303 ][]
[Brokers, dealers and clearing organizations ][20,979,886 ][]
[Securities borrowed ][17,752,700 ][]
[Miscellaneous ][5,020,980 ][137,625,869 ]
[TRADING SECURITIES: ][][]
[U.S. Government and agency ][28,946,455 ][]
[Corporate debt ][22,490,530 ][]
[State and municipal obligations ][10,412,256 ][]
[Equity ][1,431,506 ][63,280,747 ]
[INVESTMENTS ][][14,838,726 ]
[SECURED DEMAND NOTES (collateral market value, $176,571,872) ][][58,627,500 ]
[OFFICE FURNISHINGS, EQUIPMENT AND LEASEHOLD IMPROVEMENTS, at cost less accumulated depreciation of $39,850,536 ][][26,934,931 ]
[OTHER ASSETS ][][5,938,718 ]
[][][$ 437,412,592 ]
[LIABILITIES AND PRINCIPAL ][CAPITAL ][]
[PAYABLES: ][][]
[Principals ][$ 202,639,066 ][]
[Customers ][51,425,532 ][]
[Brokers, dealers and clearing organizations ][16,745,789 ][]
[Miscellaneous ][23,525,701 ][$ 294,336,088 ]
[SECURITIES SOLD, not yet purchased ][][6

In [206]:
# build the frame straight from the list of rows: pandas pads rows of unequal
# length, whereas the numpy round-trip needs every row to have the same width
pd.DataFrame(arr).head()

Unnamed: 0,0,1,2
0,CASH,,"$ 7,166,101"
1,MONEY MARKET FUND,,123000000
2,RECEIVABLES:,,
3,Customers,"$ 93,872,303",
4,"Brokers, dealers and clearing organizations",20979886,


## Asynchronous over Multiple PDFs

In [53]:
# list every S3 key stored under the 'Input/SubSets/' prefix of the bucket
subsetFolder = np.array(session.list_s3_files(bucket, 'Input/SubSets/'))

In [54]:
# inspect the listing (numpy abbreviates long arrays in the displayed repr)
subsetFolder

array(['Input/SubSets/', 'Input/SubSets/1000148-02_subset.pdf',
       'Input/SubSets/1000148-03_subset.pdf', ...,
       'Input/SubSets/99947-17_subset.pdf',
       'Input/SubSets/99947-18_subset.pdf',
       'Input/SubSets/99947-19_subset.pdf'], dtype='<U35')

In [45]:
# keys to process in the batch loop below: a single fixed PDF is pinned here;
# the commented line is the random-sampling alternative
# testKey = np.random.choice(subsetFolder, 1)
testKey = ['Input/SubSets/932233-02_subset.pdf']

In [46]:
# confirm which key(s) the batch loop below will iterate over
testKey

['Input/SubSets/932233-02_subset.pdf']

In [120]:
# S3 prefix under which the extracted balance sheets are stored
outputFolder = 'Output/BalanceSheet/'

# keys already present under the output prefix, so finished PDFs are skipped
# (listed from the output prefix itself, not from a leftover loop variable)
csvDirectory = np.array(session.list_s3_files(bucket, outputFolder))
existingKeys = set(csvDirectory.tolist())

for key in testKey:
    # baseFile name (CIK)-{Year}
    baseFile = key.split('/')[-1].split('_')[0]
    fileName = baseFile+'.csv'

    # full S3 key of the output file; this is what the listing contains
    subFolder = outputFolder+fileName

    # if file is not found in directory
    if subFolder not in existingKeys:

        # temporary data frame object for balance sheet information
        tempDF = readPDF(bucket, key)

        # readPDF returns None when nothing was found, so test for None
        # before touching any DataFrame attribute
        if (tempDF is not None) and (not tempDF.empty):
            # save contents to AWS S3 bucket; uploading the CSV text directly
            # avoids leaving a local file behind if the upload fails
            s3.put_object(Bucket=bucket, Key=subFolder,
                          Body=tempDF.to_csv().encode('utf-8'))
            print('\tSaved {} file to s3 bucket'.format(fileName))
        else:
            print('{} had no balance sheet'.format(baseFile))
            

Started job with id: 3b90f9f838db76c338dc9d518035e7417679425f9f705842bab8dbee60b80859
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Final status is SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
	Saved 932233-02.csv file to s3 bucket
