In [1]:
%%bash
pip install --upgrade pip
pip install smart_open minecart
pip install textract-trp
pip install jupyterthemes

Collecting pip
  Using cached pip-20.2.4-py2.py3-none-any.whl (1.5 MB)
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 20.0.2
    Uninstalling pip-20.0.2:
      Successfully uninstalled pip-20.0.2
Successfully installed pip-20.2.4
Collecting smart_open
  Downloading smart_open-3.0.0.tar.gz (113 kB)
Collecting minecart
  Downloading minecart-0.3.0-py3-none-any.whl (23 kB)
Collecting pdfminer3k
  Downloading pdfminer3k-1.3.4-py3-none-any.whl (100 kB)
Building wheels for collected packages: smart-open
  Building wheel for smart-open (setup.py): started
  Building wheel for smart-open (setup.py): finished with status 'done'
  Created wheel for smart-open: filename=smart_open-3.0.0-py3-none-any.whl size=107097 sha256=d169f11e512a82618f98399b77761bc31f5dee03bec839514f33435c87cdcc10
  Stored in directory: /home/ec2-user/.cache/pip/wheels/88/2a/d4/f2e9023989d4d4b3574f268657cb6cd23994665a038803f547
Successfully built smart-open
Installing coll

In [2]:
# create a darker background template (better for my eyes)
from jupyterthemes.stylefx import set_nb_theme
set_nb_theme('chesterish')

In [3]:
import time 
import re
import os
import trp
import random 
import boto3
import minecart
import numpy as np
import pandas as pd

from smart_open import open
from sagemaker.session import Session
from io import BytesIO

%matplotlib inline

In [4]:
# initiate s3 bucket and corresponding data folder
# bucket = "ran-s3-systemic-risk"
# data_folder ="Input/X-17A-5/"

bucket = "ran-s3-systemic-risk"
data_folder ="FOCUS-OCR/SubsetTest/"

# balance_sheet_idx = 3  # the page of the report you care about

# Amazon Textract client and Sagemaker session
textract = boto3.client('textract')
s3 = boto3.client('s3')
session = Session()

# discover all of the pdfs that you want to parse
paths = np.array(session.list_s3_files(bucket, data_folder))

# AWS Asynchronous Textract Script (requesting Job)
**Content modified from Amazon AWS Textract repository (refer to [URL](https://github.com/aws-samples/amazon-textract-code-samples/blob/master/python/12-pdf-text.py) below)** 

In [5]:
def startJob(s3BucketName:str, objectName:str) -> str:
    """
    Starts a Textract job on AWS server 
    """
    response = None
    client = boto3.client('textract')
    
    # issue response to AWS to start Textract job for table analysis 
    response = client.start_document_analysis(
        DocumentLocation={
            'S3Object': {
                'Bucket': s3BucketName,
                'Name': objectName
            }
        },
        FeatureTypes=['TABLES']
    )
    
    # return response job ID for service
    return response["JobId"]

In [6]:
def isJobComplete(jobId:str) -> str:
    """
    Tracks the completion status of the Textract job when qued
    """
    time.sleep(5)
    client = boto3.client('textract')
    response = client.get_document_analysis(JobId=jobId)
    status = response["JobStatus"]
    print("Job status: {}".format(status))
    
    # check current status of AWS job (ask server every 5 seconds for data)
    while(status == "IN_PROGRESS"):
        time.sleep(5)                   # lag before reporting status
        response = client.get_document_analysis(JobId=jobId)
        status = response["JobStatus"]
        print("Job status: {}".format(status))
    
    return status

In [7]:
def getJobResults(jobId:str) -> list:
    """
    Returns the contents of the Textract job, after completion status met
    """
    pages = []          # initialize list object to track pages

    client = boto3.client('textract')
    response = client.get_document_analysis(JobId=jobId)
    
    pages.append(response)
    print("Resultset page recieved: {}".format(len(pages)))
    
    nextToken = None
    if('NextToken' in response):
        nextToken = response['NextToken']
    
    # iterate through the pages and append to response figure
    while(nextToken):
        response = client.get_document_analysis(JobId=jobId, NextToken=nextToken)
        pages.append(response)
        print("Resultset page recieved: {}".format(len(pages)))
        nextToken = None
        if('NextToken' in response):
            nextToken = response['NextToken']

    return pages

In [8]:
def runJob(bucket:str, key:str):
    """
    Function designed to call an AWS Textract 
    """
    # S3 storage for files on AWS site   
    jobId = startJob(bucket, key)   # intialize Textract job 
    print("Started job with id: {}".format(jobId))

    # if job is complete from AWS return response object 
    if(isJobComplete(jobId)):
        response = getJobResults(jobId)
        
    return response

# OCR Wrapper Functions
**The scripts perform an OCR job from AWS Textract, converting tabular data into dataframes**

In [9]:
def trp2df(table:trp.Table) -> pd.DataFrame:
    """
    Function designed to convert a trp table into a dataframe
    :param table: a trp table object parsed from a pdf  
    :return: a DataFrame object housing a textracted trp table
    
    Complexity -> O(n^2)
    """
    N = len(table.rows)               # number of rows in table
    M = len(table.rows[0].cells)      # number of columns in table
    arr = [0]*N
    
    # itterate through each row within the provided table
    for row in range(N):
        
        # strip the text from the cell references to construct (N X M) matrix
        arr[row] = [table.rows[row].cells[col].text.strip() for col in range(M)]
        
    return pd.DataFrame(arr)

In [43]:
def readPDF(response:list) -> pd.DataFrame:
    """
    Function to transform AWS Textract object to a pdf
    :param response: AWS Textract response object
    """
    # in the event multiple tables detected on one page (concat them)
    catDF = []
    
    # format the Textract response type 
    doc = trp.Document(response)
    
    # iterate through document pages
    for page in doc.pages:
        # itterate through page tables
        for table in page.tables: 
            # convert trp-table into dataframe object
            df = trp2df(table)
            
            # remove columns that are completly empty (column 0 = line items)
            empty_cols = [col for col in df.columns if (df[col] == '').all()]
            df = df.drop(empty_cols, axis=1)
            colIndex = df.columns[0]
            
            # check for the word "cash" in a string at the begining, ignoring case sensitivity
            assetCheck = df[colIndex].str.contains('^Cash', regex=True, flags=re.IGNORECASE)
            
            # check for the word "Liabilities" in a string at the end, ignoring case sensitivity
            debtCheck1 = df[colIndex].str.contains('Liabilities$|^Liabilities', 
                                                  regex=True, flags=re.IGNORECASE)
            debtCheck2 = df[colIndex].str.contains('Liability$|^Liability', 
                                                  regex=True, flags=re.IGNORECASE)
            
            # check if the key words have been found 
            check1 = df[assetCheck | debtCheck1 | debtCheck2].empty
            check2 = debtCheck1[debtCheck1 == True].empty
            check3 = debtCheck2[debtCheck2 == True].empty
            
            # if figure matched exactly, we assume this is balance sheet 
            if not check1:
                catDF.append(df)
        
                if check2 == False or check3 == False:  
                    return pd.concat(catDF)
                

## Extract Balance Sheet information

In [11]:
subsetFolder = np.array(session.list_s3_files(bucket, 'Input/SubSets/'))[1:]

In [12]:
subsetFolder

array(['Input/SubSets/1000147-02_subset.pdf',
       'Input/SubSets/1000147-03_subset.pdf',
       'Input/SubSets/1000147-04_subset.pdf', ...,
       'Input/SubSets/99947-17_subset.pdf',
       'Input/SubSets/99947-18_subset.pdf',
       'Input/SubSets/99947-19_subset.pdf'], dtype='<U35')

In [54]:
# script to perform OCR (using Textract) for X-17A-5 subsets
outFolder = 'Output/BalanceSheet/'
csvDirectory = np.array(session.list_s3_files(bucket, outFolder))
errorLog = []

# iterate through X-17A-5 subsets stored in s3
for key in subsetFolder[:100]:
    # baseFile name (CIK)-{Year}
    baseFile = key.split('/')[-1].split('_')[0]
    fileName = baseFile+'.csv'
    
    # if file is not found in directory 
    if outFolder+fileName not in csvDirectory:

        # temporary data frame object for balance sheet information
        res = runJob("ran-s3-systemic-risk", key)
        
        if res[0]['JobStatus'] != 'FAILED':
            tempDF = readPDF(res)
            print(tempDF)
            # checks for type of return 
            if type(tempDF) == pd.DataFrame:
                # writing data frame to .csv file
                tempDF.to_csv(fileName)

                # the folder used to export to
                xFolder = 'Output/BalanceSheet/'+fileName

                # save contents to AWS S3 bucket
                with open(fileName, 'rb') as data:
                    s3.put_object(Bucket=bucket, Key=xFolder, Body=data)

                # remove local file after it has been created
                os.remove(fileName)
                print('\tSaved {} file to s3 bucket'.format(baseFile+'.csv'))
            else:
                print('{} no Balance Sheet found'.format(baseFile))
                errorLog.append(baseFile)
        else:
            print('{} could not be parsed'.format(baseFile))
            errorLog.append(baseFile)
    else:
        print('{} has been downloaded'.format(fileName))

print('==========================\nOCR is completed')

1000147-02.csv has been downloaded
1000147-03.csv has been downloaded
1000147-04.csv has been downloaded
1000147-05.csv has been downloaded
1000147-06.csv has been downloaded
1000147-07.csv has been downloaded
1000147-08.csv has been downloaded
1000147-09.csv has been downloaded
1000147-10.csv has been downloaded
1000147-11.csv has been downloaded
1000148-02.csv has been downloaded
1000148-03.csv has been downloaded
1000148-04.csv has been downloaded
1000148-05.csv has been downloaded
1000148-06.csv has been downloaded
1000148-07.csv has been downloaded
1000148-08.csv has been downloaded
1000148-09.csv has been downloaded
1000148-10.csv has been downloaded
1000148-11.csv has been downloaded
1000148-12.csv has been downloaded
1000148-13.csv has been downloaded
1000148-14.csv has been downloaded
1000148-15.csv has been downloaded
Started job with id: 06524b8b94ef7ad72c1acb20cd9ed051009c32d44260cf484937465985191628
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Jo

Started job with id: 60437ad16ecc3886a9bcdeeb67b012e05e548a30d7fe867ba5858a2dd89b7953
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
                                                    0            1
0                                                Cash     $ 68,579
1                              Receivable from broker       17,270
2                                    Clearing deposit      262,220
3     Furniture, equipment and leasehold improvements             
4   (Net of accumulated depreciation and amortizat...      657,355
5                                    Notes Receivable    6,288,610
6                                        Other assets        8,232
7                                        TOTAL ASSETS  $ 7,302,266
8                     LIABILITIES AND MEMBERS' EQUITY             
9                                        Liabilities:             
10              Accounts payable and accrued expenses

Resultset page recieved: 1
Resultset page recieved: 2
                                                    0           1
0                                              Assets            
1                           Cash and cash equivalents    $500,353
2                                         CSC deposit      50,878
3                              Commissions receivable     394,523
4                                   Other receivables      63,907
5               Furniture and equipment, net (Note 2)      73,169
6                                        Other assets     193,771
7                                        Total assets  $1,276,601
8                Liabilities and Stockholder's Equity            
9                                 Commissions payable    $505,078
10              Accounts payable and accrued expenses      80,813
11                                  Total liabilities     585,891
12                               Stockholder's equity            
13       Common stock,

Started job with id: 8458ff47e212cc4020ff57c8c449fde4c8e9ca20b507ed10aa0bd2af9379f755
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
                                      0  1          2
0                                                    
1                     December 31, 2006              
2                                ASSETS              
3             Cash and cash equivalents  $  1,119,982
4                Commissions receivable       630,249
5   Due from registered representatives        91,269
6              Due from clearing broker       164,810
7            Deposit at clearing broker       315,972
8     Securities owned, at market value       205,103
9       Securities owned, at fair value        22,500
10                      D

Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page recieved: 4
                                                    0            1
0                                              ASSETS             
1                           Cash and cash equivalents  $ 1,439,247
2                              Commissions receivable      636,974
3                 Due from registered representatives      318,457
4                        Deposits at clearing brokers      355,784
5                           Due from clearing brokers      121,314
6                     Securities owned, at fair value    1,865,705
7                         Property and equipment, net      226,767
8                           Deposits and other asse

Started job with id: a3875ac5fe4f0bcb9974719754e17ccc4aeec824b6c426b77a02010dec913cad
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page recieved: 4
                                                    0            1
0                           Cash and cash equivalents  $ 3,715,509
1                              Commissions receivable      915,978
2            Due from registered representatives, net      234,539
3                        Deposits at clearing brokers      799,453
4                           Due from clearing brokers      585,796
5                                   Other receivables      140,666
6                     Securities owned, at fair value    1,612,594
7                         Proper

Started job with id: 31fa11e1aed4ab8aab57e4e3d6dde786941eb1d94ac9b36ca971f7f903a1aeaa
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
                                                    0          1
0                                              Assets           
1                                                Cash  $ 249,857
2          Money market funds held at clearing broker    328,262
3                               Due from an affiliate     12,760
4     Leasehold improvements, furniture and equipment           
5   (Net of accumulated depreciation and amortizat...     81,317
6                                            Deposits    131,021
7                                        Other assets     35,246
8                                        Total assets  $ 838,463
9                Liabilities and Stockholder's Equity           
10                                   Due to affiliate   $ 25,587
11        

Started job with id: 3c222e9473c5100f7f67f8b5189131d74802d73271cdf3221f9bea66f2a7d7d5
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
                                                    0            1
0                                              Assets             
1                           Cash and cash equivalents  $ 1,378,502
2                                     Fail to Deliver       39,151
3                                   Due from customer       10,097
4                Leasehold improvements and equipment             
5   (net of accumulated amortization and depreciat...      167,001
6                                    Security deposit       99,216
7                                  Deferred tax asset       31,700
8                                        Other assets       19,974
9                                        Total assets  $ 1,745,641
10               Liabilities and Stockholder's Equity

Started job with id: 95bd1c05b54edfcd94f8a7b43f4b00a5d8ac56bda17d1553391424bee921fc7f
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
                                                    0            1
0                                                                 
1                                              Assets             
2                                                Cash  $ 2,021,080
3                                     Fail to deliver      199,842
4     Leasehold improvements, furniture and equipment             
5   (net of accumulated amortization and depreciat...        1,881
6                                    Security deposit       99,216
7                                  Deferred tax asset       61,700
8                                        Ot

	Saved 1000316-20.csv file to s3 bucket
Started job with id: edfae6967521e4eeb84a744cdcba4ca156d37792d49068089ae6be19960060ea
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
                                                    0          1
0                           Cash and cash equivalents  $ 192,300
1                     Concessions and fees receivable     59,181
2           Non - marketable securities owned at cost     33,100
3                                        Total assets  $ 284,581
4                Liabilities and Stockholders' Equity           
5                                    Accrued expenses    $ 1,215
6   Illinois personal property replacement tax pay...     16,200
7                                   Total liabilities   $ 17,415
8                                Stockholders' Equity           
9   Common stock, at stated value, (1,000 shares a...      $ 100
10                         Addition

Started job with id: d86f1479871c74149ff750bb0a163c339ad429d2403a6a27a19841864a6dabbe
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
                                                    0          1
0                           Cash and cash equivalents  $ 156,526
1                     Concessions and fees receivable     16,751
2   Marketable securities owned, at cost (identifi...    197,960
3                                        Other assets     23,743
4                                        Total assets  $ 394,980
5                Liabilities and Stockholder's Equity          ,
6                                    Accrued expenses   $ 33,727
7   Illinois personal property replacement tax pay...      8,938
8   Illinois personal property replacement tax pay...      1,800
9                                   Total liabilities   $ 44,465
10                      

	Saved 1000317-13.csv file to s3 bucket
Started job with id: bcc41cd59357449266e9a7ad6e99bad19c2c60ce15493fc7024c1212c2d0a835
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
                                                    0         1
0                  Cash and cash equivalents [Note 1]  $ 58,658
1                        Accounts Receivable [Note 1]     2,891
2                                        Total assets  $ 61,549
3                LIABILITIES AND STOCKHOLDERS' EQUITY          
4                                         Liabilites:          
5                                 Current liabilities   $ 1,728
6                                   Total liabilities   $ 1,728
7                                Stockholders' equity          
8   Common stock - at par value, 1000 shares autho...     $ 100
9                   100 shares issued and outstanding      

	Saved 1000317-19.csv file to s3 bucket
Started job with id: fb99c1153a479f499551223fbc90e2bf0b51b1c5184bea61eb8015cf32fae351
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
                                                    0         1
0                                                Cash  $ 10,701
1                                 Accounts Receivable       156
2                                        Total Assets  $ 10,857
3                LIABILITIES AND STOCKHOLDERS' EQUITY          
4                                Current Liabilities:          
5                                    Accounts Payable   $ 3,000
6                           Total Current Liabilities     3,000
7                                Stockholders' Equity          
8   Common stock - no par value, 100 shares author...       100
9                   100 shares issued and outstanding      

Started job with id: f7cfc6b920438ff9205cf3ed34a50490dd1830042b1fcdcffb57b2eef555b45f
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
                                                    0            1
0                          Federal Income Tax Payable    $ 118,115
1                                     Accrued Expense       13,000
2                            Payable to Related Party      489,562
3                         State and Local Tax Payable       45,634
4                                   TOTAL LIABILITIES      666,311
5                               Shareholder's Equity:             
6   Common Stock $.01 Par Value, 3,000 Shares Auth...             
7                1,000 Shares, Issued and Outstanding           10
8                                     Paid in Capital       44,990
9                                   Retained Earnings      784,299
10

In [55]:
errorLog

['1000148-16', '1000151-03', '1000151-04', '1000151-12', '1000316-16']

In [53]:
res

[{'JobStatus': 'FAILED',
  'StatusMessage': 'INVALID_DOCUMENT_TYPE',
  'AnalyzeDocumentModelVersion': '1.0',
  'ResponseMetadata': {'RequestId': '4d0bd583-be68-4219-950a-bfe48c493b89',
   'HTTPStatusCode': 200,
   'HTTPHeaders': {'x-amzn-requestid': '4d0bd583-be68-4219-950a-bfe48c493b89',
    'content-type': 'application/x-amz-json-1.1',
    'content-length': '98',
    'date': 'Mon, 02 Nov 2020 18:59:07 GMT'},
   'RetryAttempts': 0}}]

# Helper Functions

In [15]:
def cleanNumeric(value:str) -> float:
    """
    This function converts a string to a numeric quantity, handles weird string format
    :param: value, string value with hidden numeric quanity  
    :return: floating point values
    
    Complexity -> O(n)
    
    e.g.
        In[0]: $ 19,225     ->   Out[0]: 19255
        In[0]: $ 19,225.76  ->   Out[0]: 19255.76
    """
    arr = ""
    
    # if length of sequence non-empty, iterate to convert string 
    if len(value) > 0:
        for elm in value:
            # retains only numeric related elements
            if (elm == '.') or isfloat(elm):
                arr += elm 
            else:
                # preserves the comma seperator purpose (if present)
                if elm != ',':
                    arr += '0'
                    
    # if sequence empty, simply return zero
    else: arr += '0'   
    
    return float(arr)

In [16]:
def isfloat(value) -> bool:
    """
    This function checks whether an input can be cast to a float
    """
    try:
        int(value)
        return True
    except ValueError: 
        return False

In [39]:
def testCase(res):
    doc = trp.Document(res)
    assetStrike = 0
    catDF = []

    # iterate through document pages
    for page in doc.pages:
        # itterate through page tables
        for table in page.tables: 
            # convert trp-table into dataframe object
            df = trp2df(table)
            
            # remove columns that are completly empty (column 0 = line items)
            empty_cols = [col for col in df.columns if (df[col] == '').all()]
            df = df.drop(empty_cols, axis=1)
            colIndex = df.columns[0]

            # check for the word "cash" in a string at the begining, ignoring case sensitivity
            assetCheck = df[colIndex].str.contains('^Cash', regex=True, flags=re.IGNORECASE)
            
            # check for the word "Liabilities" in a string at the end, ignoring case sensitivity
            debtCheck1 = df[colIndex].str.contains('Liabilities$|^Liabilities', 
                                                  regex=True, flags=re.IGNORECASE)
            debtCheck2 = df[colIndex].str.contains('Liability$|^Liability', 
                                                  regex=True, flags=re.IGNORECASE)
            
            # check if the key words have been found 
            check1 = df[assetCheck | debtCheck1 | debtCheck2].empty
            check2 = debtCheck1[debtCheck1 == True].empty
            check3 = debtCheck2[debtCheck2 == True].empty
            
            # if figure matched exactly, we assume this is balance sheet 
            if not check1:
                catDF.append(df)
        
                if check2 == False or check3 == False:  
                    return pd.concat(catDF)
                
#     return pd.concat(catDF)


In [46]:
# 'Input/SubSets/7470-04_subset.pdf' # natixis report, absolutley horrible 
# 'Input/SubSets/1002201-02_subset.pdf' # reads wrong number for total liabilites and member equity

restype = runJob("ran-s3-systemic-risk", 'Input/SubSets/1000147-02_subset.pdf')
df = testCase(restype)

Started job with id: 1858cd460048abc2c3afe48ffaeef8500265509753b68fe2015336cd72d66f09
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
