In [1]:
%%bash
pip install --upgrade pip
pip install smart_open minecart
pip install textract-trp

Collecting pip
  Using cached pip-20.2.3-py2.py3-none-any.whl (1.5 MB)
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 20.0.2
    Uninstalling pip-20.0.2:
      Successfully uninstalled pip-20.0.2
Successfully installed pip-20.2.3
Collecting smart_open
  Downloading smart_open-3.0.0.tar.gz (113 kB)
Collecting minecart
  Downloading minecart-0.3.0-py3-none-any.whl (23 kB)
Collecting pdfminer3k
  Downloading pdfminer3k-1.3.4-py3-none-any.whl (100 kB)
Building wheels for collected packages: smart-open
  Building wheel for smart-open (setup.py): started
  Building wheel for smart-open (setup.py): finished with status 'done'
  Created wheel for smart-open: filename=smart_open-3.0.0-py3-none-any.whl size=107097 sha256=bdbd0ac74bf867d8bcbbac66a75648a7d890ce0f14406a171bacd171c21ddf96
  Stored in directory: /home/ec2-user/.cache/pip/wheels/88/2a/d4/f2e9023989d4d4b3574f268657cb6cd23994665a038803f547
Successfully built smart-open
Installing coll

In [2]:
import boto3
from trp import Document, Page, BoundingBox
from PIL import Image, ImageDraw, ImageFont
from smart_open import open
from sagemaker.session import Session
import minecart
import pandas as pd
%matplotlib inline

In [38]:
import time 
import random 
import numpy as np

In [60]:
# initiate s3 bucket and corresponding data folder
bucket = "ran-s3-systemic-risk"
data_folder ="Input/X-17A-5/"
# balance_sheet_idx = 3  # the page of the report you care about

# Amazon Textract client and Sagemaker session
textract = boto3.client('textract')
session = Session()

# discover all of the pdfs that you want to parse
paths = np.array(session.list_s3_files(bucket, data_folder))
subset = 20
test_key = np.random.choice(paths, subset)

In [61]:
# random sample of files from s3
test_key

array(['Input/X-17A-5/710858-16.pdf', 'Input/X-17A-5/276721-18.pdf',
       'Input/X-17A-5/62011-12.pdf', 'Input/X-17A-5/822648-12.pdf',
       'Input/X-17A-5/56355-09.pdf', 'Input/X-17A-5/9318-11.pdf',
       'Input/X-17A-5/754651-06.pdf', 'Input/X-17A-5/700087-07.pdf',
       'Input/X-17A-5/12406-16.pdf', 'Input/X-17A-5/855716-05.pdf',
       'Input/X-17A-5/818043-04.pdf', 'Input/X-17A-5/215448-19.pdf',
       'Input/X-17A-5/200401-05.pdf', 'Input/X-17A-5/700078-12.pdf',
       'Input/X-17A-5/810121-08.pdf', 'Input/X-17A-5/700162-11.pdf',
       'Input/X-17A-5/780132-16.pdf', 'Input/X-17A-5/50825-18.pdf',
       'Input/X-17A-5/789994-03.pdf', 'Input/X-17A-5/68136-02.pdf'],
      dtype='<U27')

In [11]:
def textract_from_s3(bucket_name, key, textract_client=textract) -> Document:
    """
    Gets an analyzed document from textract from an initialized client
    :param bucket_name:
    :param key:
    :param textract_client:
    :return:
    """
    r = textract_client.analyze_document(
            Document={
                'S3Object': {
                    'Bucket': bucket_name,
                    'Name': key
                }
            },
            FeatureTypes=["TABLES"])

    return Document(r)

def get_image_from_pdf(target_pdf_path, page_num=0, image_num=0) -> Image:
    """
    Gets a single image from a pdf, given a page and image position
    :param target_pdf_path:
    :param page_num:
    :param image_num:
    :return:
    """
    with open(target_pdf_path, 'rb') as f:
        try:
            image = minecart.Document(f).get_page(page_num).images[image_num].as_pil()
            return image
        except UnicodeError or IOError as e:
            print("issue getting the image")


def draw_bbox(bbox: BoundingBox, img_width, img_height):
    """
    using ImageDraw, draw a red outline around a Cell's boundingbox
    :param bbox:
    :param img_width:
    :param img_height:
    :return:
    """
    x1 = bbox.left * img_width
    y1 = bbox.top * img_height-2
    x2 = x1 + (bbox.width * img_width) + 5
    y2 = y1 + (bbox.height * img_height) + 2

    draw = ImageDraw.Draw(img)
    draw.rectangle([x1, y1, x2, y2], fill=None, outline=(255, 0, 0, 100))
    draw.text((x1, y1), text, fill="blue", font=ImageFont.load_default())


def parse_balance_sheet(document_page:Page, columns=("YEAR1", "YEAR2")) -> pd.DataFrame:
    """
    This is your function for parsing a certain page of interest
    :param document_page:
    :param columns:
    :return:
    """
    rows = []  # list of lists [[cell1, cell2,...]]
    for table in document_page.tables:
        for r, row in enumerate(table.rows):
            assert len(row.cells) == columns, "The length of column names and the actual columns don't match"
            data = []
            for c, cell in enumerate(row.cells):
                print("Table[{}][{}] = {}".format(r, c, cell.text))
                data.append(cell.text)
                if c == 0:
                    item_name = cell.text
                    print(r, "FIRST CELL ", item_name)
            rows.append(data)

    df = pd.DataFrame(rows, columns=columns)

    return df


In [93]:
# content modified from Amazon AWS Textract repository (refer to URL below)
# https://github.com/aws-samples/amazon-textract-code-samples/blob/master/python/12-pdf-text.py

def startJob(s3BucketName:str, objectName:str) -> str:
    """
    Starts a Textract job on AWS server 
    """
    response = None
    client = boto3.client('textract')
    
    # issue response to AWS to start Textract job for table analysis 
    response = client.start_document_analysis(
        DocumentLocation={
            'S3Object': {
                'Bucket': s3BucketName,
                'Name': objectName
            }
        },
        FeatureTypes=['TABLES']
    )
    
    # return response job ID for service
    return response["JobId"]

def isJobComplete(jobId:str) -> str:
    """
    Tracks the completion status of the Textract job when qued
    """
    time.sleep(5)
    client = boto3.client('textract')
    response = client.get_document_analysis(JobId=jobId)
    status = response["JobStatus"]
    print("Job status: {}".format(status))
    
    # check current status of AWS job (ask server every 5 seconds for data)
    while(status == "IN_PROGRESS"):
        time.sleep(5)                   # lag before reporting status
        response = client.get_document_analysis(JobId=jobId)
        status = response["JobStatus"]
        print("Job status: {}".format(status))

    return status

def getJobResults(jobId:str) -> list:
    """
    Returns the contents of the Textract job, after completion status met
    """
    pages = []          # initialize list object to track pages

    client = boto3.client('textract')
    response = client.get_document_analysis(JobId=jobId)
    
    pages.append(response)
    print("Resultset page recieved: {}".format(len(pages)))
    
    nextToken = None
    if('NextToken' in response):
        nextToken = response['NextToken']
    
    # iterate through the pages and append to response figure
    while(nextToken):

        response = client.get_document_analysis(JobId=jobId, NextToken=nextToken)

        pages.append(response)
        print("Resultset page recieved: {}".format(len(pages)))
        nextToken = None
        if('NextToken' in response):
            nextToken = response['NextToken']

    return pages

In [94]:
test_key[:1]

array(['Input/X-17A-5/710858-16.pdf'], dtype='<U27')

In [95]:
from pprint import pprint

In [96]:
searchList = ["Total assets", "Total liabilities", "Total stockholder's equity"]

In [97]:
count = []

for key in test_key[:1]:
    # S3 storage for files on AWS site    
    jobId = startJob(bucket, key)
    print("Started job with id: {}".format(jobId))
    if(isJobComplete(jobId)):
        response = getJobResults(jobId)
                        

Started job with id: 1239d9c9255ac84def96beeab5a02a7bf7d032967eec2635a770563bda22bc91
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page recieved: 1
Resultset page recieved: 2
Resultset page recieved: 3
Resultset page recieved: 4
Resultset page recieved: 5
Resultset page recieved: 6


In [98]:
#  Print detected text from PDF files
# for resultPage in response:

#     if resultPage['JobStatus'] != 'FAILED':
#         for item in resultPage["Blocks"]:
#             if item["BlockType"] == "LINE":
#                 print(item['Text'])
#                 if item['Text'] in searchList:
#                     if item['Page'] > 10:
#                         print(key)
#                     count.append(item['Page'])

In [99]:
response[0]

{'DocumentMetadata': {'Pages': 23},
 'JobStatus': 'SUCCEEDED',
 'NextToken': 'uMsgQZEvZ9VnMpVRl+y/0FQOqWAOMJrKJk0lkFevD6pg8ni7VVRQDpWVPBuBTWkcSZ9AwtvaRo1HEt+LNEsRoRm/dNsrDpX6RQTFaXPV7IX9mDE7WQ==',
 'Blocks': [{'BlockType': 'PAGE',
   'Geometry': {'BoundingBox': {'Width': 1.0,
     'Height': 1.0,
     'Left': 0.0,
     'Top': 0.0},
    'Polygon': [{'X': 0.0, 'Y': 0.0},
     {'X': 1.0, 'Y': 0.0},
     {'X': 1.0, 'Y': 1.0},
     {'X': 0.0, 'Y': 1.0}]},
   'Id': '2812428c-19f3-4b35-b32f-3e85209e5159',
   'Relationships': [{'Type': 'CHILD',
     'Ids': ['8fba09da-a427-4810-8907-a2bcc4a149e7',
      '4f4f0dde-efd8-4e6b-bba7-767f7b6c0f7a',
      '0bcaf364-3922-442c-a8bd-0c1645365511',
      '1e10c2e0-fd7d-404b-b342-34aad25685dd',
      '9000e9f1-05bc-4db4-927f-71cff9cf167b',
      'e041e794-b4c0-48e9-b253-d8304779e3e2',
      'ffb88359-778a-4f91-a450-dbe1cd55000e',
      'ca644b50-c590-4c46-bf71-19ff57be57cc',
      '442cb1ba-b0a1-4f0e-9254-172f3fa55608',
      '99971ad8-cd81-4684-848e-36b00e

In [65]:
# sub-sample of files finding avg page number (50 / 3300 ~ 1.5%)
# BARING OUTLIER IT APPEARS THAT THE OPTIMAL CEILING IS 10
# mean = np.mean(np.array(count))
# std = np.std(np.array(count))
# print('Avg Balance sheet info from subset of {} files is {}+{}'.format(subset, round(mean, 2),
#                                                                        round(std/np.sqrt(len(count)))))

In [41]:
# Response type organized by dictionary format, major keys of focus for BlockType
# Text data for output from Textract
# Confidence for probability of correctness
# Page for the page number being read from the pdf

In [83]:
# Call Amazon Textract - you can call this from a function,
# docs are here: https://docs.aws.amazon.com/textract/latest/dg/what-is.html
response = textract.analyze_document(
    Document={
        'S3Object': {
            'Bucket': bucket,
            'Name': test_key[0]
        }
    },
    FeatureTypes=["TABLES"])

doc = Document(response)
print(doc)
# then do something with the response

UnsupportedDocumentException: An error occurred (UnsupportedDocumentException) when calling the AnalyzeDocument operation: Request has unsupported document format

In [None]:
# Work in Progress, get an image.
# Minecart doesn't seem to work on this kernel, so commenting it out
# test_path = f's3://{bucket}/{test_key}'
# img = get_image_from_pdf(test_path, page_num=balance_sheet_idx, image_num=0)
# width, height = img.size
# print(width, height)
# # save a copy for later
# img.save(f'{test_key}-{balance_sheet_idx}.jpg')

In [17]:
# this is how to iterate over the Document
# if you have an image open, you can draw the bounding boxes over it

for i, page in enumerate(doc.pages):
    # optional: filter for a page you care about
    if i != balance_sheet_idx:
        continue

    # You could pass this page into a function if you know how to parse it
    # df = parse_balance_sheet(page, columns=["mycolumn-one", "mycolumn-two"])

    # Print tables' cells
    for table in page.tables:
        for r, row in enumerate(table.rows):
            print(row)
            for c, cell in enumerate(row.cells):
                print("Table[{}][{}] = {}".format(r, c, cell.text))
                text = cell.text
                # do something with the text

                # optionally, draw a bounding box around a cell
                # geo: Geometry = cell.geometry
                # bbox: BoundingBox = geo.boundingBox
                # draw_bbox(bbox, img_width=width, img_height=height)

# save the newly annotated image
# img.save("bboxed-test_image.jpg")

[][AS OF ][MARCH 31, ]
[][2006 ][2005 ]
[ASSETS: ][][]
[Cash and cash equivalents ][$ 14,652 ][$ 19,543 ]
[Commissions receivable ][15,124 ][20,749 ]
[TOTAL ASSETS ][$ 29,776 ][S$ 40,292 ]
[LIABILITIES AND STOCKHOLDER'S ][EQUITY ][]
[LIABILITIES: ][][]
[Commissions payable ][$ 12,099 ][$ 16,599 ]
[Total liabilities ][12,099 ][16.599 ]
[STOCKHOLDER'S EQUITY: ][][]
[Common stock, $.01 par value; 1,000 shares authorized, 1,000 shares issued and outstanding ][10 ][10 ]
[Additional paid-in capital ][9,990 ][9,990 ]
[Retained earnings ][7,677 ][13.693 ]
[Total stockholder's equity ][17,677 ][23.693 ]
[TOTAL LIABILITIES AND STOCKHOLDER'S EQUITY ][$ 29,776 ][S 40,292 ]


In [None]:
# Call Amazon Textract on a set of paths and do something with it
for path in paths:

    doc = textract_from_s3(bucket, path)
    # parse page table into csv, if we have a template for it, otherwise print the table
    for i, page in enumerate(doc.pages):
        if i == balance_sheet_idx:
            df = parse_balance_sheet(page)
            df.to_csv(f"{path}-{i}.csv")
        else:
            print("skipping page", i)

