In [1]:
import pandas as pd

import boto3
import io
from PIL import Image, ImageDraw
import json

# Echo the value of every top-level expression in a cell, not just the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Lift pandas' display limits (None removes the cap) so frames are shown
# without truncating columns, rows, or cell contents
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [2]:
def draw_bounding_box(key, val, width, height, draw):
    """Outline one geometry entry on the image being annotated.

    Nothing is drawn unless ``key`` contains the text "Geometry".

    Args:
        key: Name of the entry being inspected.
        val: Mapping with a "BoundingBox" entry holding 'Left', 'Top',
            'Width' and 'Height'; each is multiplied by the image size below.
        width: Image width used to scale the horizontal coordinates.
        height: Image height used to scale the vertical coordinates.
        draw: Object exposing ``rectangle(xy, outline=...)``.
    """
    # Guard clause: anything that is not a geometry entry is ignored
    if "Geometry" not in key:
        return

    bbox = val["BoundingBox"]

    # Scale the box's origin and extent by the image dimensions
    x0 = width * bbox['Left']
    y0 = height * bbox['Top']
    x1 = x0 + (width * bbox['Width'])
    y1 = y0 + (height * bbox['Height'])

    draw.rectangle([x0, y0, x1, y1], outline='black')

In [3]:
def print_labels_and_values(field):
    """Print the label and value detections of one expense field.

    For each of "LabelDetection" and "ValueDetection", prints the entry's
    confidence and text when the key is present in ``field``, otherwise
    prints a fallback message. The label line is always printed first.

    Args:
        field: Mapping that may contain "LabelDetection" and/or
            "ValueDetection", each with "Confidence" and "Text" entries.
    """

    def _report(section, heading, fallback):
        # A missing section is reported rather than treated as an error
        if section not in field:
            print(fallback)
            return
        detection = field.get(section)
        confidence = str(detection["Confidence"])
        text = str(detection["Text"])
        print(heading.format(confidence) + ", " + "Summary Values: {}".format(text))

    _report("LabelDetection",
            "Summary Label Detection - Confidence: {}",
            "Label Detection - No labels returned.")
    _report("ValueDetection",
            "Summary Value Detection - Confidence: {}",
            "Value Detection - No values returned")

In [4]:
def process_text_detection(bucket, document, region_name="us-east-2",
                           output_dir="../data/out/"):
    """Analyse an expense image stored in S3 and save an annotated copy.

    The image is downloaded from S3, the same S3 object is passed to
    Textract's ``analyze_expense``, every line-item and summary field is
    printed via ``print_labels_and_values``, and bounding boxes are drawn
    for line-item values and summary labels before the image is written to
    ``output_dir + document``.

    Args:
        bucket: Name of the S3 bucket holding the document.
        document: S3 object key of the document; also used as the output
            file name.
        region_name: Region for the Textract client.
            NOTE(review): presumably this must match the bucket's region -
            confirm against the Textract documentation.
        output_dir: Prefix prepended to ``document`` to form the output
            path. The directory is not created here.

    Returns:
        The raw response returned by ``analyze_expense``.
    """
    # Download the image so the boxes can be drawn on a local copy
    s3_connection = boto3.resource('s3')
    s3_response = s3_connection.Object(bucket, document).get()
    image = Image.open(io.BytesIO(s3_response['Body'].read()))

    # Textract reads the document straight from S3, not from the local copy
    client = boto3.client('textract', region_name=region_name)
    response = client.analyze_expense(
        Document={'S3Object': {'Bucket': bucket, 'Name': document}})

    # Image size is needed to scale the bounding boxes before drawing
    width, height = image.size
    draw = ImageDraw.Draw(image)

    for expense_doc in response["ExpenseDocuments"]:
        # Flatten groups -> line items -> fields once; the list is used for
        # both printing and drawing
        line_item_fields = [
            expense_field
            for line_item_group in expense_doc["LineItemGroups"]
            for line_item in line_item_group["LineItems"]
            for expense_field in line_item["LineItemExpenseFields"]
        ]

        for expense_field in line_item_fields:
            print_labels_and_values(expense_field)
            print()

        print("Summary:")
        for summary_field in expense_doc["SummaryFields"]:
            print_labels_and_values(summary_field)
            print()

        # Box the value of every line-item field. A field may come back
        # without a ValueDetection, so fall back to an empty mapping instead
        # of raising KeyError. draw_bounding_box itself skips non-geometry keys.
        for expense_field in line_item_fields:
            for key, val in expense_field.get("ValueDetection", {}).items():
                draw_bounding_box(key, val, width, height, draw)

        # Box the label of every summary field that has one
        for summary_field in expense_doc["SummaryFields"]:
            for key, val in summary_field.get("LabelDetection", {}).items():
                draw_bounding_box(key, val, width, height, draw)

    # Persist the annotated image (it is saved, not displayed)
    image.save(output_dir + document)

    return response

In [5]:
# S3 bucket and object key of the sample invoice image passed to
# process_text_detection
bucket = 'aws-textract-test-invoice'
document = 'Sample Document for Amazon Textract 1.jpg'

In [6]:
# Run the expense analysis; the raw analyze_expense response is kept in
# `stuff` because the exploratory cells below read from it
stuff = process_text_detection(bucket, document)

Summary Label Detection - Confidence: 98.62287902832031, Summary Values: QUANTITY
Summary Value Detection - Confidence: 99.665283203125, Summary Values: 3

Summary Label Detection - Confidence: 98.5995101928711, Summary Values: DESCRIPTION
Summary Value Detection - Confidence: 99.5957260131836, Summary Values: Testing Description 1

Summary Label Detection - Confidence: 98.58100128173828, Summary Values: UNIT PRICE
Summary Value Detection - Confidence: 99.74369812011719, Summary Values: $3,754.78

Summary Label Detection - Confidence: 98.4854507446289, Summary Values: LINE TOTAL
Summary Value Detection - Confidence: 99.70228576660156, Summary Values: $11,264.34

Label Detection - No labels returned.
Summary Value Detection - Confidence: 99.64972686767578, Summary Values: 3 Testing Description 1 $3,754.78 $11,264.34

Summary Label Detection - Confidence: 98.62287902832031, Summary Values: QUANTITY
Summary Value Detection - Confidence: 99.73505401611328, Summary Values: 1

Summary Label 

In [7]:
# Re-print every detected field from the saved response without calling
# Textract again (same traversal as inside process_text_detection)
for expense_doc in stuff["ExpenseDocuments"]:
    # Line items: groups -> line items -> individual expense fields
    for line_item_group in expense_doc["LineItemGroups"]:
        for line_items in line_item_group["LineItems"]:
            for expense_fields in line_items["LineItemExpenseFields"]:
                print_labels_and_values(expense_fields)
                print()

    # Document-level summary fields
    print("Summary:")
    for summary_field in expense_doc["SummaryFields"]:
        print_labels_and_values(summary_field)
        print()

Summary Label Detection - Confidence: 98.62287902832031, Summary Values: QUANTITY
Summary Value Detection - Confidence: 99.665283203125, Summary Values: 3

Summary Label Detection - Confidence: 98.5995101928711, Summary Values: DESCRIPTION
Summary Value Detection - Confidence: 99.5957260131836, Summary Values: Testing Description 1

Summary Label Detection - Confidence: 98.58100128173828, Summary Values: UNIT PRICE
Summary Value Detection - Confidence: 99.74369812011719, Summary Values: $3,754.78

Summary Label Detection - Confidence: 98.4854507446289, Summary Values: LINE TOTAL
Summary Value Detection - Confidence: 99.70228576660156, Summary Values: $11,264.34

Label Detection - No labels returned.
Summary Value Detection - Confidence: 99.64972686767578, Summary Values: 3 Testing Description 1 $3,754.78 $11,264.34

Summary Label Detection - Confidence: 98.62287902832031, Summary Values: QUANTITY
Summary Value Detection - Confidence: 99.73505401611328, Summary Values: 1

Summary Label 

In [29]:
# Confidence stored under 'Type' for the first summary field of the first
# expense document
stuff['ExpenseDocuments'][0]['SummaryFields'][0]['Type']['Confidence']

81.0

In [57]:
# Summary fields whose label confidence is below this value are listed for
# manual review
LOW_CONFIDENCE_THRESHOLD = 90

for expense_doc in stuff["ExpenseDocuments"]:
    for summary_field in expense_doc["SummaryFields"]:
        label = summary_field.get("LabelDetection")
        if label is None:
            # The filter is based on the label's confidence, so a field
            # without a label cannot be ranked; skip it instead of failing
            # on None["Confidence"]
            continue

        confidence = label["Confidence"]
        if confidence >= LOW_CONFIDENCE_THRESHOLD:
            continue

        print("{}:".format(label["Text"]))

        value = summary_field.get("ValueDetection")
        if value is not None:
            # The number shown next to the value is the LABEL's confidence,
            # i.e. the figure the filter above was applied to
            print("{} - {}".format(value["Text"], confidence))
        else:
            print("Value Detection - No values returned")
        print()

JOB:
#00435 - 80.94031524658203

SALESPERSON:
Jacob Johnson - 68.52052307128906

Tech Skills (dot) Ninja:
12395 Skye Park
San Antonio TX 78243
(210) 123-4567 - 39.85511779785156

