# Amazon Textract Workshop


In [None]:
# Show the current working directory; the sample documents below are opened by relative file name
%pwd

In [None]:
# Install (or upgrade) the Textract helper packages from inside the notebook
!python -m pip install amazon-textract-caller --upgrade
!python -m pip install amazon-textract-response-parser --upgrade

In [None]:
import boto3
from IPython.display import Image, display
from trp import Document
from PIL import Image as PImage, ImageDraw
import time
from IPython.display import IFrame

In [None]:
# Amazon Textract client shared by the cells below; no region or credentials
# are passed explicitly, so boto3 resolves them from its default configuration
textract = boto3.client('textract')

# 1. Detect text from local image

In [None]:
# Document: file name of the sample image used in this section
documentName = "simple-document-image.jpeg"

In [None]:
# Render the selected image inline in the notebook
display(Image(filename=documentName))

In [None]:
# Load the raw bytes of the image selected above.
with open(documentName, 'rb') as image_file:
    imageBytes = bytearray(image_file.read())

# Send the bytes inline to Textract's text detection.
response = textract.detect_document_text(Document={'Bytes': imageBytes})

# Print the text of each LINE block; every other block type is skipped.
for block in response["Blocks"]:
    if block["BlockType"] != "LINE":
        continue
    print(block["Text"])

## Lines and Words of Text - JSON Structure

In [None]:
# Single-line sample: show the image, then run text detection on it.
documentName = "OneLine.png"
display(Image(filename=documentName))

with open(documentName, 'rb') as image_file:
    imageBytes = bytearray(image_file.read())

response = textract.detect_document_text(Document={'Bytes': imageBytes})

# Print the text of each LINE block, in the order the response lists them.
line_texts = (
    block["Text"]
    for block in response["Blocks"]
    if block["BlockType"] == "LINE"
)
for text in line_texts:
    print(text)

In [None]:
# Dump the complete response of the previous Textract call for inspection
print("JSON Response\n===================")
display(response)

# 2. Reading order

In [None]:
# Document: two-column sample image used for the reading-order example
documentName = "two-column-image.jpeg"

In [None]:
# Render the selected image inline in the notebook
display(Image(filename=documentName))

In [None]:
# Read document content into memory; the bytes are sent inline to Textract in the next cell
with open(documentName, 'rb') as document:
    imageBytes = bytearray(document.read())

In [None]:
# Call Amazon Textract
response = textract.detect_document_text(Document={'Bytes': imageBytes})

# Detect columns and print lines.
# Each LINE block is assigned to the first known column it horizontally
# overlaps; a line that overlaps none of them starts a new column.  Printing
# the lines grouped by column index then gives column-by-column reading order.
columns = []
lines = []
for item in response["Blocks"]:
    if item["BlockType"] != "LINE":
        continue

    # The line's horizontal extent does not depend on the candidate column,
    # so it is computed once per line rather than once per column.
    bbox = item["Geometry"]["BoundingBox"]
    bbox_left = bbox["Left"]
    bbox_right = bbox_left + bbox["Width"]
    bbox_centre = bbox_left + bbox["Width"] / 2

    column_found = False
    for index, column in enumerate(columns):
        # True midpoint of the column.  Note the parentheses:
        # `left + right/2` would evaluate to left plus half of the right
        # edge, which is not the centre of the column.
        column_centre = (column['left'] + column['right']) / 2

        # Overlap test: the line's centre lies inside the column, or the
        # column's centre lies inside the line.
        if (column['left'] < bbox_centre < column['right']) or (bbox_left < column_centre < bbox_right):
            lines.append([index, item["Text"]])
            column_found = True
            break

    if not column_found:
        # A column's extent is fixed by the first line that creates it.
        columns.append({'left': bbox_left, 'right': bbox_right})
        lines.append([len(columns) - 1, item["Text"]])

# list.sort is stable, so within a column the lines keep the order in which
# the response listed them.
lines.sort(key=lambda x: x[0])
for line in lines:
    print(line[1])

# 3. Forms: Key/Values

In [None]:
# Document: employment application image used for the forms example
documentName = "employmentapp.png"

In [None]:
# Render the selected image inline in the notebook
display(Image(filename=documentName))

# Read document content into memory; the bytes are sent inline to Textract in the next cell
with open(documentName, 'rb') as document:
    imageBytes = bytearray(document.read())

In [None]:
# Run form (key/value) analysis on the image bytes read above.
response = textract.analyze_document(
    Document={'Bytes': imageBytes},
    FeatureTypes=["FORMS"])

#print(response)

# Wrap the raw response in the response parser's Document model.
doc = Document(response)

for page in doc.pages:
    form = page.form

    # Every key/value pair found on the page.
    print("Fields:")
    for field in form.fields:
        print(f"Key: {field.key}, Value: {field.value}")

    # Look up a single field by its key; nothing is printed when the lookup
    # returns a falsy result.
    print("\nGet Field by Key:")
    key = "Phone Number:"
    field = form.getFieldByKey(key)
    if field:
        print(f"Key: {field.key}, Value: {field.value}")

    # Search the page's fields using "address" as the key to search for.
    print("\nSearch Fields:")
    key = "address"
    for field in form.searchFieldsByKey(key):
        print(f"Key: {field.key}, Value: {field.value}")

## Form Data (Key-Value Pairs) JSON Structure

In [None]:
# Single key/value sample: show the image, then run form analysis on it.
documentName = "OneKeyValue.png"
display(Image(filename=documentName))

with open(documentName, 'rb') as image_file:
    imageBytes = bytearray(image_file.read())

response = textract.analyze_document(Document={'Bytes': imageBytes}, FeatureTypes=["FORMS"])

# Parse the raw response with the response parser's Document model.
doc = Document(response)

# One "Fields:" heading per page, followed by that page's key/value pairs.
for page in doc.pages:
    print("Fields:")
    for field in page.form.fields:
        print(f"Key: {field.key}, Value: {field.value}")

In [None]:
# Dump the complete response of the previous Textract call for inspection
print("JSON Response\n===================")
display(response)

# 4. Control and Compliance - Redaction

In [None]:
# Document: employment application image whose address values get redacted below
documentName = "employmentapp.png"

In [None]:
# Render the unredacted image inline for comparison
display(Image(filename=documentName))

In [None]:
# Read the image bytes that are sent to Textract.
with open(documentName, 'rb') as image_file:
    imageBytes = bytearray(image_file.read())

# Call Amazon Textract with form (key/value) analysis.
response = textract.analyze_document(Document={'Bytes': imageBytes}, FeatureTypes=["FORMS"])

#print(response)

doc = Document(response)

# Redact document: open the same file with PIL so boxes can be drawn on it.
img = PImage.open(documentName)
width, height = img.size

# Only the first page is examined.
if doc.pages:
    page = doc.pages[0]
    for field in page.form.fields:
        # Redact only fields that have both a key and a value and whose key
        # text contains "address" (case-insensitive).
        if not (field.key and field.value and "address" in field.key.text.lower()):
            continue

        print(f"Redacting => Key: {field.key.text}, Value: {field.value.text}")

        # The bounding box is multiplied by the image size to get pixel
        # coordinates, then padded by a few pixels.
        box = field.value.geometry.boundingBox
        x1 = box.left * width
        y1 = box.top * height - 2
        x2 = x1 + (box.width * width) + 5
        # NOTE(review): y1 already includes the -2 offset, so the +2 here puts
        # y2 exactly on the box's bottom edge (no padding below) - confirm
        # this is intended.
        y2 = y1 + (box.height * height) + 2

        # Cover the value with a filled black rectangle.
        ImageDraw.Draw(img).rectangle([x1, y1, x2, y2], fill="Black")

# Save the (possibly unchanged) image under a "redacted-" prefix and show it.
outputDocumentName = f"redacted-{documentName}"
img.save(outputDocumentName)
display(Image(filename=outputDocumentName))

# 5. Tables

In [None]:
# Document: employment application image used for the tables example
documentName = "employmentapp.png"

In [None]:
# Render the selected image inline in the notebook
display(Image(filename=documentName))

# Read document content into memory; the bytes are sent inline to Textract in the next cell
with open(documentName, 'rb') as document:
    imageBytes = bytearray(document.read())

In [None]:
# Run table analysis on the image bytes read above.
response = textract.analyze_document(
    Document={'Bytes': imageBytes},
    FeatureTypes=["TABLES"])

#print(response)

# Wrap the raw response in the response parser's Document model.
doc = Document(response)

# Print every cell of every table on every page, labelled with its
# zero-based row and column index within the table.
for page in doc.pages:
    for table in page.tables:
        for row_index, row in enumerate(table.rows):
            for col_index, cell in enumerate(row.cells):
                print(f"Table[{row_index}][{col_index}] = {cell.text}")

# 6. Table Data Validation

In [None]:
# Document: expense report image used for the table validation example
documentName = "expense.png"

In [None]:
# Render the selected image inline in the notebook
display(Image(filename=documentName))

# Read document content into memory; the bytes are sent inline to Textract in the next cell
with open(documentName, 'rb') as document:
    imageBytes = bytearray(document.read())

In [None]:
# Call Amazon Textract, requesting table analysis only
response = textract.analyze_document(
    Document={'Bytes': imageBytes},
    FeatureTypes=["TABLES"])

#print(response)

# Wrap the raw response in the response parser's Document model so pages, tables and cells can be iterated
doc = Document(response)

def isFloat(input):
    """Return True if ``input`` can be converted with ``float()``, else False.

    Args:
        input: Candidate value, e.g. the text of a table cell.  The name
            shadows the ``input`` builtin; it is kept unchanged so callers
            passing it by keyword keep working.

    Returns:
        bool: True when ``float(input)`` succeeds.  False when the conversion
        raises ValueError (non-numeric text such as "abc" or ""), TypeError
        (unsupported types such as None) or OverflowError (an integer too
        large to represent as a float).
    """
    try:
        float(input)
    except (TypeError, ValueError, OverflowError):
        # Any failed conversion means "not a float" rather than a crash.
        return False
    return True

# Print every table cell and collect a review message for each row whose
# value in column index 4 parses as a number greater than 1000.
review_messages = []
for page in doc.pages:
    for table in page.tables:
        for r, row in enumerate(table.rows):
            # Text of the row's first cell; used to label the row in a message.
            itemName = ""
            for c, cell in enumerate(row.cells):
                print("Table[{}][{}] = {}".format(r, c, cell.text))
                if c == 0:
                    itemName = cell.text
                elif c == 4 and isFloat(cell.text):
                    if float(cell.text) > 1000:
                        review_messages.append("{} is greater than $1000.".format(itemName))

# One message per line, so several flagged rows do not run together.
warning = "\n".join(review_messages)
if warning:
    print("\nReview needed:\n====================\n" + warning)

# 7. Invoices and Receipts processing

In [None]:
# Document: expense report image used for the invoice/receipt example
documentName = "expense.png"

In [None]:
# Render the selected image inline in the notebook
display(Image(filename=documentName))

# Read document content into memory; the bytes are sent inline to Textract in the next cell
with open(documentName, 'rb') as document:
    imageBytes = bytearray(document.read())

In [None]:
# Call Amazon Textract's expense analysis on the in-memory image bytes
response = textract.analyze_expense(Document={'Bytes': imageBytes})

In [None]:

def _expense_field_to_pair(field):
    """Map one expense field to a dict with at most one label -> value entry.

    The detected label text is used as the key when the field has a
    "LabelDetection"; otherwise the field's "Type" text is used.  A field
    that has a label but no "ValueDetection" produces an empty dict.
    """
    pair = {}
    if "LabelDetection" not in field:
        pair[field["Type"]["Text"]] = field["ValueDetection"]["Text"]
    elif "ValueDetection" in field:
        pair[field["LabelDetection"]["Text"]] = field["ValueDetection"]["Text"]
    return pair


summary_entities_values = []
summary_fields = []
expense_item = []

for expense_doc in response["ExpenseDocuments"]:
    # Document-level summary fields.
    summary_entities_values.extend(
        _expense_field_to_pair(field) for field in expense_doc["SummaryFields"]
    )

    # Fields of every line item in every line-item group, flattened into one list.
    for line_item_group in expense_doc["LineItemGroups"]:
        for line_items in line_item_group["LineItems"]:
            expense_item.extend(
                _expense_field_to_pair(field)
                for field in line_items["LineItemExpenseFields"]
            )

# One dict per line under each heading.
print("Summary Items:\n")
print(*summary_entities_values, sep='\n')
print("\nExpense Items:\n")
print(*expense_item, sep='\n')

# 8. Textract PrettyPrinter

In [None]:
# New image: patient intake form sample used by the pretty-printer cells below
imageName="patient_intake_form_sample.jpeg"

# Display the image. Image is imported again here although the import cell at
# the top of the notebook already provides it; the bare expression on the last
# line is what the notebook renders as the cell's output.
from IPython.display import Image
Image(imageName)

In [None]:
# Install the pretty-printer package imported by the next cell
!python -m pip install amazon-textract-prettyprinter

In [None]:
# Format Textract output and print it, with tables rendered in CSV format
from textractprettyprinter.t_pretty_print import Pretty_Print_Table_Format, Textract_Pretty_Print, get_string, get_tables_string
from textractcaller.t_call import Textract_Features, Textract_Types, call_textract

# Analyse the image requesting both the FORMS and TABLES features in one call
textract_json = call_textract(input_document= imageName, features=[Textract_Features.FORMS, Textract_Features.TABLES])
# Build a single string containing the forms and tables output and print it
print(get_string(textract_json=textract_json,
               table_format=Pretty_Print_Table_Format.csv,
               output_type=[Textract_Pretty_Print.FORMS, Textract_Pretty_Print.TABLES]))

In [None]:
# Call Textract again on the same image, this time requesting only the FORMS feature
j = call_textract(input_document=imageName, features=[Textract_Features.FORMS])

# Print the key/value pairs to identify the ones that have the same name.
from textractprettyprinter.t_pretty_print import get_forms_string
print(get_forms_string(j))