In [1]:
#Detects text in a document stored in an S3 bucket. Display polygon box around text and angled text 
import boto3
import io
from io import BytesIO
import sys
import os

# import psutil
import time

import math
from PIL import Image, ImageDraw, ImageFont


# Displays information about a block returned by text detection and text analysis
def DisplayBlockInformation(block):
    print('Id: {}'.format(block['Id']))
    if 'Text' in block:
        print('    Detected: ' + block['Text'])
    print('    Type: ' + block['BlockType'])
   
    if 'Confidence' in block:
        print('    Confidence: ' + "{:.2f}".format(block['Confidence']) + "%")

    if block['BlockType'] == 'CELL':
        print("    Cell information")
        print("        Column:" + str(block['ColumnIndex']))
        print("        Row:" + str(block['RowIndex']))
        print("        Column Span:" + str(block['ColumnSpan']))
        print("        RowSpan:" + str(block['ColumnSpan']))    
    
    if 'Relationships' in block:
        print('    Relationships: {}'.format(block['Relationships']))
    print('    Geometry: ')
    print('        Bounding Box: {}'.format(block['Geometry']['BoundingBox']))
    print('        Polygon: {}'.format(block['Geometry']['Polygon']))
    
    if block['BlockType'] == "KEY_VALUE_SET":
        print ('    Entity Type: ' + block['EntityTypes'][0])
    if 'Page' in block:
        print('Page: ' + block['Page'])
    print()

def process_text_detection(image_path):
    image = Image.open(image_path)
    # Detect text in the document
    #process using image bytes                      
    #image_binary = stream.getvalue()
    #response = client.detect_document_text(Document={'Bytes': image_binary})
    
    image_bytes = None
    with open(image_path, 'rb') as document:
        image_bytes = bytearray(document.read())
    textract = boto3.client('textract', region_name='us-east-2')
    response = textract.analyze_document(Document={'Bytes': image_bytes}, FeatureTypes=["FORMS"])

    #Get the text blocks
    blocks=response['Blocks']
    width, height =image.size  
    draw = ImageDraw.Draw(image)  
    print ('Detected Document Text')
   
    # Create image showing bounding box/polygon the detected lines/text
    for block in blocks:
            print('Type: ' + block['BlockType'])
            if block['BlockType'] != 'PAGE':
                print('Detected: ' + block['Text'])
                print('Confidence: ' + "{:.2f}".format(block['Confidence']) + "%")

            print('Id: {}'.format(block['Id']))
            if 'Relationships' in block:
                print('Relationships: {}'.format(block['Relationships']))
            print('Bounding Box: {}'.format(block['Geometry']['BoundingBox']))
            print('Polygon: {}'.format(block['Geometry']['Polygon']))
            print()
            draw=ImageDraw.Draw(image)
            # Draw WORD - Green -  start of word, red - end of word
            if block['BlockType'] == "WORD":
                draw.line([(width * block['Geometry']['Polygon'][0]['X'],
                height * block['Geometry']['Polygon'][0]['Y']),
                (width * block['Geometry']['Polygon'][3]['X'],
                height * block['Geometry']['Polygon'][3]['Y'])],fill='green',
                width=2)
            
                draw.line([(width * block['Geometry']['Polygon'][1]['X'],
                height * block['Geometry']['Polygon'][1]['Y']),
                (width * block['Geometry']['Polygon'][2]['X'],
                height * block['Geometry']['Polygon'][2]['Y'])],
                fill='red',
                width=2)    

                 
            # Draw box around entire LINE  
            if block['BlockType'] == "LINE":
                points=[]

                for polygon in block['Geometry']['Polygon']:
                    points.append((width * polygon['X'], height * polygon['Y']))

                draw.polygon((points), outline='black')    
  
                # Uncomment to draw bounding box
                #box=block['Geometry']['BoundingBox']                    
                #left = width * box['Left']
                #top = height * box['Top']           
                #draw.rectangle([left,top, left + (width * box['Width']), top +(height * box['Height'])],outline='black') 


    # Display the image
    image.show()
    # display image for 10 seconds

    
    return len(blocks)


process_text_detection("../OneDrive_1_11-6-2019/iCard_021873_1_Daba_Ayehush_H.jpg")



Detected Document Text
Type: PAGE
Id: 5525e70a-74c2-4b66-8ceb-e16be10acc63
Relationships: [{'Type': 'CHILD', 'Ids': ['2d61e814-622a-498a-b77f-ed00fac2d9d1', 'b9b3a44d-8850-47a9-a894-c9d2e857e469', '9f5032f2-1ae0-4091-9d74-9bfef1e124b8', 'b3c4e553-e0c2-4032-880f-71e41941618a', '5a33df1b-3e9a-4b9f-af74-8b0d92e87a47', 'fcccf547-8901-4ea6-a2c1-afe76d633e37', 'a1497f00-0700-4670-9807-de9abe1044a8', '68450444-8fc4-4c8c-ba73-d65872c94d32', 'd0eb7e94-6d90-4128-b74a-2c6c3eaf32bc', '127250ec-a8e1-47d5-a67b-40bfc41f0596', '45e0c17f-169d-4ea8-99b1-b991c6b7d739', '0bfca6e4-1190-48cc-9166-982cb072dd34', '5c99de78-35c9-4c61-ba03-ce4c8875193c', '3527025b-825c-4237-bd66-99adaf2263de', '10611446-54a7-4a32-89b3-706fb4495c15', '83e9c347-59da-4831-9240-d31975b72350', 'c079503f-6971-4493-a65f-ba04c8a0adf5', '691e43f0-4e37-4616-9b1c-bb442c3273f9', 'a0ffe59d-e5a0-4ce2-8e30-33de0214b114', 'd0bc0b6c-239f-4b90-85a4-4463f27b3bd9', '03cf6ad7-e36e-4d6d-a4b4-1d3f38a5998c', '64554d69-f0d5-4f71-ab23-7be10638b56b', '76

KeyError: 'Text'

In [3]:
import boto3
import os
import pandas as pd

# Amazon Textract client
textract = boto3.client('textract')

#print(response)

all_text_detections = []
files = os.listdir("../OneDrive_1_11-6-2019")

i = 1
for filename in files:
    print(filename)
    if not os.path.isfile(f"../OneDrive_1_11-6-2019/{filename}"):
        continue
        
    if not filename.lower().endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif')):
        continue
    with open(f"../OneDrive_1_11-6-2019/{filename}", 'rb') as document:
        imageBytes = bytearray(document.read())
    # Call Amazon Textract
    response = textract.detect_document_text(Document={'Bytes': imageBytes})
    # Print detected text
    full_text = ""
    for item in response["Blocks"]:
        if item["BlockType"] == "LINE":
            full_text += f"{item['Text']}\n"
    text_detection = {
        "filename": filename,
        "text": full_text
    }
    print(f"{i} / {len(files)} Processed")
    i += 1
    all_text_detections.append(text_detection)
    

df = pd.DataFrame(all_text_detections)

df.to_csv("first_attempt_amazon_textract.csv")

iCard_021875_1_Daba_Shorro.jpg
1 / 123 Processed
iCard_021960_1.jpg
2 / 123 Processed
iCard_021912_1_Dahlberg_John.jpg
3 / 123 Processed
iCard_021894_1_Daehn_Herman_W.jpg
4 / 123 Processed


NameError: name 'pd' is not defined