In [10]:
#Detects text in a document stored in an S3 bucket. Display polygon box around text and angled text 
import boto3
import io
from io import BytesIO
import sys
import os

# import psutil
import time

import math
from PIL import Image, ImageDraw, ImageFont

textract = boto3.client(
    'textract',
    region_name='us-east-2',
    aws_access_key_id="aws_access_key_id", 
    aws_secret_access_key="aws_secret_access_key", 
)


# Displays information about a block returned by text detection and text analysis
def DisplayBlockInformation(block):
    print('Id: {}'.format(block['Id']))
    if 'Text' in block:
        print('    Detected: ' + block['Text'])
    print('    Type: ' + block['BlockType'])
   
    if 'Confidence' in block:
        print('    Confidence: ' + "{:.2f}".format(block['Confidence']) + "%")

    if block['BlockType'] == 'CELL':
        print("    Cell information")
        print("        Column:" + str(block['ColumnIndex']))
        print("        Row:" + str(block['RowIndex']))
        print("        Column Span:" + str(block['ColumnSpan']))
        print("        RowSpan:" + str(block['ColumnSpan']))    
    
    if 'Relationships' in block:
        print('    Relationships: {}'.format(block['Relationships']))
    print('    Geometry: ')
    print('        Bounding Box: {}'.format(block['Geometry']['BoundingBox']))
    print('        Polygon: {}'.format(block['Geometry']['Polygon']))
    
    if block['BlockType'] == "KEY_VALUE_SET":
        print ('    Entity Type: ' + block['EntityTypes'][0])
    if 'Page' in block:
        print('Page: ' + block['Page'])
    print()

def process_text_detection(image_path):
    image = Image.open(image_path)
    # Detect text in the document
    #process using image bytes                      
    #image_binary = stream.getvalue()
    #response = client.detect_document_text(Document={'Bytes': image_binary})
    
    image_bytes = None
    with open(image_path, 'rb') as document:
        image_bytes = bytearray(document.read())
    response = textract.analyze_document(Document={'Bytes': image_bytes}, FeatureTypes=["FORMS"])

    #Get the text blocks
    blocks=response['Blocks']
    width, height =image.size  
    draw = ImageDraw.Draw(image)  
    print ('Detected Document Text')
   
    # Create image showing bounding box/polygon the detected lines/text
    for block in blocks:
            print('Type: ' + block['BlockType'])
            if block['BlockType'] != 'PAGE':
                print('Detected: ' + block['Text'])
                print('Confidence: ' + "{:.2f}".format(block['Confidence']) + "%")

            print('Id: {}'.format(block['Id']))
            if 'Relationships' in block:
                print('Relationships: {}'.format(block['Relationships']))
            print('Bounding Box: {}'.format(block['Geometry']['BoundingBox']))
            print('Polygon: {}'.format(block['Geometry']['Polygon']))
            print()
            draw=ImageDraw.Draw(image)
            # Draw WORD - Green -  start of word, red - end of word
            if block['BlockType'] == "WORD":
                draw.line([(width * block['Geometry']['Polygon'][0]['X'],
                height * block['Geometry']['Polygon'][0]['Y']),
                (width * block['Geometry']['Polygon'][3]['X'],
                height * block['Geometry']['Polygon'][3]['Y'])],fill='green',
                width=2)
            
                draw.line([(width * block['Geometry']['Polygon'][1]['X'],
                height * block['Geometry']['Polygon'][1]['Y']),
                (width * block['Geometry']['Polygon'][2]['X'],
                height * block['Geometry']['Polygon'][2]['Y'])],
                fill='red',
                width=2)    

                 
            # Draw box around entire LINE  
            if block['BlockType'] == "LINE":
                points=[]

                for polygon in block['Geometry']['Polygon']:
                    points.append((width * polygon['X'], height * polygon['Y']))

                draw.polygon((points), outline='black')    
  
                # Uncomment to draw bounding box
                #box=block['Geometry']['BoundingBox']                    
                #left = width * box['Left']
                #top = height * box['Top']           
                #draw.rectangle([left,top, left + (width * box['Width']), top +(height * box['Height'])],outline='black') 


    # Display the image
    image.show()
    # display image for 10 seconds

    
    return len(blocks)


process_text_detection("../OneDrive_1_11-6-2019/iCard_021873_1_Daba_Ayehush_H.jpg")



Detected Document Text
Type: PAGE
Id: 7ed446ea-5778-4f80-ab79-140f2d5e924f
Relationships: [{'Type': 'CHILD', 'Ids': ['8719f92c-b65c-460c-8472-daa6de977241', 'e23eb4f0-36de-42ef-a1cd-00c6cf05bc3f', 'ae5a60f9-d104-44ab-94ee-fe1dd12bed76', '828ad79b-546f-4088-8c2d-10c2f883d322', '10a11182-3a6c-41d9-8447-b4059389c88b', 'c1b317d2-6cec-4596-bbd2-beb80bff04dd', '1f43550c-36db-42e2-ad47-73bde6821ec7', '02b96294-eda6-44fe-891a-27543ea93e6e', '7db16f9d-be05-4456-bca7-935c99160fea', '6e004d3f-74ef-4237-bb5d-0fc4dc1e54a8', '0ce00e8e-2645-4765-a267-8a0c537a7de0', '94d3f55b-64cb-4d3d-a699-1f73c956aed3', 'ba6cb010-5a03-46bd-a322-bc6d2e995e61', '06e25a08-4ca7-44ef-a89d-e310a0b1852b', '7d468cec-91e2-406e-834d-9794ae62a553', 'ab15bf4c-9963-438a-beca-1efba64e4cc5', 'f62498a9-cdce-45f6-8774-de7078350cd5', '1d247905-d943-4750-8cc1-c5e7b0624309', 'c7a26033-ecb1-4e23-bb0c-12b7f0b87b38', '357f8089-2faf-4647-9382-97efb3a0b3ba', '53a83da2-1608-4cd3-bd0a-0fec0da91be8', '286745ed-68e5-47e1-bfc1-3adce605e1eb', '0a

KeyError: 'Text'

In [3]:
import boto3
import os
import pandas as pd

# Amazon Textract client
textract = boto3.client(
    'textract',
    region_name='us-east-2',
    aws_access_key_id="aws_access_key_id", 
    aws_secret_access_key="aws_secret_access_key", 
)

#print(response)

all_text_detections = []
files = os.listdir("../OneDrive_1_11-6-2019")

i = 1
for filename in files:
    print(filename)
    if not os.path.isfile(f"../OneDrive_1_11-6-2019/{filename}"):
        continue
        
    if not filename.lower().endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif')):
        continue
    with open(f"../OneDrive_1_11-6-2019/{filename}", 'rb') as document:
        imageBytes = bytearray(document.read())
    # Call Amazon Textract
    response = textract.detect_document_text(Document={'Bytes': imageBytes})
    # Print detected text
    full_text = ""
    for item in response["Blocks"]:
        if item["BlockType"] == "LINE":
            full_text += f"{item['Text']}\n"
    text_detection = {
        "filename": filename,
        "text": full_text
    }
    print(f"{i} / {len(files)} Processed")
    i += 1
    all_text_detections.append(text_detection)
    

df = pd.DataFrame(all_text_detections)

df.to_csv("first_attempt_amazon_textract.csv")

iCard_021875_1_Daba_Shorro.jpg
1 / 123 Processed
iCard_021960_1.jpg
2 / 123 Processed
iCard_021912_1_Dahlberg_John.jpg
3 / 123 Processed
iCard_021894_1_Daehn_Herman_W.jpg
4 / 123 Processed


NameError: name 'pd' is not defined