In [None]:
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0

"""
Purpose
Shows how to use the AWS SDK for Python (Boto3) with Amazon Textract to
detect text, form, and table elements in document images.
"""

import io
import json
import logging

import boto3
import pandas as pd
from botocore.exceptions import ClientError

logger = logging.getLogger(__name__)


# snippet-start:[python.example_code.textract.TextractWrapper]
class TextractWrapper:
    """
    Encapsulates Amazon Textract functions, together with the Amazon S3 and
    Amazon SQS helpers used to stage documents and to watch for job completion.
    """
    def __init__(self, textract_client, s3_resource, sqs_resource):
        """
        :param textract_client: A Boto3 Textract client.
        :param s3_resource: A Boto3 Amazon S3 resource.
        :param sqs_resource: A Boto3 Amazon SQS resource.
        """
        self.textract_client = textract_client
        self.s3_resource = s3_resource
        self.sqs_resource = sqs_resource
# snippet-end:[python.example_code.textract.TextractWrapper]

# snippet-start:[python.example_code.textract.DetectDocumentText]
    def detect_file_text(self, *, document_file_name=None, document_bytes=None):
        """
        Detects text elements in a local image file or from in-memory byte data.
        The image must be in PNG or JPG format.
        When both arguments are given, the file takes precedence.

        :param document_file_name: The name of a document image file.
        :param document_bytes: In-memory byte data of a document image.
        :return: The response from Amazon Textract, including a list of blocks
                 that describe elements detected in the image.
        :raises ValueError: If neither a file name nor byte data is provided.
        :raises ClientError: If the call to Amazon Textract fails.
        """
        if document_file_name is not None:
            with open(document_file_name, 'rb') as document_file:
                document_bytes = document_file.read()
        if document_bytes is None:
            # Fail early with a clear message instead of sending Bytes=None.
            raise ValueError(
                "Either document_file_name or document_bytes must be provided.")
        try:
            response = self.textract_client.detect_document_text(
                Document={'Bytes': document_bytes})
            logger.info(
                "Detected %s blocks.", len(response['Blocks']))
        except ClientError:
            logger.exception("Couldn't detect text.")
            raise
        else:
            return response
# snippet-end:[python.example_code.textract.DetectDocumentText]

# snippet-start:[python.example_code.textract.AnalyzeDocument]
    def analyze_file(
            self, feature_types, *, document_file_name=None, document_bytes=None):
        """
        Detects text and additional elements, such as forms or tables, in a local image
        file or from in-memory byte data.
        The image must be in PNG or JPG format.
        When both arguments are given, the file takes precedence.

        :param feature_types: The types of additional document features to detect.
        :param document_file_name: The name of a document image file.
        :param document_bytes: In-memory byte data of a document image.
        :return: The response from Amazon Textract, including a list of blocks
                 that describe elements detected in the image.
        :raises ValueError: If neither a file name nor byte data is provided.
        :raises ClientError: If the call to Amazon Textract fails.
        """
        if document_file_name is not None:
            with open(document_file_name, 'rb') as document_file:
                document_bytes = document_file.read()
        if document_bytes is None:
            # Fail early with a clear message instead of sending Bytes=None.
            raise ValueError(
                "Either document_file_name or document_bytes must be provided.")
        try:
            response = self.textract_client.analyze_document(
                Document={'Bytes': document_bytes}, FeatureTypes=feature_types)
            logger.info(
                "Detected %s blocks.", len(response['Blocks']))
        except ClientError:
            logger.exception("Couldn't analyze document.")
            raise
        else:
            return response
# snippet-end:[python.example_code.textract.AnalyzeDocument]

# snippet-start:[python.example_code.textract.helper.prepare_job]
    def prepare_job(self, bucket_name, document_name, document_bytes):
        """
        Prepares a document image for an asynchronous detection job by uploading
        the image bytes to an Amazon S3 bucket. Amazon Textract must have permission
        to read from the bucket to process the image.

        :param bucket_name: The name of the Amazon S3 bucket.
        :param document_name: The name of the image stored in Amazon S3.
        :param document_bytes: The image, either as raw byte data or as a binary
                               file-like object (for example, io.BytesIO).
        :raises ClientError: If the upload to Amazon S3 fails.
        """
        if isinstance(document_bytes, (bytes, bytearray, memoryview)):
            # upload_fileobj needs an object with read(); wrap raw byte data so
            # callers can pass what the parameter name suggests.
            document_bytes = io.BytesIO(document_bytes)
        try:
            bucket = self.s3_resource.Bucket(bucket_name)
            bucket.upload_fileobj(document_bytes, document_name)
            logger.info("Uploaded %s to %s.", document_name, bucket_name)
        except ClientError:
            logger.exception("Couldn't upload %s to %s.", document_name, bucket_name)
            raise
# snippet-end:[python.example_code.textract.helper.prepare_job]

# snippet-start:[python.example_code.textract.helper.check_job_queue]
    def check_job_queue(self, queue_url, job_id):
        """
        Polls an Amazon SQS queue for messages that indicate a specified Textract
        job has completed. Only the first received message is inspected; it is
        deleted from the queue only when it refers to the requested job.

        :param queue_url: The URL of the Amazon SQS queue to poll.
        :param job_id: The ID of the Textract job.
        :return: The status of the job, or None when no matching message was
                 received or the queue could not be read.
        """
        status = None
        try:
            queue = self.sqs_resource.Queue(queue_url)
            messages = queue.receive_messages()
            if messages:
                # The queue message wraps a notification whose 'Message' field is
                # itself a JSON document, hence the two decoding steps.
                msg_body = json.loads(messages[0].body)
                msg = json.loads(msg_body['Message'])
                if msg.get('JobId') == job_id:
                    messages[0].delete()
                    status = msg.get('Status')
                    logger.info(
                        "Got message %s with status %s.", messages[0].message_id,
                        status)
            else:
                logger.info("No messages in queue %s.", queue_url)
        except ClientError:
            # Polling is best-effort: log the failure and let the caller poll again.
            logger.exception("Couldn't get messages from queue %s.", queue_url)
            return None
        return status
# snippet-end:[python.example_code.textract.helper.check_job_queue]

# snippet-start:[python.example_code.textract.StartDocumentTextDetection]
    def start_detection_job(
            self, bucket_name, document_file_name, sns_topic_arn, sns_role_arn):
        """
        Starts an asynchronous job to detect text elements in an image stored in an
        Amazon S3 bucket. Textract publishes a notification to the specified Amazon SNS
        topic when the job completes.
        The image must be in PNG, JPG, or PDF format.

        :param bucket_name: The name of the Amazon S3 bucket that contains the image.
        :param document_file_name: The name of the document image stored in Amazon S3.
        :param sns_topic_arn: The Amazon Resource Name (ARN) of an Amazon SNS topic
                              where the job completion notification is published.
        :param sns_role_arn: The ARN of an AWS Identity and Access Management (IAM)
                             role that can be assumed by Textract and grants permission
                             to publish to the Amazon SNS topic.
        :return: The ID of the job.
        :raises ClientError: If the job cannot be started.
        """
        try:
            response = self.textract_client.start_document_text_detection(
                DocumentLocation={
                    'S3Object': {'Bucket': bucket_name, 'Name': document_file_name}},
                NotificationChannel={
                    'SNSTopicArn': sns_topic_arn, 'RoleArn': sns_role_arn})
            job_id = response['JobId']
            logger.info(
                "Started text detection job %s on %s.", job_id, document_file_name)
        except ClientError:
            logger.exception("Couldn't detect text in %s.", document_file_name)
            raise
        else:
            return job_id
# snippet-end:[python.example_code.textract.StartDocumentTextDetection]

# snippet-start:[python.example_code.textract.GetDocumentTextDetection]
    def get_detection_job(self, job_id):
        """
        Gets data for a previously started text detection job.

        :param job_id: The ID of the job to retrieve.
        :return: The job data, including a list of blocks that describe elements
                 detected in the image.
        :raises ClientError: If the job data cannot be retrieved.
        """
        try:
            response = self.textract_client.get_document_text_detection(
                JobId=job_id)
            job_status = response['JobStatus']
            logger.info("Job %s status is %s.", job_id, job_status)
        except ClientError:
            logger.exception("Couldn't get data for job %s.", job_id)
            raise
        else:
            return response
# snippet-end:[python.example_code.textract.GetDocumentTextDetection]

# snippet-start:[python.example_code.textract.StartDocumentAnalysis]
    def start_analysis_job(
            self, bucket_name, document_file_name, feature_types, sns_topic_arn,
            sns_role_arn):
        """
        Starts an asynchronous job to detect text and additional elements, such as
        forms or tables, in an image stored in an Amazon S3 bucket. Textract publishes
        a notification to the specified Amazon SNS topic when the job completes.
        The image must be in PNG, JPG, or PDF format.

        :param bucket_name: The name of the Amazon S3 bucket that contains the image.
        :param document_file_name: The name of the document image stored in Amazon S3.
        :param feature_types: The types of additional document features to detect.
        :param sns_topic_arn: The Amazon Resource Name (ARN) of an Amazon SNS topic
                              where job completion notification is published.
        :param sns_role_arn: The ARN of an AWS Identity and Access Management (IAM)
                             role that can be assumed by Textract and grants permission
                             to publish to the Amazon SNS topic.
        :return: The ID of the job.
        :raises ClientError: If the job cannot be started.
        """
        try:
            response = self.textract_client.start_document_analysis(
                DocumentLocation={
                    'S3Object': {'Bucket': bucket_name, 'Name': document_file_name}},
                NotificationChannel={
                    'SNSTopicArn': sns_topic_arn, 'RoleArn': sns_role_arn},
                FeatureTypes=feature_types)
            job_id = response['JobId']
            logger.info(
                "Started text analysis job %s on %s.", job_id, document_file_name)
        except ClientError:
            logger.exception("Couldn't analyze text in %s.", document_file_name)
            raise
        else:
            return job_id
# snippet-end:[python.example_code.textract.StartDocumentAnalysis]

# snippet-start:[python.example_code.textract.GetDocumentAnalysis]
    def get_analysis_job(self, job_id):
        """
        Gets data for a previously started detection job that includes additional
        elements.

        :param job_id: The ID of the job to retrieve.
        :return: The job data, including a list of blocks that describe elements
                 detected in the image.
        :raises ClientError: If the job data cannot be retrieved.
        """
        try:
            response = self.textract_client.get_document_analysis(
                JobId=job_id)
            job_status = response['JobStatus']
            logger.info("Job %s status is %s.", job_id, job_status)
        except ClientError:
            logger.exception("Couldn't get data for job %s.", job_id)
            raise
        else:
            return response
# snippet-end:[python.example_code.textract.GetDocumentAnalysis]

In [None]:
import boto3

# Both clients rely on boto3's default credential provider chain (environment
# variables, shared credentials file, or an attached IAM role). Credentials
# must never be written into the notebook itself.
s3 = boto3.client("s3")
textract = boto3.client("textract", region_name='us-east-1')

filename = "0a57f7837a091ce523ddbca495a38198.pdf"
file_path = f"test_files/{filename}"
bucket = "vipocr"

# Textract's asynchronous API reads the document from S3, so stage it first.
s3.upload_file(file_path, bucket, filename)

doc_spec = {"S3Object": {"Bucket": bucket, "Name": filename}}

# For forms/tables, use start_document_analysis(..., FeatureTypes=["FORMS"])
# instead; the polling helpers must then call get_document_analysis.
response = textract.start_document_text_detection(DocumentLocation=doc_spec)
print(response["JobId"])

In [None]:
import time

def poll_textract_job(
    job_id: str,
    initial_delay: float = 10,
    poll_interval: float = 2.5,
    max_attempts: int = 50,
) -> "str | None":
    """Poll a Textract text detection job until it leaves the IN_PROGRESS state.

    Uses the module-level ``textract`` client.

    :param job_id: The ID of the Textract job to poll.
    :param initial_delay: Seconds to wait before the first status request.
    :param poll_interval: Seconds to wait between consecutive status requests.
    :param max_attempts: Maximum number of status requests to make.
    :return: The last observed job status. This is still "IN_PROGRESS" when the
             attempts run out, and None when ``max_attempts`` is not positive.
    """
    time.sleep(initial_delay)
    job_status = None

    for attempt in range(max_attempts):
        if attempt:
            # Space out requests (`get` calls are throttled), but do not waste
            # a sleep after the final attempt.
            time.sleep(poll_interval)

        # For analysis jobs, call textract.get_document_analysis instead.
        response = textract.get_document_text_detection(JobId=job_id)
        job_status = response["JobStatus"]

        if job_status != "IN_PROGRESS":
            break

    return job_status

def get_textract_results(job_id):
    """Collect every paginated response of a finished text detection job.

    Uses the module-level ``textract`` client and follows ``NextToken`` until
    the service stops returning one.

    :param job_id: The ID of the Textract job whose results are fetched.
    :return: A list of response dicts, in the order they were retrieved.
    """
    # For analysis jobs, call textract.get_document_analysis in both places.
    collected = [textract.get_document_text_detection(JobId=job_id)]

    while True:
        latest = collected[-1]
        if "NextToken" not in latest:
            return collected

        # Short pause between requests to stay clear of API throttling.
        time.sleep(0.25)
        collected.append(
            textract.get_document_text_detection(
                JobId=job_id, NextToken=latest["NextToken"]))


job_status = poll_textract_job(response["JobId"])

if job_status == "SUCCEEDED":
    pages = get_textract_results(response["JobId"])
    # "Pages" counts paginated API responses, not pages of the document.
    block_count = sum(len(p['Blocks']) for p in pages)
    print(f"Pages: {len(pages)}\nBlocks: {block_count}")
else:
    # Reset `pages` so later cells do not silently reuse results left over
    # from an earlier run of the notebook.
    pages = []
    print(f"Textract job {response['JobId']} did not succeed (status: {job_status}).")


In [None]:
# Index every block from every response by its Id; a later block with the
# same Id replaces an earlier one.
blocks = {}
for page in pages:
    for block in page["Blocks"]:
        blocks[block["Id"]] = block

In [None]:
# Dump the Id -> block mapping for manual inspection.
print(blocks)

In [None]:
# Print the text of every LINE block. Blocks without a 'Text' entry are
# skipped explicitly rather than by swallowing arbitrary exceptions.
for page in pages:
    for block in page['Blocks']:
        if block['BlockType'] == 'LINE' and 'Text' in block:
            print(block['Text'])

In [None]:
# Load the sheet (first column becomes the index) and add an empty
# 'texttract' column that will receive the text extracted by Textract.
ocr_df = pd.read_excel('ocr_df.xlsx', index_col=0).assign(texttract='')
ocr_df

In [None]:
# SECURITY: an AWS access key pair used to be hard-coded in this cell. Treat
# that key as compromised: deactivate and rotate it. The clients below use
# boto3's default credential provider chain (environment variables, shared
# credentials file, or an attached IAM role) instead.
#
# The clients are created once and reused for every row.
s3 = boto3.client("s3")
textract = boto3.client("textract", region_name='us-east-1')
bucket = "vipocr"

for index, row in ocr_df.iterrows():
    filename = f"{row['item_filename']}.pdf"
    file_path = f"test_files/{filename}"

    # Textract's asynchronous API reads the document from S3, so stage it first.
    s3.upload_file(file_path, bucket, filename)

    doc_spec = {"S3Object": {"Bucket": bucket, "Name": filename}}

    response = textract.start_document_text_detection(DocumentLocation=doc_spec)

    job_status = poll_textract_job(response["JobId"])
    print(index)
    if job_status != "SUCCEEDED":
        # Leave the 'texttract' cell empty for this row, but say why.
        print(f"Textract job for {filename} ended with status {job_status}; skipping.")
        continue

    pages = get_textract_results(response["JobId"])

    # Every detected line is prefixed with a newline, so the result starts
    # with "\n" whenever at least one line was found.
    pdf_text = ''.join(
        "\n" + str(block['Text'])
        for page in pages
        for block in page['Blocks']
        if block['BlockType'] == 'LINE' and 'Text' in block
    )
    ocr_df.at[index, 'texttract'] = pdf_text


In [None]:
# Write the DataFrame, including the extracted text, to an Excel file.
ocr_df.to_excel('temp_df.xlsx')

In [None]:
# Display the DataFrame as the cell output.
ocr_df