In [None]:
!python -m pip install amazon-textract-caller --upgrade
!python -m pip install amazon-textract-response-parser --upgrade 
!python -m pip install tabulate --upgrade
!python -m pip install PyPDF4 --upgrade

In [None]:
!python -m pip install PyPDF2 pycryptodome

In [None]:
import os
from PyPDF2 import PdfReader
import boto3
import time
import json
import tabulate
# Default boto3 session; the region it reports is captured in awsRegion.
# NOTE(review): awsRegion is not referenced anywhere else in the visible
# code — confirm whether it is still needed.
mySession = boto3.session.Session()
awsRegion = mySession.region_name

# Amazon Textract client
# The Textract and S3 clients are created without an explicit region_name.
textract = boto3.client('textract')
s3 = boto3.client("s3")
# The Bedrock runtime client, by contrast, is pinned to us-west-2.
bedrock_runtime = boto3.client(
      service_name="bedrock-runtime",
      region_name="us-west-2"
)


In [None]:
def start_job(client, s3_bucket_name, object_name):
    """Kick off an asynchronous text-detection job for a document stored in S3.

    Args:
        client: Object exposing ``start_document_text_detection`` (a Textract client).
        s3_bucket_name: Name of the bucket that contains the document.
        object_name: Key of the document inside the bucket.

    Returns:
        The ``JobId`` value from the service response.
    """
    document_location = {
        'S3Object': {'Bucket': s3_bucket_name, 'Name': object_name},
    }
    started = client.start_document_text_detection(DocumentLocation=document_location)
    return started["JobId"]

def is_job_complete(client, job_id, poll_interval=30):
    """Block until a text-detection job is no longer ``IN_PROGRESS``.

    Despite its name, this function returns the final job *status string*,
    not a boolean. Any non-empty status (including a failure status) is
    truthy, so callers should compare the result against the status they
    expect instead of using it directly in an ``if``.

    Args:
        client: Object exposing ``get_document_text_detection`` (a Textract client).
        job_id: Identifier returned when the job was started.
        poll_interval: Seconds to wait between status checks while the job is
            still running. Defaults to 30, the previously hard-coded value.

    Returns:
        The first ``JobStatus`` value observed that is not ``"IN_PROGRESS"``.
    """
    # The first check happens after a short one-second pause; every later
    # check waits for the (configurable) polling interval.
    delay = 1
    while True:
        time.sleep(delay)
        response = client.get_document_text_detection(JobId=job_id)
        status = response["JobStatus"]
        print("Job status: {}".format(status))
        if status != "IN_PROGRESS":
            return status
        delay = poll_interval

def get_job_results(client, job_id):
    """Collect every result page of a text-detection job.

    Calls ``get_document_text_detection`` repeatedly, following ``NextToken``
    until a response carries no (or an empty) token. Each call is preceded by
    a one-second pause and followed by a progress message.

    Args:
        client: Object exposing ``get_document_text_detection`` (a Textract client).
        job_id: Identifier of the job whose results are fetched.

    Returns:
        A list with one raw response per result page, in the order received.
    """
    collected = []
    request = {"JobId": job_id}
    while True:
        time.sleep(1)
        batch = client.get_document_text_detection(**request)
        collected.append(batch)
        print("Resultset page received: {}".format(len(collected)))

        token = batch.get('NextToken')
        if not token:
            return collected
        request = {"JobId": job_id, "NextToken": token}

def save_text_to_s3(text, s3_bucket_name, object_name):
    """Upload ``text`` to S3 as a plain-text object.

    The object key is ``object_name`` with ``.txt`` appended. A new S3 client
    is created on every call.

    Args:
        text: Content to store as the object body.
        s3_bucket_name: Destination bucket.
        object_name: Base key; ``.txt`` is appended to form the final key.
    """
    destination_key = object_name + '.txt'
    boto3.client('s3').put_object(
        Bucket=s3_bucket_name,
        Key=destination_key,
        Body=text,
        ContentType='text/plain',
    )
    print(f"Saved text to S3: {object_name}.txt")

if __name__ == "__main__":
    # Driver: run Textract on every object in the bucket, then ask Claude
    # (via Bedrock) to summarise each extracted text and stream the answer.
    s3_bucket_name = "msa-textract"
    region = "us-west-2"
    client = boto3.client('textract', region_name=region)
    s3 = boto3.client('s3', region_name=region)

    # Paginate the listing so every object is seen, not only the first page.
    paginator = s3.get_paginator('list_objects_v2')
    document_names = [
        object_info["Key"]
        for listing in paginator.paginate(Bucket=s3_bucket_name)
        for object_info in listing.get("Contents", [])
    ]

    for document_name in document_names:
        job_id = start_job(client, s3_bucket_name, document_name)
        print("Started job for document '{}' with id: {}".format(document_name, job_id))

        # is_job_complete returns the final status string, which is always
        # truthy, so it must be compared explicitly. Documents whose job did
        # not produce results are skipped instead of reusing stale data from
        # a previous iteration.
        status = is_job_complete(client, job_id)
        if status not in ("SUCCEEDED", "PARTIAL_SUCCESS"):
            print("Skipping document '{}': job ended with status {}".format(document_name, status))
            continue
        result_pages = get_job_results(client, job_id)

        # Extract the text of every LINE block, one line of text per block.
        extracted_text = "".join(
            item["Text"] + '\n'
            for result_page in result_pages
            for item in result_page.get("Blocks", [])
            if item["BlockType"] == "LINE"
        )
        if not extracted_text:
            print("Skipping document '{}': no text lines were detected".format(document_name))
            continue

        # Save the extracted text to S3 with a .txt extension
        # save_text_to_s3(extracted_text, s3_bucket_name, document_name)

        # The text is passed through unchanged: json.dumps below already
        # escapes it for transport, and an extra unicode_escape pass would
        # hand the model literal "\n" / "\xNN" sequences instead of the
        # real newlines and characters.
        prompt = "please summarize and tell me when does this agreement expire and if there is any discount there\n" + extracted_text
        kwargs = {
            "modelId": "anthropic.claude-v2",
            "contentType": "application/json",
            "accept": "*/*",
            "body": json.dumps({
                # Claude text-completion prompts use blank-line-separated
                # "\n\nHuman:" / "\n\nAssistant:" turn markers.
                "prompt": f"\n\nHuman: {prompt}\n\nAssistant:",
                "max_tokens_to_sample": 300,
                "temperature": 1,
                "top_k": 250,
                "top_p": 0.999,
                "stop_sequences": ["\n\nHuman:"],
                "anthropic_version": "bedrock-2023-05-31"
            })
        }
        response = bedrock_runtime.invoke_model_with_response_stream(**kwargs)

        # Print the completion incrementally as chunks arrive.
        stream = response.get('body')
        if stream:
            for event in stream:
                chunk = event.get('chunk')
                if chunk:
                    print(json.loads(chunk.get('bytes')).get('completion', ''), end="")
            # Terminate the streamed answer so the next document's log
            # output starts on a fresh line.
            print()


