In [55]:
import PyPDF2 
import boto3  
import os  
import json

# Module-level AWS service clients used by the S3 / Textract helpers in this cell.
s3 = boto3.client("s3")
textract = boto3.client("textract")

def download_s3_file(s3_uri, local_path):
    """Download the object at ``s3_uri`` into the directory ``local_path``.

    Args:
        s3_uri: Object location in the form ``s3://<bucket>/<key>``.
        local_path: Local directory to write into (it is not created here).

    Returns:
        Path of the downloaded file: ``local_path`` joined with the last
        ``/``-separated segment of the key.

    Raises:
        ValueError: If ``s3_uri`` is not an ``s3://`` URI, or has no bucket
            or no file name after the last ``/``.
    """
    # Split "s3://bucket/some/key" into scheme, bucket and key.
    scheme, separator, location = s3_uri.partition('://')
    bucket, _, key = location.partition('/')
    filename = key.rsplit('/', 1)[-1]
    if scheme.lower() != 's3' or not separator or not bucket or not filename:
        raise ValueError(f"Expected an S3 URI like s3://bucket/key, got {s3_uri!r}")
    print(bucket, key)

    # os.path.join avoids the doubled separator that plain concatenation
    # produced for directories such as "./" (".//file") or "" ("/file").
    destination = os.path.join(local_path, filename)

    # Download file from S3 to the local folder using the shared client.
    s3.download_file(bucket, key, destination)
    return destination


def extract_text(file):
    """Read a PDF from disk and return the text of each page.

    Args:
        file: Path of the PDF file to open.

    Returns:
        dict with ``"num_pages"`` (number of pages read) and ``"pages"``
        (one ``extract_text()`` result per page, in document order).
    """
    with open(file, 'rb') as pdf_stream:
        reader = PyPDF2.PdfReader(pdf_stream)
        # Extract while the file is still open: the reader pulls page data from the stream.
        page_texts = [page.extract_text() for page in reader.pages]

    return {"num_pages": len(page_texts), "pages": page_texts}


def upload_json_to_s3(data, s3_uri):
    """Serialize ``data`` to JSON and store it as the object at ``s3_uri``.

    Args:
        data: Any object accepted by ``json.dumps``.
        s3_uri: Destination in the form ``s3://<bucket>/<key>``.

    Raises:
        ValueError: If ``s3_uri`` is not an ``s3://`` URI or lacks a bucket or key.
    """
    # Split "s3://bucket/some/key" into scheme, bucket and key.
    scheme, separator, location = s3_uri.partition('://')
    bucket, _, key = location.partition('/')
    if scheme.lower() != 's3' or not separator or not bucket or not key:
        raise ValueError(f"Expected an S3 URI like s3://bucket/key, got {s3_uri!r}")

    # Convert Python object to JSON string
    json_data = json.dumps(data)

    # Reuse the module-level client instead of building a new one on every call,
    # and label the object as JSON so consumers see the right content type.
    s3.put_object(
        Body=json_data.encode('utf-8'),
        Bucket=bucket,
        Key=key,
        ContentType='application/json',
    )


def _lines_by_page(blocks):
    """Return one newline-joined string per PAGE block, built from its child LINE blocks."""
    page_lines = {}    # PAGE block id -> texts of its lines (insertion order = page order)
    page_of_child = {} # child block id -> id of the PAGE block that lists it

    for block in blocks:
        block_type = block['BlockType']
        if block_type == 'PAGE':
            page_id = block['Id']
            # Register every page, even one without children, so blank pages
            # are kept as empty strings instead of being dropped.
            page_lines[page_id] = []
            # 'Relationships' can be missing when a block has no related blocks.
            for relationship in block.get('Relationships') or []:
                if relationship['Type'] == 'CHILD':
                    for child_id in relationship['Ids']:
                        page_of_child[child_id] = page_id
        elif block_type == 'LINE':
            # Constant-time lookup instead of scanning every page's child list.
            page_id = page_of_child.get(block['Id'])
            if page_id is not None:
                page_lines[page_id].append(block['Text'])

    return ['\n'.join(lines) for lines in page_lines.values()]


def amazon_textract_less_5mb(s3_uri):
    """Run Textract ``detect_document_text`` on an S3 object and group the text by page.

    The object's size is read with ``head_object`` and reported on stdout; the
    size check is informational only and Textract is called either way.

    Args:
        s3_uri: Object location in the form ``s3://<bucket>/<key>``.

    Returns:
        dict with ``"num_pages"`` (page count from the response's
        ``DocumentMetadata``) and ``"pages"`` (one string per PAGE block, made
        of that page's LINE texts joined with newlines; a page without lines
        yields an empty string).

    Raises:
        ValueError: If ``s3_uri`` is not an ``s3://`` URI or lacks a bucket or key.
    """
    # Split "s3://bucket/some/key" into scheme, bucket and key.
    scheme, separator, location = s3_uri.partition('://')
    bucket, _, key = location.partition('/')
    if scheme.lower() != 's3' or not separator or not bucket or not key:
        raise ValueError(f"Expected an S3 URI like s3://bucket/key, got {s3_uri!r}")

    print(bucket, key)

    size = s3.head_object(Bucket=bucket, Key=key)['ContentLength']
    size_hint_bytes = 5000000  # threshold used only for the message below

    if size > size_hint_bytes:
        print("Object is greater than 5MB")
    else:
        print("Object is less than or equal to 5MB")

    response = textract.detect_document_text(
        Document={
            'S3Object': {
                'Bucket': bucket,
                'Name': key
            }
        })

    return {
        "num_pages": response['DocumentMetadata']['Pages'],
        "pages": _lines_by_page(response['Blocks']),
    }

In [34]:
# Input document in S3 and the local directory that receives the downloaded copy.
s3_uri = "s3://ac-genai-streamlitbucket2fe9c216-4t8poszaf1to/carta para tasacion.pdf"
local_path = "./"

In [22]:
# Fetch the PDF from S3; the helper returns the path of the local copy.
local_file = download_s3_file(s3_uri=s3_uri, local_path=local_path)

ac-genai-streamlitbucket2fe9c216-4t8poszaf1to doc.pdf


In [23]:
# Per-page text of the downloaded PDF: {"num_pages": ..., "pages": [...]}.
text = extract_text(file=local_file)

In [26]:
# Store the extracted text next to the source object, under "<original key>.json".
upload_json_to_s3(data=text, s3_uri=f"{s3_uri}.json")

In [56]:
# Run Textract directly on the S3 object and collect the text grouped by page.
res = amazon_textract_less_5mb(s3_uri=s3_uri)

ac-genai-streamlitbucket2fe9c216-4t8poszaf1to carta para tasacion.pdf
Object is less than or equal to 5MB


In [57]:
# Ad-hoc check: repeat the bucket/key parsing used by the helpers and fetch
# the object's metadata with head_object (no object body is downloaded).
bucket = s3_uri.split('/')[2]   
key =  '/'.join(s3_uri.split('/')[3:])
response = s3.head_object(Bucket=bucket, Key=key)

In [58]:
# Display the raw head_object response; 'ContentLength' is the field the size check reads.
response

{'ResponseMetadata': {'RequestId': 'ADCNR2P0XXPSYW85',
  'HostId': 'PcB2b5TlIeCTx7fsv6+PEjPKiOp1ffUcqvrSxlMwSka516Ea6Foe3/r745gi2N8m9QzRu18ZBbDxZT6vp5Jq/A==',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'PcB2b5TlIeCTx7fsv6+PEjPKiOp1ffUcqvrSxlMwSka516Ea6Foe3/r745gi2N8m9QzRu18ZBbDxZT6vp5Jq/A==',
   'x-amz-request-id': 'ADCNR2P0XXPSYW85',
   'date': 'Thu, 26 Oct 2023 13:14:05 GMT',
   'last-modified': 'Thu, 26 Oct 2023 02:17:58 GMT',
   'etag': '"615ea55ac5895a38e304403cbd516b27"',
   'x-amz-server-side-encryption': 'AES256',
   'x-amz-version-id': 'Ml.fntvCs2DZ421uBnvjSpc2GM8vvIn9',
   'accept-ranges': 'bytes',
   'content-type': 'application/pdf',
   'server': 'AmazonS3',
   'content-length': '566873'},
  'RetryAttempts': 0},
 'AcceptRanges': 'bytes',
 'LastModified': datetime.datetime(2023, 10, 26, 2, 17, 58, tzinfo=tzutc()),
 'ContentLength': 566873,
 'ETag': '"615ea55ac5895a38e304403cbd516b27"',
 'VersionId': 'Ml.fntvCs2DZ421uBnvjSpc2GM8vvIn9',
 'ContentType': 'applicati