In [2]:
import json
import base64
import fitz
from pathlib import Path
from PIL import Image
import boto3
import os
from dotenv import load_dotenv

In [3]:
# Load variables from a .env file (if one is found) into the process
# environment before reading the AWS settings below.
load_dotenv()

# Both values are None when the corresponding variable is not set.
aws_bucket = os.environ.get("AWS_BUCKET")      # S3 bucket used by the cells below
aws_rol_arn = os.environ.get("AWS_ROLE_ARN")   # role ARN handed to the Bedrock batch job

In [None]:
import json, base64
from pathlib import Path
import fitz
from PIL import Image
import boto3

def render_page_to_png(doc, i: int, dpi=200) -> Path:
    """Rasterize one page of ``doc`` and save it as a PNG in the working directory.

    Args:
        doc: Indexable document whose pages expose ``get_pixmap(dpi=...)``
            (presumably a PyMuPDF document opened with ``fitz.open`` -- confirm
            against the caller).
        i: Zero-based page index; also used, zero-padded to five digits, in the
            output file name.
        dpi: Resolution forwarded to ``get_pixmap``.

    Returns:
        Relative path ``tmp_page_<i>.png`` of the file that was written.
    """
    pixmap = doc[i].get_pixmap(dpi=dpi)

    # The raw sample buffer is interpreted as packed RGB.
    # NOTE(review): assumes the pixmap has no alpha channel -- confirm.
    dimensions = [pixmap.width, pixmap.height]
    image = Image.frombytes("RGB", dimensions, pixmap.samples)

    target = Path(f"tmp_page_{i:05d}.png")
    image.save(target, "PNG")
    return target

def upload_to_s3(s3, bucket: str, key: str, local_path: Path):
    """Upload a local file to ``bucket`` under object key ``key``.

    Args:
        s3: Client object exposing ``upload_file(filename, bucket, key)``.
        bucket: Destination bucket name.
        key: Destination object key.
        local_path: File to upload; converted to ``str`` before the call.
    """
    source_filename = str(local_path)
    s3.upload_file(source_filename, bucket, key)

def generate_batch_jsonl_nova_images_s3(pdf_path: str, output_jsonl: str,
                                       bucket: str, prefix: str,
                                       account_id: str, dpi=200):
    """Render each PDF page to PNG, upload it to S3 and append a batch record to a JSONL file.

    For every page of ``pdf_path`` the page is rasterized with
    ``render_page_to_png``, uploaded to ``s3://{bucket}/{prefix}/page_<stem>_<n>.png``
    and one JSON line referencing that image is appended to ``output_jsonl``.

    The output file is opened in append mode so that several PDFs can be
    accumulated into a single batch input file by calling this function
    repeatedly. Delete the file first to start a fresh batch.

    Args:
        pdf_path: Path of the PDF to process.
        output_jsonl: Path of the JSONL file to append records to.
        bucket: Destination S3 bucket for the page images.
        prefix: Key prefix (no trailing slash) under which images are stored.
        account_id: Written as ``bucketOwner`` in every record.
        dpi: Rendering resolution forwarded to ``render_page_to_png``.

    Raises:
        Whatever the PDF library, the S3 upload or file I/O raise. The document
        is closed and the temporary PNG removed before the error propagates.
    """
    s3 = boto3.client("s3")
    nombre_pdf = Path(pdf_path).stem

    doc = fitz.open(pdf_path)
    try:
        # The original mode string "wa" is rejected by open() with ValueError
        # (exactly one of r/w/a/x is allowed). Append is what a multi-PDF
        # batch file needs.
        with open(output_jsonl, "a", encoding="utf-8") as f:
            for i in range(len(doc)):
                s3_key = f"{prefix}/page_{nombre_pdf}_{i+1:05d}.png"

                img_path = render_page_to_png(doc, i, dpi=dpi)
                try:
                    upload_to_s3(s3, bucket, s3_key, img_path)
                finally:
                    # The temporary PNG is only needed for the upload; remove
                    # it even when the upload fails.
                    img_path.unlink()

                # NOTE(review): recordId restarts at "1" for every PDF, so a
                # JSONL accumulated from several PDFs contains repeated ids --
                # confirm this is acceptable for the batch job.
                record = {
                    "recordId": str(i+1),
                    "modelInput": {
                        "messages": [
                            {
                                "role": "user",
                                "content": [
                                    {"text": "Extrae todo el texto visible de la página."},
                                    {
                                        "image": {
                                            "format": "png",
                                            "source": {
                                                "s3Location": {
                                                    "uri": f"s3://{bucket}/{s3_key}",
                                                    "bucketOwner": account_id
                                                }
                                            }
                                        }
                                    }
                                ]
                            }
                        ]
                    }
                }

                # ensure_ascii=False keeps the accented prompt readable in the file.
                f.write(json.dumps(record, ensure_ascii=False) + "\n")
    finally:
        # Release the document even if rendering, upload or writing failed.
        doc.close()

In [28]:
# The project root is taken to be the parent of the current working directory.
ruta_proyecto = Path.cwd().parent
ruta_base = ruta_proyecto / "data/documents"

# PDFs (relative to ruta_base) to include in the batch input.
lista_archivos = [
    "EkosTextiles.pdf",
    "Ekosnegocios_TextilesSanPedro.pdf",
    "Extracto 11990032236 1990-06-07 297008.pdf",
    "documento_1765755270482.pdf",
    "IntroduccionAnalisisReal.pdf",
]

# Arguments that do not change between PDFs are computed once.
ruta_jsonl = ruta_proyecto / "code/batch_input.jsonl"
# The fifth colon-separated field of the role ARN is used as the account id.
id_cuenta = aws_rol_arn.split(":")[4]

for archivo in lista_archivos:
    ruta_pdf = ruta_base / archivo
    generate_batch_jsonl_nova_images_s3(
        ruta_pdf, ruta_jsonl, aws_bucket, "batch/test", id_cuenta, dpi=200
    )

In [29]:
# Upload the locally generated batch input file to the bucket.
s3 = boto3.client("s3")

ruta_jsonl_local = str(ruta_proyecto / "code/batch_input.jsonl")
s3.upload_file(ruta_jsonl_local, aws_bucket, "batch/test/batch_input_poc_3.jsonl")

In [47]:
bedrock = boto3.client("bedrock", region_name="us-east-1")

# NOTE(review): the input URI is the whole "batch/test/" prefix, which is also
# where the page PNGs and the "output/" folder live -- confirm that pointing
# at the prefix rather than at the single JSONL file is intended.
input_uri = f"s3://{aws_bucket}/batch/test/"
output_uri = f"s3://{aws_bucket}/batch/test/output/"

# NOTE(review): jobName is hard-coded; re-running this cell will presumably be
# rejected if job names must be unique -- confirm.
response = bedrock.create_model_invocation_job(
    jobName="nova-ocr-batch-ekos-6",
    modelId="amazon.nova-pro-v1:0",
    roleArn=aws_rol_arn,
    inputDataConfig={"s3InputDataConfig": {"s3Uri": input_uri}},
    outputDataConfig={"s3OutputDataConfig": {"s3Uri": output_uri}},
)

# Only the last path segment of the returned ARN is printed.
job_id = response["jobArn"].split("/")[-1]
print("Job ARN:", job_id)

Job ARN: 6r490ebxh4z8


In [62]:
# Query the batch job created above once and show its current status.
job_arn = response["jobArn"]
status = bedrock.get_model_invocation_job(jobIdentifier=job_arn)

print(status["status"])

InProgress


In [60]:
s3 = boto3.client("s3")

# The output object is read from a folder named after the last segment of the
# job ARN, under the job's output prefix.
job_id = response["jobArn"].split("/")[-1]
output_key = f"batch/test/output/{job_id}/batch_input_poc_3.jsonl.out"
obj = s3.get_object(Bucket=aws_bucket, Key=output_key)

# One output record per line.
raw_body = obj["Body"].read()
lines = raw_body.decode("utf-8").splitlines()

print(f"Total de registros: {len(lines)}")

Total de registros: 190


In [66]:
resultados = []

# Parse a single output record for inspection: the one at index 15, if present.
# The one-element slice means the body runs at most once.
for line in lines[15:16]:
    record = json.loads(line)