In [35]:
import json
import base64
import fitz
from pathlib import Path
from PIL import Image
import boto3
import os
from dotenv import load_dotenv

In [39]:
# Load variables from a local .env file into the process environment.
load_dotenv()

# Bucket used for the batch input/output objects, and the role ARN handed to
# the Bedrock batch job. Either value is None when the variable is not set.
aws_bucket = os.environ.get("AWS_BUCKET")
aws_rol_arn = os.environ.get("AWS_ROLE_ARN")

In [25]:
def image_to_base64(path):
    """Return the contents of the file at ``path`` as a base64 text string.

    Args:
        path: Location of the file to read; it is opened in binary mode.

    Returns:
        The base64 encoding of the file's bytes, decoded to ``str``.
    """
    with open(path, mode="rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")


def generate_batch_jsonl(pdf_path, output_jsonl):
    """Render every page of a PDF to PNG and write one JSONL request per page.

    Each output line is a JSON object with a single user message containing
    a fixed Spanish text instruction and the page image as base64 PNG data.

    Args:
        pdf_path: Path of the PDF to read (passed to ``fitz.open``).
        output_jsonl: Destination file. It is created, or truncated if it
            already exists, so a second call with the same path replaces the
            previous content.

    Returns:
        None. The result is the file written at ``output_jsonl``.
    """
    # Both context managers guarantee the PDF handle and the output file are
    # released even when rendering a page fails midway. The PDF is opened
    # first so an unreadable input does not truncate an existing output file.
    with fitz.open(pdf_path) as doc, open(output_jsonl, "w", encoding="utf-8") as f:
        for page in doc:
            # Every page is rendered, including pages that already carry a
            # text layer; no text-based filtering is applied.
            pix = page.get_pixmap(dpi=200)

            # Encode the PNG in memory instead of going through a temporary
            # file in the working directory, which could be left behind on
            # errors or collide with another run using the same file names.
            png_b64 = base64.b64encode(pix.tobytes("png")).decode("utf-8")

            # NOTE(review): the Bedrock batch-inference docs appear to expect
            # each line wrapped as {"recordId", "modelInput"} and Nova image
            # blocks shaped as {"format", "source": {"bytes"}} -- confirm the
            # service accepts this payload as-is before relying on it.
            record = {
                "messages": [
                    {
                        "role": "user",
                        "content": [
                            {"text": "Extrae todo el texto visible de la página"},
                            {
                                "image": {
                                    "format": "png",
                                    "bytes": png_b64,
                                }
                            },
                        ],
                    }
                ]
            }

            # ensure_ascii=False keeps accented characters readable in the file.
            f.write(json.dumps(record, ensure_ascii=False) + "\n")

In [26]:
# Project root is the parent of the directory the notebook runs from.
ruta_proyecto = Path.cwd().parent

ruta_base = ruta_proyecto / "data/documents"

# Combined JSONL holding the requests of every PDF listed below.
ruta_jsonl = ruta_proyecto / "code/batch_input.jsonl"

lista_archivos = [
    "EkosTextiles.pdf",
    "Ekosnegocios_TextilesSanPedro.pdf",
    "Extracto 11990032236 1990-06-07 297008.pdf",
    "documento_1765755270482.pdf",
    "IntroduccionAnalisisReal.pdf"
]

# generate_batch_jsonl truncates its output file on every call, so pointing
# each call at the combined file would keep only the last PDF. Each PDF is
# written to its own part file and then appended to the combined file.
with open(ruta_jsonl, "w", encoding="utf-8") as salida:
    for indice, archivo in enumerate(lista_archivos):
        ruta_pdf = ruta_base / archivo
        ruta_parcial = ruta_jsonl.with_name(f"batch_input_part_{indice}.jsonl")

        try:
            generate_batch_jsonl(ruta_pdf, ruta_parcial)

            # Copy record by record so a whole part file is never held in
            # memory at once.
            with open(ruta_parcial, encoding="utf-8") as parcial:
                for linea in parcial:
                    salida.write(linea)
        finally:
            # Remove the part file whether or not the PDF was processed.
            if ruta_parcial.exists():
                ruta_parcial.unlink()

In [None]:
s3 = boto3.client("s3")

# Push the locally generated JSONL to the bucket under the input prefix.
ruta_local_jsonl = ruta_proyecto / "code/batch_input.jsonl"
s3.upload_file(
    Filename=str(ruta_local_jsonl),
    Bucket=aws_bucket,
    Key="code/bedrock-input/batch_input_poc_2.jsonl",
)

In [None]:
# Bedrock client used to create the batch job and, later, to query its state.
bedrock = boto3.client("bedrock", region_name="us-east-1")

# Submit a model invocation job that reads the uploaded JSONL from the input
# prefix and writes its results under the output prefix of the same bucket.
response = bedrock.create_model_invocation_job(
    jobName="nova-ocr-batch-ekos-3",
    modelId="amazon.nova-pro-v1:0",
    roleArn=aws_rol_arn,
    inputDataConfig={
        "s3InputDataConfig": {
            "s3Uri": f"s3://{aws_bucket}/code/bedrock-input/batch_input_poc_2.jsonl"
        }
    },
    outputDataConfig={
        "s3OutputDataConfig": {
            "s3Uri": f"s3://{aws_bucket}/code/bedrock-output/"
        }
    }
)

# Only the segment after the last "/" of the ARN is printed, i.e. the job id
# rather than the full ARN, so the label says so.
print("Job ID:", response["jobArn"].split("/")[-1])

In [30]:
# Keep the full ARN returned at submission time; it identifies the job below.
job_arn = response["jobArn"]

# Fetch the job description and show its current status field.
status = bedrock.get_model_invocation_job(jobIdentifier=job_arn)
print(status["status"])

Submitted


In [None]:
s3 = boto3.client("s3")

# Download the batch output object for a specific, hard-coded job folder.
obj = s3.get_object(
    Bucket=aws_bucket,
    Key="code/bedrock-output/xf1ytm7y27ax/batch_input_poc_2.jsonl.out"
)

# Split the raw bytes before decoding. bytes.splitlines() only breaks on
# \n, \r and \r\n, whereas str.splitlines() also breaks on characters such as
# U+2028, U+2029 or U+0085, which may appear unescaped inside a JSON string
# and would cut a single JSONL record into invalid fragments.
lines = [raw.decode("utf-8") for raw in obj["Body"].read().splitlines()]

print(f"Total de registros: {len(lines)}")

Total de registros: 194


In [33]:
# Accumulator for parsed results; nothing is appended to it yet.
resultados = []

# Only the first output line is decoded; the remaining lines are not visited.
# Presumably this is a first look at the record structure -- confirm intent.
for line in lines[:1]:
    record = json.loads(line)