In [12]:
!pip install boto3 pdf2image




In [43]:
import base64
import io
import json
import os

import boto3
from pdf2image import convert_from_path

# Bedrock runtime client shared by the cells below.
# NOTE(review): `Config` (presumably botocore.config.Config) and `logger` are not
# defined in this cell — confirm that an earlier cell provides them.
AWS_REGION = os.environ.get("AWS_REGION", "us-east-1")  # overridable through the environment

# Connection/read timeouts (seconds) and retry budget passed to the client.
bedrock_config = Config(
    connect_timeout=10,
    read_timeout=300,
    retries={"max_attempts": 3},
)

bedrock_client = boto3.client(
    "bedrock-runtime",
    region_name=AWS_REGION,
    config=bedrock_config,
)

logger.info(f"Using AWS Region: {AWS_REGION}")


def _pdf_value_error(exc):
    """Translate a pdf2image failure into the ValueError that callers receive."""
    if "password" in str(exc).lower():
        return ValueError("Failed to open PDF: Incorrect password or PDF requires a password")
    return ValueError(f"Failed to convert PDF to images: {str(exc)}")


def query_claude_with_pdf_image(pdf_path, prompt, model_id="us.anthropic.claude-3-7-sonnet-20250219-v1:0", max_tokens=4096, page_number=1, pdf_password=None):
    """
    Render PDF page(s) as images, send them to Claude via AWS Bedrock and return the reply.

    Only the requested page(s) are rendered, so selecting a single page of a
    large PDF does not rasterise the whole document.

    Args:
        pdf_path (str): Path to the PDF file
        prompt (str): Instructions for information extraction
        model_id (str): Bedrock model ID to invoke
        max_tokens (int): Maximum number of tokens in the response
        page_number (int): 1-indexed page to send; 0 sends every page,
            -1 sends the last page
        pdf_password (str | None): User password for an encrypted PDF; an empty
            value is treated as "no password"

    Returns:
        str: Text of the first text block in Claude's response

    Raises:
        ValueError: If the PDF cannot be opened or converted, has no pages,
            `page_number` is not a valid selection, or the response contains
            no text block
    """
    # Imported here so the function does not depend on edits to the import cell.
    from pdf2image import pdfinfo_from_path

    userpw = pdf_password or None
    if userpw:
        logger.info("Converting password-protected PDF to images")
    else:
        logger.info("Converting PDF to images")

    # Read the page count first so the selection can be validated before any
    # page is rendered.
    try:
        total_pages = pdfinfo_from_path(pdf_path, userpw=userpw)["Pages"]
    except Exception as e:
        raise _pdf_value_error(e) from e

    if total_pages == 0:
        raise ValueError("No pages found in PDF")

    # Map the page selection onto an inclusive, 1-indexed range to render.
    if page_number == 0:
        # Process all pages
        first_page, last_page = 1, total_pages
    elif page_number == -1:
        # Process last page
        first_page = last_page = total_pages
    elif isinstance(page_number, int) and 1 <= page_number <= total_pages:
        # Process specific page (1-indexed)
        first_page = last_page = page_number
    else:
        raise ValueError(f"Invalid page_number: {page_number}. PDF has {total_pages} pages. Use 1-{total_pages}, 0 for all pages, or -1 for last page.")

    try:
        selected_images = convert_from_path(
            pdf_path,
            first_page=first_page,
            last_page=last_page,
            userpw=userpw,
        )
    except Exception as e:
        raise _pdf_value_error(e) from e

    if not selected_images:
        raise ValueError("No pages found in PDF")

    # The prompt goes first, followed by one image block per selected page.
    content_blocks = [{"type": "text", "text": prompt}]
    content_blocks.extend(
        {
            "type": "image",
            "source": {
                "type": "base64",
                "media_type": "image/png",
                "data": encode_image_to_base64(image),
            },
        }
        for image in selected_images
    )

    request_body = {
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": max_tokens,
        "messages": [
            {
                "role": "user",
                "content": content_blocks,
            }
        ],
    }

    response = bedrock_client.invoke_model(
        modelId=model_id,
        contentType="application/json",
        accept="application/json",
        body=json.dumps(request_body),
    )

    # Return the first text block instead of assuming content[0] is one.
    response_body = json.loads(response['body'].read().decode('utf-8'))
    for block in response_body.get('content', []):
        if block.get('type') == 'text':
            return block['text']
    raise ValueError(
        f"Claude response contained no text content (stop_reason={response_body.get('stop_reason')!r})"
    )

2025-07-25 14:33:47,573 [INFO] webapp_html_benchmark: Using AWS Region: us-east-1


In [34]:
# Demo: ask Claude to transcribe page 1 of the application-form PDF.
pdf_path = "data/BAR4-373351589-7931-37145-13171184-6767.pdf"
prompt = "Please extract the text in the first image"

result = query_claude_with_pdf_image(
    pdf_path=pdf_path,
    prompt=prompt,
    page_number=1,
)
print(result)

Here's the text from the image:

APPLICATION FORM FOR FINANCIAL INSTITUTIONS WHICH ARE SECOND LEVEL BANKS

Acupay Reference Number: 373351589-7931-37145

IMPORTANT: Immediate Action Required...

1. Print out the Application Form for Use by Financial Institutions Which are Second Level Banks (below).
2. Review the form for accuracy and sign at the X.
3. Send the signed the form to Monte Titoli electronically via the Acupay System using one of the following means:
   • FAX to +39 23 600 5560 or +1 646-383-9489 or +44 20 7067 8453
   • OR make a scanned PDF of the signed documentation into one single file and email that file to certify@acupay.com
   • Please send only once (via email or fax but not both)

4. Securely mail/courier the paper original of the signed form for receipt by Acupay in London (at the address below) on or before the 10th day of the month following the month originally signed:
   • Monte Titoli S.p.A. c/o Acupay System LLC
     Unit 3, 1st Floor Cosmopolitan House
   

# 3. Example with handwritten text and tabular format

In [36]:
# Demo: page 1 of a form whose prompt asks for handwriting and table layout.
pdf_path = "data/BNYM_DOC_Y11384.pdf"
prompt = "Please extract all the text (INCLUDING HAND WRITTEN ONE) and try to respect the table structure"

result = query_claude_with_pdf_image(
    pdf_path=pdf_path,
    prompt=prompt,
    page_number=1,
)
print(result)

Here's the extracted text from the image, maintaining the table structure:

```
FORM 1                                1004191            G303964
                                                         4/30/2025
                                                         V11384

ITALIAN DIVIDENDS CERTIFICATION FOR RELIEF AT SOURCE
CERTIFICAZIONE PER RITENUTA RIDOTTA SU DIVIDENDI ITALIANI
THE PRESENT FORM IS A REQUEST TO BENEFIT FROM THE TREATY RATE ON ITALIAN SOURCE DIVIDENDS
IL PRESENTE DOCUMENTO COSTITUISCE RICHIESTA AI FINI DELL'OTTENIMENTO DEI BENEFICI CONVENZIONALI SUI DIVIDENDI ITALIANI

Part I | DECLARATION OF THE BENEFICIARY
Parte I | DICHIARAZIONE DEL BENEFICIARIO
Section A | IDENTIFICATION OF THE BENEFICIAL OWNER
Sezione A | IDENTIFICAZIONE DEL BENEFICIARIO EFFETTIVO
Name of Beneficial Owner / Nome/Ragione sociale del Beneficiario | Internal Reference¹ / Riferimento Interno
ONTARIO POWER GENERATION INC.
Tax ID or other ID number² / Codice Fiscale o altro codice identificativo | 

In [45]:
# Example usage: password-protected PDF.
# The password is read from the PDF_PASSWORD environment variable so that the
# credential is never stored in (or committed with) the notebook.
pdf_path = "data/BNYM_DOC_Z094123.pdf"
prompt = "Please extract all the text (INCLUDING HAND WRITTEN ONE) and try to respect the table structure"
pdf_password = os.environ["PDF_PASSWORD"]  # raises KeyError if the variable is not set

result = query_claude_with_pdf_image(pdf_path, prompt, page_number=1, pdf_password=pdf_password)
print(result)


2025-07-25 14:38:39,819 [INFO] webapp_html_benchmark: Converting password-protected PDF to images


# CERTIFICATE CONTENT

Z084123

ROYAUME DE BELGIQUE | KONINKRIJK BELGIË | KÖNIGREICH BELGIEN | KINGDOM OF BELGIUM
Service Public Fédéral | Federale Overheidsdienst | Föderaler Öffentlichen Dienst | Federal Public Service
FINANCES | FINANCIEN | FINANZEN | FINANCE
Administration générale | Algemene Administratie | Generalverwaltung | General Administration
de la FISCALITÉ | van de FISCALITEIT | des STEUERWESENS | of TAXATION

ATTESTATION (1) | GETUIGSCHRIFT (1) | BESCHEINIGUNG (1) | CERTIFICATE (1)

1
Exemplaire destiné à
l'Administration ...........................
Exemplaar voor de
................................. administratie
Exemplar für die
................................. Verwaltung
Copy for the
Italian | authorities

délivrée aux résidents de la Belgique aux fins d'application
des conventions préventives de la double imposition.

afgeleverd aan inwoners van België met het oog op
de toepassing van dubbelbelastingverdragen.

für in Belgien ansässige Personen im Hinblick auf
die A

# 4. Example with ink stamp

In [47]:
# Example usage: password-protected PDF for the ink-stamp example.
# The password is read from the PDF_PASSWORD environment variable so that the
# credential is never stored in (or committed with) the notebook.
pdf_path = "data/BNYM_DOC_Z095357.pdf"
prompt = "Please extract all the text (INCLUDING HAND WRITTEN ONE) and try to respect the table structure"
pdf_password = os.environ["PDF_PASSWORD"]  # raises KeyError if the variable is not set

result = query_claude_with_pdf_image(pdf_path, prompt, page_number=1, pdf_password=pdf_password)
print(result)

2025-07-25 14:42:12,651 [INFO] webapp_html_benchmark: Converting password-protected PDF to images


# Tax Residency Certification: to be provided by the investor's local tax authority
2022 (From 01/01 --> )

## CERTIFICATION OF THE INVESTOR'S TAX AUTHORITIES OF THE COUNTRY OF RESIDENCE

*CERTIFICAZIONE DELL'AMMINISTRAZIONE FISCALE DEL PAESE DI RESIDENZA*

WE CERTIFY THAT THE BENEFICIARY / *SI CERTIFICA CHE IL BENEFICIARIO*:
Kapitalforeningen ATP Invest, Emerging Markets Obligationer
Kalvebod Brygge 1
1560 Copenhagen V
42 64 05 65 (Tax ID)

TO THE BEST THIS ADMINISTRATION'S KNOWLEDGE
*PER QUANTO RISULTA A QUESTA AMMINISTRAZIONE*

IS RESIDENT IN Denmark IN PURSUANCE OF ART. 4 OF THE CONVENTION IN FORCE
*E' RESIDENTE IN Danimarca AI SENSI DELL'ART. 4 DELLA CONVENZIONE IN VIGORE*

BETWEEN ITALY AND Denmark
*TRA L'ITALIA E Danimarca*

AND THAT IT
*E CHE*

[✓] IS
[✓] *E' SOGGETTO*

[ ] IS NOT
[ ] *NON E' SOGGETTO*

SUBJECT TO THE CORPORATE TAX IN ITS COUNTRY OF RESIDENCE
*ALL'IMPOSTA SUL REDDITO DELLE SOCIETA' NEL PROPRIO PAESE DI RESIDENZA*

DATE ........................
*DATA*

STAMP OF 