# Can we use Claude to extract all the fields from a form?

Based on the idea from here: 
https://github.com/co-cddo/form-extractor-prototype_testing/tree/main

Yes, we can.

In [1]:
import base64
import io
import json
from ast import List
from pathlib import Path

import boto3
from anthropic import AnthropicBedrock
from anthropic.types.message import Message
from anthropic.types.tool_use_block import ToolUseBlock
from dotenv import load_dotenv
from pdf2image import convert_from_path
from PIL import Image


def encode_image(byte_array):
    """Base64-encode raw image bytes for Claude.

    :param byte_array: bytes-like object holding the image data
    :return: str, the base64 encoding of ``byte_array`` decoded as UTF-8 text
    """
    # Return the string itself. A trailing comma inside the parentheses would
    # wrap it in a one-element tuple, which is not usable as image data.
    return base64.b64encode(byte_array).decode("utf-8")


def pdf_to_image_bytes(pdf_path: Path, width: int = 600, dpi: int = 300):
    """
    Render every page of a PDF and return the pages as JPEG-encoded bytes.

    :param pdf_path: location of the PDF file, handed unchanged to
        ``convert_from_path`` (callers in this notebook pass plain strings)
    :param width: int, target width of each output image; the height is
        scaled by the same ratio. A falsy value skips the resize step.
    :param dpi: int, dots per inch used when rendering the PDF
    :return: list of bytes, one JPEG image per page, in page order
    """

    def _page_to_jpeg(page):
        """Optionally scale one rendered page, then serialise it as JPEG."""
        if width:
            scale = width / float(page.width)
            page = page.resize((width, int(scale * page.height)), Image.LANCZOS)

        with io.BytesIO() as buffer:
            page.save(buffer, format="JPEG")
            return buffer.getvalue()

    return [_page_to_jpeg(page) for page in convert_from_path(pdf_path, dpi=dpi)]


In [2]:
# Environment and client setup for the Bedrock-hosted Claude client.
load_dotenv()
boto3.setup_default_session()
client = AnthropicBedrock()

# Prompt sent after the page images in every request (see process_form).
base_prompt = """
Is this a form? Answer Yes or No. 
It's only a form if it contains form field boxes.
Hand drawn forms, questionnaires and surveys are all valid forms.
If it is a form, extract the questions from it using the extract_form_questions tool.
If there is no output, explain why.
"""

# Read the JSON string from the file.
# JSON is UTF-8 by specification, so decode explicitly instead of relying on
# the platform's locale default encoding.
with open("extract-form-questions.json", "r", encoding="utf-8") as file:
    json_string = file.read()

# Tool definition passed to the model via ``tools=[extraction_tool]``.
extraction_tool = json.loads(json_string)


def process_form(image_bytes: list) -> Message:
    """Send page images to Claude and ask it to extract the form questions.

    Each page is attached as a base64 JPEG image block, followed by
    ``base_prompt`` as the final text block of a single user message. The
    request offers ``extraction_tool`` to the model.

    :param image_bytes: list of page images, one item per page. Raw
        ``bytes``/``bytearray`` items are base64-encoded here; any other item
        (e.g. an already-encoded ``str``) is sent unchanged.
    :return: the ``Message`` returned by ``client.messages.create``
    :raises AssertionError: if ``image_bytes`` is not a list, or holds more
        than 20 pages
    """
    assert isinstance(
        image_bytes, list
    ), f"Input must be a list, {type(image_bytes)} received"

    # Inclusive limit: a form of exactly 20 pages is accepted, matching the
    # "up to 20 pages" wording of the error message.
    assert (
        len(image_bytes) <= 20
    ), f"Can only process forms up to 20 pages, you gave {len(image_bytes)}"

    content = [
        {
            "type": "image",
            "source": {
                "type": "base64",
                "media_type": "image/jpeg",
                # A "base64" source carries text, so raw image bytes are
                # encoded before they are placed in the request.
                "data": (
                    base64.b64encode(img).decode("utf-8")
                    if isinstance(img, (bytes, bytearray))
                    else img
                ),
            },
        }
        for img in image_bytes
    ]
    # The prompt goes last so that it follows all of the page images.
    content.append({"type": "text", "text": base_prompt})

    response = client.messages.create(
        model="anthropic.claude-3-5-sonnet-20240620-v1:0",
        temperature=0.01,
        max_tokens=5000,
        tools=[extraction_tool],
        messages=[{"role": "user", "content": content}],
    )
    return response


In [5]:
# Negative example: run the pipeline on a PDF that is not a form.
image_bytes = pdf_to_image_bytes("forms/not_a_form.pdf", width=800, dpi=300)
not_a_response = process_form(image_bytes)
# Bare name as the last expression so the notebook displays the Message.
not_a_response

Message(id='msg_bdrk_01EXkcKsLM56fk8m8ioYjvp7', content=[TextBlock(text='No, this is not a form. The images contain project documentation, planning details, and technical information about a prototype development project. There are no form field boxes or questions that would be typical in a form or questionnaire. \n\nThe content includes:\n\n1. Project purpose and overview\n2. User requirements and deliverables\n3. Project sizing and timeline\n4. Proposed planning with sprint details\n5. Technical tasks and considerations\n6. Team member responsibilities\n7. Diagrams of the project architecture\n8. Ideas backlog\n\nSince this is not a form, there are no form questions to extract using the extract_form_questions tool. The tool is designed for forms with specific questions and answer fields, which are not present in these project planning and documentation images.', type='text')], model='claude-3-5-sonnet-20240620', role='assistant', stop_reason='end_turn', stop_sequence=None, type='mess

In [6]:
# Positive example: run the pipeline on a PDF that is a form.
image_bytes = pdf_to_image_bytes("forms/example_form.pdf", width=800, dpi=300)
a_response = process_form(image_bytes)
# Bare name as the last expression so the notebook displays the Message.
a_response


Message(id='msg_bdrk_017QWV4g99eDHjZRzEYMEw8g', content=[TextBlock(text="Yes, this is a form. The images show multiple pages of a legal claim form with various fields and boxes for entering information. I'll proceed to extract the questions using the extract_form_questions tool.", type='text'), ToolUseBlock(id='toolu_bdrk_01Gg6J4mEtrJBf2ZVosJHps2', input={'pages': [{'id': 1, 'question_text': 'Claimant(s) name(s) and address(es) including postcode', 'answer_type': 'address', 'answer_settings': {'input_type': 'uk_address'}}, {'id': 2, 'question_text': 'Defendant(s) name and address(es) including postcode', 'answer_type': 'address', 'answer_settings': {'input_type': 'uk_address'}}, {'id': 3, 'question_text': 'Brief details of claim', 'answer_type': 'text'}, {'id': 4, 'question_text': 'Value', 'answer_type': 'text'}, {'id': 5, 'question_text': 'Amount claimed', 'answer_type': 'number'}, {'id': 6, 'question_text': 'Court fee', 'answer_type': 'number'}, {'id': 7, 'question_text': "Legal repr

In [29]:
def contains_tool_response(response):
    """Determine whether the model used the tool in this response.

    Every content block is inspected, so a ``ToolUseBlock`` is found whether
    or not a text block precedes it. Callers that need the block itself
    should search ``response.content`` for it rather than assume an index.

    :param response: object with a ``content`` sequence of content blocks
    :return: bool, True if any content block is a ``ToolUseBlock``
    """
    return any(isinstance(block, ToolUseBlock) for block in response.content)


def check_tool_response(tool_block):
    """Report whether the tool block's input has a non-empty ``pages`` entry.

    :param tool_block: object whose ``input`` attribute is a dict-like mapping
    :return: bool, True when ``input["pages"]`` exists and is truthy
    """
    pages = tool_block.input.get("pages")
    return bool(pages)


# In a_response, content[0] is the TextBlock and content[1] is the ToolUseBlock.
check_tool_response(a_response.content[1])

True