# Can we use Claude to extract all the fields from a form

Based on the idea from here: 
https://github.com/co-cddo/form-extractor-prototype_testing/tree/main

Yes we can.

In [1]:
import base64
import io
import json
from ast import List
from pathlib import Path

import boto3
from anthropic import AnthropicBedrock
from anthropic.types.message import Message
from anthropic.types.tool_use_block import ToolUseBlock
from dotenv import load_dotenv
from pdf2image import convert_from_path
from PIL import Image


def encode_image(byte_array):
    """encode image for claude"""
    return base64.b64encode(byte_array).decode("utf-8")


def pdf_to_image_bytes(pdf_path: Path, width: int = 600, dpi: int = 300):
    """
    Convert a multipage PDF to a list of bytes of JPEG images.

    :param pdf_path: str, path to the PDF file
    :param width: int, desired width of the output images (aspect ratio maintained)
    :param dpi: int, dots per inch for rendering the PDF
    :return: list of bytes, each element is a JPEG image
    """
    # Convert PDF to list of PIL Image objects
    images = convert_from_path(pdf_path, dpi=dpi)

    image_bytes_list = []

    for img in images:
        # Resize image if width is specified
        if width:
            ratio = width / float(img.width)
            height = int(ratio * img.height)
            img = img.resize((width, height), Image.LANCZOS)

        # Convert PIL Image to bytes
        img_byte_arr = io.BytesIO()
        img.save(img_byte_arr, format="JPEG")
        img_byte_arr = img_byte_arr.getvalue()

        image_bytes_list.append(img_byte_arr)

    return image_bytes_list


In [2]:
load_dotenv()
boto3.setup_default_session()
client = AnthropicBedrock()

base_prompt = """
Is this a form? Answer Yes or No. 
It's only a form if it contains form field boxes.
Hand drawn forms, questionnaires and surveys are all valid forms.
If it is a form, extract the questions from it using the extract_form_questions tool.
If there is no output, explain why.
"""

# Read the JSON string from the file
with open("extract-form-questions.json", "r") as file:
    json_string = file.read()

extraction_tool = json.loads(json_string)


def process_form(image_bytes: List) -> Message:
    assert isinstance(
        image_bytes, list
    ), f"Input must be a list, {type(image_bytes)} received"

    assert (
        len(image_bytes) < 20
    ), f"Can only process forms up to 20 pages, you gave {len(image_bytes)}"

    messages = [
        {
            "type": "image",
            "source": {
                "type": "base64",
                "media_type": "image/jpeg",
                "data": encode_image(img),
            },
        }
        for img in image_bytes
    ] + [{"type": "text", "text": base_prompt}]

    response = client.messages.create(
        model="anthropic.claude-3-5-sonnet-20240620-v1:0",
        temperature=0.01,
        max_tokens=5000,
        tools=[extraction_tool],
        messages=[{"role": "user", "content": messages}],
    )
    return response


In [3]:
def contains_tool_response(response):
    """determine if the tool was used"""
    return (
        isinstance(response.content[1], ToolUseBlock)
        if len(response.content) > 1
        else False
    )


def check_tool_response(tool_block):
    """Determine if the tool block is valid if it has a pages field"""
    return True if tool_block.input.get("pages", False) else False




In [4]:
# image_bytes = pdf_to_image_bytes("sample_forms/not_a_form.pdf", width=800, dpi=300)
# not_a_response = process_form(image_bytes)
# # not_a_response

In [5]:
# image_bytes = pdf_to_image_bytes("sample_forms/example_form.pdf", width=800, dpi=300)
# a_response = process_form(image_bytes)
# a_response


## Processing a lot of forms


In [6]:
from pathlib import Path

def get_all_files(folder_path):
    folder = Path(folder_path)
    return [str(file) for file in folder.rglob('*') if file.is_file()]


folder_path = './forms_scrape'
all_files = get_all_files(folder_path)

pdf_files = [file for file in all_files if file.lower().endswith('.pdf')]

print(f"There are {len(all_files)} file and {len(pdf_files)} pdfs.")



There are 8300 file and 5943 pdfs.


In [7]:
import random 
sample_forms = random.sample(pdf_files, 10)

In [8]:
encoded_form = [pdf_to_image_bytes(form, width=800, dpi=300) for form in sample_forms]

In [9]:
processed_forms = {}

for img, file in zip(encoded_form, sample_forms):
    response = process_form(img)
    if not contains_tool_response(response):
        print("f{file} file was not a form.")
        break
    tool_response = response.content[1]
    if not check_tool_response(tool_response):
        print("f{file} file was a form, but was an invalid response")
        break

    process_formed[file] = tool_response.input

{file} file was not a form.
