# Use `gpt-4o` to extract data from construction PDFs

## How it works

- Convert each page of the PDF to JPEG (gpt-4o does not support PDFs yet)
- Use multimodal capabilities of `gpt-4o` to extract data from the images
- Enjoy

### Instructions

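- Set the Azure OpenAI environment variables: `AZURE_DEPLOYMENT` (read by this notebook) plus the credentials `AzureChatOpenAI` picks up from the environment (typically `AZURE_OPENAI_ENDPOINT`, `AZURE_OPENAI_API_KEY` and `OPENAI_API_VERSION`)
- Install poppler, which `pdf2image` uses to render PDF pages: `brew install poppler`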

In [28]:
## Convert pages to JPEG

import os
from rich import print

from pdf2image import convert_from_path

# Make sure the output directory exists before writing: saving a page
# image does not create missing parent directories and would fail on a
# fresh checkout. This is a no-op when the directory is already there.
os.makedirs("./var/extract_pdf_data_llm", exist_ok=True)

# Render every page of the PDF to an in-memory image at 500 DPI.
pages = convert_from_path(
    "./data/drawings/Sloping Site - Proposal.pdf",
    dpi=500,
)

# Output files are numbered from 0, so out0.jpg is the first PDF page.
for count, page in enumerate(pages):
    page.save(f"./var/extract_pdf_data_llm/out{count}.jpg", "JPEG")

In [29]:
def read_image_as_base64(image_path: str) -> str:
    """Return the contents of the file at ``image_path`` as base64 text.

    Args:
        image_path: Path of the file to read; it is opened in binary mode.

    Returns:
        The file's bytes, base64-encoded and decoded to a UTF-8 string.
    """
    from base64 import b64encode

    with open(image_path, mode="rb") as handle:
        raw_bytes = handle.read()

    return b64encode(raw_bytes).decode("utf-8")

In [30]:
from langchain_openai.chat_models import AzureChatOpenAI
from langchain.pydantic_v1 import BaseModel, Field
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import HumanMessage


# Shared chat model client used by the extraction functions below.
# The Azure deployment name comes from the AZURE_DEPLOYMENT environment
# variable (None when it is unset).
llm = AzureChatOpenAI(azure_deployment=os.environ.get("AZURE_DEPLOYMENT"))

In [31]:
## Extract drawing data

# Bounding box of a single object on a drawing. All four values are integers
# expressed in millimetres on the drawing, as stated in the field descriptions.
# NOTE(review): documented with comments rather than a class docstring because
# a docstring would presumably be copied into the schema description sent to
# the model - confirm before adding one.
# NOTE(review): the origin and corner that (x, y) refers to are not specified.
class BoundariesSchema(BaseModel): 
    x: int = Field(..., description="X coordinate of the object in mm on the drawing")
    y: int = Field(..., description="Y coordinate of the object in mm on the drawing")
    height: int = Field(..., description="Height of the object in mm on the drawing")
    width: int = Field(..., description="Width of the object in mm on the drawing")

# One object depicted on a drawing: its name, free-form details, an estimated
# area in square metres, and its bounding box (a BoundariesSchema).
# NOTE(review): `description` is described as "Optional" but is declared as a
# required `str`; `estimated_area` accepts None but, being declared with
# Field(...), presumably still has to be supplied - confirm this is intended.
class ObjectSchema(BaseModel): 
    name: str = Field(..., description="Name of the object")
    description: str = Field(..., description="Optional free form details about the object")
    estimated_area: float | None = Field(..., description="Estimated area of the object in square meters")
    boundaries: BoundariesSchema = Field(..., description="Bounding box (x, y, height, width) of the object in mm on the drawing. Assuming drawing is A3")

# Top-level structured result requested from the model for one page image:
# a flag saying whether the page is a drawing, its textual metadata, the full
# extracted text, and the list of objects found on it.
# NOTE(review): every field is declared with Field(...), so presumably the
# model must fill all of them even when `is_drawing` is false - confirm.
class OutputSchema(BaseModel): 
    is_drawing: bool = Field(..., description="Detect if input image is drawing")
    title: str = Field(..., description="Title of the drawing")
    name: str = Field(..., description="Name of the drawing")
    date: str = Field(..., description="Date of the drawing")
    notes: str = Field(..., description="Notes of the drawing")
    contact_information: str = Field(..., description="Contact information")
    # Area is kept as free text (str), unlike ObjectSchema.estimated_area.
    area: str = Field(..., description="Area information of the drawing in square meters")
    description: str = Field(..., description="Description of the drawing")
    extracted_text: str = Field(..., description="All text for search purposes")
    objects: list[ObjectSchema] = Field(..., description="List of objects depicted on the drawing")


def extract_drawing_info(page_content_base64: str) -> OutputSchema:
    """Ask the model for structured drawing data about one page image.

    Args:
        page_content_base64: Base64-encoded image data; it is sent to the
            model inline as a ``data:image/jpeg`` URL.

    Returns:
        The model's answer, structured according to ``OutputSchema``.
    """
    instruction_part = {
        "type": "text",
        "text": "Parse drawing information and list of objects on this drawing",
    }
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{page_content_base64}"},
    }
    request_message = HumanMessage(content=[instruction_part, image_part])

    structured_llm = llm.with_structured_output(OutputSchema)
    pipeline = ChatPromptTemplate.from_messages([request_message]) | structured_llm

    # The prompt has no template variables, so the input mapping is empty.
    return pipeline.invoke({})

def print_drawing_info(page_path: str) -> None:
    """Print the structured drawing data extracted from the image at ``page_path``."""
    drawing_info = extract_drawing_info(read_image_as_base64(page_path))
    print(drawing_info)

![Page 4](./var/extract_pdf_data_llm/out4.jpg)

In [32]:
# Run the structured extraction on out4.jpg, i.e. the page that the
# conversion step saved with zero-based index 4.
print_drawing_info("./var/extract_pdf_data_llm/out4.jpg")

In [33]:
## Extract text from PDF

def extract_text_data(page_content_base64: str):
    """Ask the model to transcribe one page image as structured Markdown.

    Args:
        page_content_base64: Base64-encoded image data; it is sent to the
            model inline as a ``data:image/jpeg`` URL.

    Returns:
        The raw response of the chat model; callers read its ``content``
        attribute to get the generated text.
    """
    message_parts = [
        {
            "type": "text",
            "text": "Get text structured and grouped by topics. Output tables as Markdown tables. List as a lists. Format as Markdown",
        },
        {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{page_content_base64}"},
        },
    ]
    request_message = HumanMessage(content=message_parts)
    pipeline = ChatPromptTemplate.from_messages([request_message]) | llm

    # The prompt has no template variables, so the input mapping is empty.
    return pipeline.invoke({})

def print_text_data(page_path: str) -> None:
    """Print the Markdown text extracted from the image at ``page_path``."""
    encoded_page = read_image_as_base64(page_path)
    response = extract_text_data(encoded_page)
    print(response.content)

![Page 1](./var/extract_pdf_data_llm/out1.jpg)

In [34]:
# Run the Markdown text extraction on out1.jpg, i.e. the page that the
# conversion step saved with zero-based index 1.
print_text_data("./var/extract_pdf_data_llm/out1.jpg")