In [2]:
import os
import io
import base64

from pydantic import BaseModel, Field
from typing import List, Any
from pdf2image import convert_from_path

from openai import AzureOpenAI

from dotenv import load_dotenv

load_dotenv()

api_base = os.getenv("AZURE_OPENAI_ENDPOINT")
deployment_name = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME")

client = AzureOpenAI(
    base_url=f"{api_base}/openai/deployments/{deployment_name}",
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
)

extraction_prompt = """
You are an expert in extracting metadata from permit images. Given an image of a permit, extract the following metadata fields:
- Issue Date
  - The date when the permit was issued, formatted as dd MMMM yyyy (e.g., 25 December 2023)
  - Use english month names only
- Expiration Date
  - The date when the permit expires, formatted as dd MMMM yyyy (e.g., 25 December 2024)
  - Use english month names only
- Permit Type: Permit type is categorized as PLO, KKPR/KKPRL, or Ijin Lingkungan
- Summary: A brief summary of the permit content in no more than 100 words.
  - Summarize the main points of the permit, including its purpose, scope, and any important conditions or requirements mentioned.

Return the extracted metadata in the JSON format. Do not include any additional text or explanations.
"""

class PermitMetadata(BaseModel):
    issue_date: str = Field(..., description="The date when the permit was issued.")
    expiration_date: str = Field(..., description="The date when the permit expires.")
    permit_type: str = Field(..., description="The type/category of the permit.")
    summary: str = Field(..., description="A brief summary of the permit content.")

def base64_encoded_image(image, format: str = 'JPEG') -> str:
    """
    Generate a base64 encoded string from image object.
    
    Args:
        image: Image object (e.g., PIL.Image).
        
    Returns:
        str: Base64 encoded string of the image.
    """

    buffer = io.BytesIO()
    image.save(buffer, format=format)
    buffer.seek(0)
    encoded_str = base64.b64encode(buffer.read()).decode('utf-8')
    buffer.close()

    image_data = f"data:image/{format.lower()};base64,{encoded_str}"

    return image_data

def create_message(system_prompt: str, image_content: List[Any]) -> dict:
    """
    Create a message payload for the chat API.

    Args:
        system_prompt (str): The system prompt to include in the message.
        image_content (List[Any]): The content to include in the message.

    Returns:
        dict: The message payload.
    """
    message = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": [
            {
                "type" : "text",
                "text": "Extract the metadata from the permit image."
            }
            ]
        }
    ]

    for content in image_content:
        message[1]['content'].append({"type": "image_url", "image_url": {"url" : base64_encoded_image(content)}})

    return message

In [12]:
import instructor

instructor_client = instructor.from_provider("azure_openai/gpt-4o")

FILE_PATH = "data/folder/KPI_Permits/Sertifikat_PLO_SKPI/PLO RU II/PLO CDU KILANG SEI PAKNING.pdf"

images = convert_from_path(FILE_PATH)
message_payload = create_message(extraction_prompt, images)

user, completion = instructor_client.chat.completions.create_with_completion(
    messages=message_payload,
    response_model=PermitMetadata
)

INFO:instructor.auto_client:Initializing azure_openai provider with model gpt-4o
INFO:instructor.auto_client:Client initialized
INFO:httpx:HTTP Request: POST https://openai-permit.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-02-01 "HTTP/1.1 200 OK"


In [98]:
user

PermitMetadata(issue_date='19 December 2022', expiration_date='18 July 2026', permit_type='PLO', summary='This permit approves the operational feasibility of the Crude Distillation Unit (CDU) at PT Kilang Pertamina Internasional, located in Bengkalis, Riau. The installation comprises crude oil-based processing facilities to produce intermediate products such as Naptha, Kero, ADO, and LSWR. The CDU satisfies inspection requirements, including technical standards, design limits (50000 BPSD capacity), and periodic technical inspections every 6 months for safety systems like fire protection, instrument systems, and temperature controls. Inspection activities and methods are detailed, emphasizing compliance with established safety and environmental regulations. The permit remains valid until 18 July 2026.')

In [15]:
from datetime import datetime
from pprint import pprint

metadata = {
    "file_path": FILE_PATH,
    "issue_date": datetime.strptime(user.issue_date, "%d %B %Y").strftime("%Y-%m-%d"),
    "expiration_date": datetime.strptime(user.expiration_date, "%d %B %Y").strftime("%Y-%m-%d"),
    "permit_type": user.permit_type,
    "summary": user.summary
}

pprint(metadata)

{'expiration_date': '2026-07-18',
 'file_path': 'data/folder/KPI_Permits/Sertifikat_PLO_SKPI/PLO RU II/PLO CDU '
              'KILANG SEI PAKNING.pdf',
 'issue_date': '2022-12-19',
 'permit_type': 'PLO',
 'summary': 'The permit is approval for the operational feasibility of the '
            'Crude Distillation Unit (CDU) by PT Kilang Pertamina '
            'Internasional. It outlines design and operational specifications, '
            'including technical standards, capacities, and safety protocols. '
            'Inspections and risk evaluations are mandated, emphasizing '
            'adherence to regulatory standards, regular equipment checks, and '
            'risk mitigation measures to ensure safe and efficient operation. '
            'Compliance with environmental guidelines is emphasized.'}


In [49]:
print(completion.usage)

CompletionUsage(completion_tokens=160, prompt_tokens=5771, total_tokens=5931, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=None, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=None), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=5760))


In [1]:
import logging
import asyncio

from tenacity import retry, wait_exponential, stop_after_attempt, before_sleep_log

logging.basicConfig(level=logging.INFO)

@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=4, max=10),
    before_sleep=before_sleep_log(logging.getLogger(), logging.WARNING)
)
async def process_permit_metadata_async(file_path: str):
    loop = asyncio.get_event_loop()
    images = await loop.run_in_executor(None, convert_from_path, file_path)
    message_payload = create_message(extraction_prompt, images)
    logging.info(f"Processing file: {file_path} with {len(images)} pages.")

    user, completion = await loop.run_in_executor(
        None,
        instructor_client.chat.completions.create_with_completion,
        message_payload,
        PermitMetadata
    )

    return {
        "file_path": file_path,
        "issue_date": datetime.strptime(user.issue_date, "%d %B %Y").strftime("%Y-%m-%d"),
        "expiration_date": datetime.strptime(user.expiration_date, "%d %B %Y").strftime("%Y-%m-%d"),
        "permit_type": user.permit_type,
        "summary": user.summary,
        "usage": completion.usage
    }

In [11]:
import os

folder_path = "data/folder"
file_list = []
for root, dirs, files in os.walk(folder_path):
    for file in files:
        if file.lower().endswith(".pdf"):
            file_list.append(os.path.join(root, file))

print(file_list)

['data/folder/PGN_Permits/4. Perling_PGN/Persetujuan Lingkungan SOR 1/SK Persetujuan Lingkungan UKL-UPL Stasiun Gas Dumai.PDF', 'data/folder/PGN_Permits/4. Perling_PGN/PL Th 2023 - Jargas CGP/Perling - PKPLH  Jargas Kota Yogyakarta dan Kab Sleman.pdf', 'data/folder/PGN_Permits/4. Perling_PGN/Persetujuan Lingkungan PMO/PKPLH UKL UPL KIK-Mangkang, Kendal.pdf', 'data/folder/PGN_Permits/4. Perling_PGN/Persetujuan Lingkungan SOR 2/SK Persetujuan PKPLH Upgrading Offtake Garawangi.PDF', 'data/folder/PGN_Permits/4. Perling_PGN/Persetujuan Lingkungan SOR 2/SK Persetujuan PKPLH Upgrading Offtake Sunyaragi.PDF', 'data/folder/PGN_Permits/4. Perling_PGN/Persetujuan Lingkungan SOR 2/SK Persling UKL-UPL Cabang Jkt.pdf', 'data/folder/PGN_Permits/4. Perling_PGN/Persetujuan Lingkungan SOR 2/SK Persetujuan Lingkungan UKL-UPL Cabang Bogor.PDF', 'data/folder/PGN_Permits/4. Perling_PGN/Persetujuan Lingkungan SOR 2/SK Persetujuan SKKL AMDAL PDJB.PDF', 'data/folder/PGN_Permits/4. Perling_PGN/Persetujuan Lingk

In [12]:
from PyPDF2 import PdfReader

file_pages = []

def count_pdf_pages(file_path):
    reader = PdfReader(file_path)
    return len(reader.pages)

for file_path in file_list:
    file_pages.append(count_pdf_pages(file_path))

In [10]:
long_files = []

for file_path, num_pages in zip(file_list, file_pages):
    if num_pages > 10:
        print(f"File {file_path} has {num_pages} pages.")
        long_files.append(file_path)

print(f"Files with more than 10 pages: {len(long_files)}")

File data/folder/PGN_Permits/4. Perling_PGN/Persetujuan Lingkungan SOR 1/SK Persetujuan Lingkungan UKL-UPL Stasiun Gas Dumai.PDF has 37 pages.
File data/folder/PGN_Permits/4. Perling_PGN/PL Th 2023 - Jargas CGP/Perling - PKPLH  Jargas Kota Yogyakarta dan Kab Sleman.pdf has 58 pages.
File data/folder/PGN_Permits/4. Perling_PGN/Persetujuan Lingkungan PMO/PKPLH UKL UPL KIK-Mangkang, Kendal.pdf has 31 pages.
File data/folder/PGN_Permits/4. Perling_PGN/Persetujuan Lingkungan SOR 2/SK Persetujuan PKPLH Upgrading Offtake Garawangi.PDF has 24 pages.
File data/folder/PGN_Permits/4. Perling_PGN/Persetujuan Lingkungan SOR 2/SK Persetujuan PKPLH Upgrading Offtake Sunyaragi.PDF has 52 pages.
File data/folder/PGN_Permits/4. Perling_PGN/Persetujuan Lingkungan SOR 2/SK Persling UKL-UPL Cabang Jkt.pdf has 48 pages.
File data/folder/PGN_Permits/4. Perling_PGN/Persetujuan Lingkungan SOR 2/SK Persetujuan Lingkungan UKL-UPL Cabang Bogor.PDF has 17 pages.
File data/folder/PGN_Permits/4. Perling_PGN/Persetuj

In [5]:
short_files = []

for file_path, num_pages in zip(file_list, file_pages):
    if num_pages <= 10:
        short_files.append(file_path)

print(f"Files with 10 or fewer pages: {len(short_files)}")

Files with 10 or fewer pages: 63


In [16]:
tasks = [process_permit_metadata_async(file_path) for file_path in short_files[:20]]
metadata_result_test = await asyncio.gather(*tasks)

INFO:root:Processing file: data/folder/KPI_Permits/Sertifikat_PLO_SKPI/PLO RU VI/PIPELINE CRUDE OIL 24IN FROM SPM 150.000 DWT_1.pdf with 2 pages.
INFO:root:Processing file: data/folder/KPI_Permits/Sertifikat_PLO_SKPI/PLO RU VI/PIPELINE NAPTHA 16X20IN FROM SPM 35.000 DWT.pdf with 2 pages.
INFO:root:Processing file: data/folder/KPI_Permits/Sertifikat_PLO_SKPI/PLO RU VI/PIPELINE HOMC 16IN FROM SPM 17.500 DWT_1.pdf with 2 pages.
INFO:root:Processing file: data/folder/KPI_Permits/Sertifikat_PLO_SKPI/PLO RU VI/PIPELINE CRUDE OIL 36IN FROM SPM 150.000 DWT_1.pdf with 2 pages.
INFO:root:Processing file: data/folder/KPI_Permits/Sertifikat_PLO_SKPI/PLO RU VI/PIPELINE FLUSHING OIL 14IN FROM SPM 17.500 DWT_1.pdf with 2 pages.
INFO:root:Processing file: data/folder/KPI_Permits/Sertifikat_PLO_SKPI/PLO RU V/1378_SKPI_SKPI OM DAN LAWE2 Feb 2018-Jan 2023.pdf with 1 pages.
INFO:root:Processing file: data/folder/KPI_Permits/Sertifikat_PLO_SKPI/PLO RU II/PLO UTL KILANG SEI PAKNING.pdf with 5 pages.
INFO:ro

In [88]:
metadata_result = []

for file_path in short_files:
    metadata = process_permit_metadata(file_path)
    metadata_result.append(metadata)    

INFO:root:Processing file: data/folder/KPI_Permits/Sertifikat_PLO_SKPI/PLO RU II/PLO CDU KILANG SEI PAKNING.pdf with 6 pages.
INFO:httpx:HTTP Request: POST https://openai-permit.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-02-01 "HTTP/1.1 200 OK"
INFO:root:Processing file: data/folder/KPI_Permits/Sertifikat_PLO_SKPI/PLO RU II/PLO ITY KILANG SEI PAKNING.pdf with 6 pages.
INFO:httpx:HTTP Request: POST https://openai-permit.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-02-01 "HTTP/1.1 200 OK"
INFO:root:Processing file: data/folder/KPI_Permits/Sertifikat_PLO_SKPI/PLO RU II/PLO UTL KILANG SEI PAKNING.pdf with 5 pages.
INFO:httpx:HTTP Request: POST https://openai-permit.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-02-01 "HTTP/1.1 200 OK"
INFO:root:Processing file: data/folder/KPI_Permits/Sertifikat_PLO_SKPI/PLO RU VI/PLO SPL-SPM 165000 DWT.pdf with 4 pages.
INFO:httpx:HTTP Request: POST https://ope

In [91]:
metadata_result

[{'file_path': 'data/folder/KPI_Permits/Sertifikat_PLO_SKPI/PLO RU II/PLO CDU KILANG SEI PAKNING.pdf',
  'issue_date': '2022-12-19',
  'expiration_date': '2026-07-18',
  'permit_type': 'PLO',
  'summary': "This permit grants 'fit-for-operation' approval to PT Kilang Pertamina Internasional for their Crude Distillation Unit (CDU) at Bukit Batu, Bengkalis Riau. The approval follows technical inspections ensuring compliance with safety and operational standards. It specifies design and operational parameters, risk analysis, and inspection intervals. PT Kilang Pertamina Internasional bears full responsibility for environmental safety and adherence to conditions outlined in the permit. The document also highlights technical details and inspection protocols for the installation and equipment used.",
  'usage': CompletionUsage(completion_tokens=133, prompt_tokens=5771, total_tokens=5904, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=None, audio_tokens=0, reasoni

In [93]:
import json

# Convert usage objects to dictionaries for JSON serialization
def serialize_metadata(metadata_list):
    serialized = []
    for item in metadata_list:
        serialized_item = item.copy()
        if 'usage' in serialized_item and serialized_item['usage'] is not None:
            serialized_item['usage'] = serialized_item['usage'].model_dump()
        serialized.append(serialized_item)
    return serialized

# Write metadata_result to JSON file
output_file = "data/metadata_result.json"

with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(serialize_metadata(metadata_result), f, indent=2, ensure_ascii=False)

print(f"Metadata saved to {output_file}")

Metadata saved to data/metadata_result.json
