In [None]:
!pip install pdf2image
# pdf2image requires `poppler` to be installed. On macOS, install it with
# `brew install poppler`; on Ubuntu, with `apt-get install -y poppler-utils`.

In [None]:
import base64
import requests
import json
from pdf2image import convert_from_path

def pdf_to_images(pdf_path, dpi=200):
    """
    Render the PDF at ``pdf_path`` into images via pdf2image.

    ``dpi`` is forwarded unchanged to ``convert_from_path`` as the rendering
    resolution. The value returned by ``convert_from_path`` (a list of PIL
    images, one per page) is handed back to the caller as-is.
    """
    page_images = convert_from_path(pdf_path, dpi=dpi)
    return page_images

def image_to_base64(pil_image):
    """
    Serialize a PIL image as PNG and return those bytes base64-encoded,
    decoded to a ``str``.
    """
    import io

    # Encode the image into an in-memory stream rather than a temp file.
    with io.BytesIO() as png_stream:
        pil_image.save(png_stream, format="PNG")
        png_bytes = png_stream.getvalue()

    return base64.b64encode(png_bytes).decode("utf-8")

def call_albert_api_with_image(
    base64_image,
    api_key,
    question="Please perform OCR on this page",
    timeout=120,
):
    """
    Send one base64-encoded PNG image plus a text prompt to the Albert
    chat-completions endpoint and return the parsed JSON response.

    Args:
        base64_image: Base64 text of a PNG image, without the ``data:`` URI
            prefix (it is added here).
        api_key: Token sent as ``Authorization: Bearer <api_key>``.
        question: Text prompt sent alongside the image.
        timeout: Seconds to wait for the server, forwarded to
            ``requests.post``. Pass ``None`` to wait indefinitely.

    Returns:
        The response body decoded from JSON.

    Raises:
        RuntimeError: If the server replies with a status other than 200.
        requests.exceptions.RequestException: On connection failure or when
            the timeout expires.
    """
    url = "https://albert.api.etalab.gouv.fr/v1/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }
    model = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"

    messages = [
        {
            "role": "system",
            "content": "You are a helpful assistant.",
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": question,
                },
                {
                    # The image travels inline as a data URI.
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/png;base64,{base64_image}",
                    },
                },
            ],
        },
    ]

    payload = {
        "model": model,
        "messages": messages,
        "temperature": 0.15,
    }

    # Without an explicit timeout, requests waits forever on a stalled
    # connection, which would hang the whole page loop.
    response = requests.post(url, headers=headers, json=payload, timeout=timeout)
    if response.status_code != 200:
        raise RuntimeError(
            f"Request failed with status {response.status_code}: {response.text}"
        )
    return response.json()


In [None]:

# --- Configuration: edit these values before running the cell. ---
pdf_path = "/PATH/TO/PDF.pdf"
api_key = "***"  # placeholder; avoid committing a real key with the notebook
output_file = "ocr_result.txt"

# The prompt is identical for every page, so build it once outside the loop.
question = (
    "Please perform OCR on this image. Only give the raw content, do not "
    "include boilerplate in your answer such as 'Certainly, here it is' or "
    "'Yes, here you go'. If the page is blank, return 'blank page'."
)

images = pdf_to_images(pdf_path, dpi=200)
with open(output_file, "w", encoding="utf-8") as f:
    for page_idx, img in enumerate(images, start=1):
        print(f"Processing page {page_idx}...")
        base64_img = image_to_base64(img)

        # Stays empty when OCR fails, so the page still gets its separator.
        page_text = ""
        try:
            response_json = call_albert_api_with_image(base64_img, api_key, question=question)
            if "choices" in response_json and response_json["choices"]:
                content = response_json["choices"][0]["message"]["content"]
                print(f"Page {page_idx} OCR result:\n{content}\n")
                page_text = f"{content}\n"
            else:
                print(f"Page {page_idx}: Unexpected response:\n{response_json}\n")
        except Exception as e:
            # Best effort: report the failure and carry on with the next page.
            print(f"Error OCRing page {page_idx}: {e}")

        # Always emit the form-feed page separator, even for failed pages,
        # so the N-th segment of the output matches the N-th PDF page.
        f.write(f"{page_text}\f")