In [2]:
!pip install pdf2image

Collecting pdf2image
  Obtaining dependency information for pdf2image from https://files.pythonhosted.org/packages/62/33/61766ae033518957f877ab246f87ca30a85b778ebaad65b7f74fa7e52988/pdf2image-1.17.0-py3-none-any.whl.metadata
  Using cached pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Collecting pillow (from pdf2image)
  Obtaining dependency information for pillow from https://files.pythonhosted.org/packages/b9/d8/f6004d98579a2596c098d1e30d10b248798cceff82d2b77aa914875bfea1/pillow-11.1.0-cp312-cp312-macosx_11_0_arm64.whl.metadata
  Downloading pillow-11.1.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (9.1 kB)
Using cached pdf2image-1.17.0-py3-none-any.whl (11 kB)
Downloading pillow-11.1.0-cp312-cp312-macosx_11_0_arm64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: pillow, pdf2image
Successfully installed pdf2image-1.17.0 pillow-11.1.0

[1m[[0m[34;49

In [1]:
import base64
import json
import os

import requests
from pdf2image import convert_from_path

def pdf_to_images(pdf_path, dpi=200):
    """
    Render the PDF at ``pdf_path`` with pdf2image's ``convert_from_path``.

    ``dpi`` is forwarded unchanged as the rendering resolution.
    Returns the result of ``convert_from_path``: a list of PIL images,
    one per page.
    """
    page_images = convert_from_path(pdf_path, dpi=dpi)
    return page_images

def image_to_base64(pil_image):
    """
    Serialize an image as PNG and return the PNG bytes as base64 text.

    ``pil_image`` only needs a ``save(file_obj, format=...)`` method.
    The return value is a ``str`` (the base64 bytes decoded as UTF-8).
    """
    from io import BytesIO

    png_stream = BytesIO()
    pil_image.save(png_stream, format="PNG")
    # getvalue() yields the full buffer regardless of the stream position.
    encoded = base64.b64encode(png_stream.getvalue())
    return encoded.decode("utf-8")

def call_albert_api_with_image(
    base64_image,
    api_key,
    question="Please perform OCR on this page",
    model="mistralai/Mistral-Small-3.1-24B-Instruct-2503",
    timeout=300,
):
    """
    Send one base64-encoded PNG plus a text prompt to the Albert
    chat-completions endpoint and return the decoded JSON response.

    Args:
        base64_image: Base64 text of a PNG image. The ``data:image/png;base64,``
            prefix is added here, so pass the bare base64 string.
        api_key: Token sent as ``Authorization: Bearer <api_key>``.
        question: Text prompt sent alongside the image in the user message.
        model: Model identifier placed in the request payload.
        timeout: Seconds handed to ``requests.post`` as its ``timeout``.
            Pass ``None`` to wait indefinitely (the previous behaviour).

    Returns:
        The parsed JSON body of a 200 response.

    Raises:
        RuntimeError: If the HTTP status is anything other than 200; the
            message carries the status code and response text.
        requests.exceptions.RequestException: On connection errors or when
            the timeout elapses.
    """
    url = "https://albert.api.etalab.gouv.fr/v1/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }

    messages = [
        {
            "role": "system",
            "content": "You are a helpful assistant.",
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": question,
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/png;base64,{base64_image}",
                    },
                },
            ],
        },
    ]

    payload = {
        "model": model,
        "messages": messages,
        "temperature": 0.15,
    }

    # json= lets requests serialize the body; timeout= keeps a stalled
    # connection from blocking the notebook forever.
    response = requests.post(url, headers=headers, json=payload, timeout=timeout)
    if response.status_code != 200:
        raise RuntimeError(
            f"Request failed with status {response.status_code}: {response.text}"
        )
    return response.json()


In [2]:

# Convert every page of the PDF to an image, OCR each page through the
# Albert API, and write the recognised text to `output_file`.
pdf_path = "/PATH/TO/PDF.pdf"
output_file = "ocr_result.txt"

# Prefer the ALBERT_API_KEY environment variable so the token is never stored
# in the notebook; "***" is only the original placeholder fallback.
api_key = os.environ.get("ALBERT_API_KEY", "***")

# The prompt is identical for every page, so it is built once, outside the loop.
question = (
    "Please perform OCR on this image. "
    "Only give the raw content, do not include boilerplate in your answer "
    "such as 'Certainly, here it is' or 'Yes, here you go'. "
    "If the page is blank, return 'blank page'."
)

images = pdf_to_images(pdf_path, dpi=200)

with open(output_file, "w", encoding="utf-8") as f:
    for page_idx, img in enumerate(images, start=1):
        print(f"Processing page {page_idx}...")
        base64_img = image_to_base64(img)

        try:
            response_json = call_albert_api_with_image(base64_img, api_key, question=question)

            # The OCR text is read from choices[0].message.content of the response.
            if "choices" in response_json and len(response_json["choices"]) > 0:
                content = response_json["choices"][0]["message"]["content"]
                print(f"Page {page_idx} OCR result:\n{content}\n")
                # Each page is terminated by a form feed so pages stay separable.
                f.write(f"{content}\n\f")
            else:
                # NOTE: nothing is written for this page, so the output file
                # will contain fewer form-feed-separated pages than the PDF.
                print(f"Page {page_idx}: Unexpected response:\n{response_json}\n")

        except Exception as e:
            # Best effort: report the failure and continue with the next page.
            print(f"Error OCRing page {page_idx}: {e}")

Processing page 1...
Page 1 OCR result:
blank page

Processing page 2...
Page 2 OCR result:
blank page

Processing page 3...
Page 3 OCR result:
# Information


This document, written by ANSSI, the French National Cybersecurity Agency, is titled “Security recommendations for a generative AI system”. It is freely available at cyber.gouv.fr/en.

It is an original creation from ANSSI and it is placed under the “Open Licence v2.0” published by the Etalab mission.

According to the Open Licence v2.0, this document can be freely reused, subject to mentioning its paternity (source and date of last update). Reuse means the right to communicate, distribute, redistribute, publish, transmit, reproduce, copy, adapt, modify, extract, transform and use, including for commercial purposes

The recommendations are provided as is and are related to threats known at the publication time. Considering the information systems diversity, ANSSI cannot guarantee direct application of these recommendations on ta