In [1]:
import base64
import json
import os
# import re
from pathlib import Path

import markdown
from IPython.display import Markdown, display
from mistralai import Mistral
from mistralai.models import OCRResponse

# Read the Mistral API key from the environment (raises KeyError when
# MISTRAL_API_KEY is not set) and build the client used by the cells below.
api_key = os.environ["MISTRAL_API_KEY"]
client = Mistral(api_key=api_key)

In [None]:
# One-time upload step, kept commented out for reference.
# # Define the file path
# file_path = r"/mnt/mydisk/Projects/plu/data/external/Grenoble/Dispositions_Generales.pdf"

# # Upload the file
# uploaded_pdf = client.files.upload(
#     file={
#         "file_name": "Dispositions_Generales.pdf",
#         "content": open(file_path, "rb"),
#     },
#     purpose="ocr",
# )

# NOTE(review): the file id below is presumably the one returned by the upload
# step above -- confirm before re-running against another account.
signed_url = client.files.get_signed_url(file_id="e086b2b4-82e4-45fc-be98-168361e2106e")

# Run OCR on the uploaded document; image payloads are requested so that the
# later cells can embed or save them.
ocr_response = client.ocr.process(
    model="mistral-ocr-latest",
    document={
        "type": "document_url",
        "document_url": signed_url.url,
    },
    include_image_base64=True,
)

# Keep a plain-dict copy of the response next to its pretty-printed JSON.
# The intermediate dump needs no indentation because it is parsed right away.
response_dict = json.loads(ocr_response.model_dump_json())
# ensure_ascii=False keeps accented characters readable instead of emitting
# \uXXXX escapes, matching the UTF-8 save step below.
json_string = json.dumps(response_dict, indent=4, ensure_ascii=False)
# Caution: the dump includes every image's base64 payload, so it can be large.
print(json_string)

# # Save to a text file
# with open(
#     "/mnt/mydisk/Projects/plu/data/raw/Dispositions_Generales.json", "w", encoding="utf-8"
# ) as file:
#     file.write(json_string)

In [None]:
def replace_images_in_markdown(markdown_str: str, images_dict: dict) -> str:
    """Swap image placeholders in markdown for their payloads.

    For each ``name -> payload`` entry of ``images_dict``, every link written
    as ``![name](name)`` in ``markdown_str`` becomes ``![name](payload)``.

    Args:
        markdown_str: Markdown text containing ``![name](name)`` links.
        images_dict: Mapping from image name to the string to use as link target.

    Returns:
        The markdown text with all matching links rewritten.
    """
    rendered = markdown_str
    for image_id in images_dict:
        placeholder = f"![{image_id}]({image_id})"
        embedded = f"![{image_id}]({images_dict[image_id]})"
        rendered = rendered.replace(placeholder, embedded)
    return rendered


def get_combined_markdown(ocr_response: OCRResponse) -> str:
    """Build one markdown string from every page of an OCR response.

    Each page's markdown has its image links rewritten through
    ``replace_images_in_markdown`` using that page's ``id -> image_base64``
    mapping; the pages are then joined with a blank line between them.

    Args:
        ocr_response: OCR result whose ``pages`` expose ``markdown`` and ``images``.

    Returns:
        The concatenated markdown of all pages.
    """
    rendered_pages: list[str] = [
        replace_images_in_markdown(
            page.markdown,
            {image.id: image.image_base64 for image in page.images},
        )
        for page in ocr_response.pages
    ]
    return "\n\n".join(rendered_pages)


# Render the full OCR result inline in the notebook, with each image link
# pointing at the image's base64 payload instead of its id.
display(Markdown(get_combined_markdown(ocr_response)))

In [None]:
def save_as_html(ocr_response, output_path="ocr_result.html"):
    """Write the OCR result to a single HTML file.

    The combined markdown of all pages (images inlined by
    ``get_combined_markdown``) is converted to HTML with the ``markdown``
    library, wrapped in a minimal HTML document and written as UTF-8.

    Args:
        ocr_response: OCR result passed to ``get_combined_markdown``.
        output_path: Path of the HTML file to write.
    """
    # Markdown -> HTML fragment for the document body.
    body_html = markdown.markdown(get_combined_markdown(ocr_response))

    # Minimal page skeleton around the converted fragment.
    page = f"""<!DOCTYPE html>
    <html>
    <head>
        <title>OCR Results</title>
        <meta charset="utf-8">
    </head>
    <body>
        {body_html}
    </body>
    </html>
    """

    with open(output_path, "w", encoding="utf-8") as out:
        out.write(page)

    print(f"HTML saved to {output_path}")


def save_as_markdown_with_images(ocr_response, output_dir="ocr_results"):
    """Save OCR results as one markdown file plus separate image files.

    Writes ``<output_dir>/ocr_results.md`` holding every page's markdown joined
    by blank lines, and decodes each page image into
    ``<output_dir>/images/<image id>.<ext>``. Image links in the markdown are
    rewritten to point at the saved files.

    Args:
        ocr_response: Object exposing ``pages``; each page exposes ``markdown``
            (str) and ``images``; each image exposes ``id`` and
            ``image_base64`` (raw base64 or a ``data:...;base64,`` URI).
        output_dir: Destination directory; created with its parents if missing.
    """
    # Creating the images subdirectory with parents=True also creates output_dir.
    images_dir = os.path.join(output_dir, "images")
    Path(images_dir).mkdir(parents=True, exist_ok=True)

    all_markdown = []

    for page in ocr_response.pages:
        page_markdown = page.markdown

        for img in page.images:
            encoded = img.image_base64
            if not encoded:
                # No payload to decode: keep the original link rather than
                # crashing or writing an empty file.
                continue

            # Split an optional "data:<mime>;base64," header from the payload.
            header, sep, payload = encoded.partition(",")
            if not sep:
                header, payload = "", encoded
            img_data = base64.b64decode(payload)

            # Only the header is inspected for the MIME type, so payload
            # characters can never be mistaken for it. Without a header, fall
            # back to the JPEG magic bytes; everything else defaults to png.
            if "image/jpeg" in header or (
                not header and img_data.startswith(b"\xff\xd8\xff")
            ):
                img_format = "jpg"
            else:
                img_format = "png"

            # Save image to file
            img_filename = f"{img.id}.{img_format}"
            img_path = os.path.join(images_dir, img_filename)
            with open(img_path, "wb") as img_file:
                img_file.write(img_data)

            # Markdown links always use forward slashes, whatever the OS.
            local_link = f"![{img.id}](images/{img_filename})"
            # page.markdown references images as ![id](id) (the form
            # replace_images_in_markdown matches); the inlined-base64 form is
            # rewritten too in case the markdown was already expanded.
            page_markdown = page_markdown.replace(f"![{img.id}]({img.id})", local_link)
            page_markdown = page_markdown.replace(f"![{img.id}]({encoded})", local_link)

        all_markdown.append(page_markdown)

    # Write combined markdown to file
    markdown_path = os.path.join(output_dir, "ocr_results.md")
    with open(markdown_path, "w", encoding="utf-8") as md_file:
        md_file.write("\n\n".join(all_markdown))

    print(f"Markdown saved to {markdown_path}")
    print(f"Images saved to {images_dir}")

# Option 1: one HTML file built from the combined markdown (image payloads
# inlined as link targets), written to my_ocr_results.html.
save_as_html(ocr_response, "my_ocr_results.html")

# Option 2: a my_ocr_results/ directory holding ocr_results.md plus an
# images/ subdirectory with one decoded file per image.
save_as_markdown_with_images(ocr_response, "my_ocr_results")