# [**Mistral OCR**](https://mistral.ai/news/mistral-ocr)

Check your Mistral API usage in the [Mistral console](https://console.mistral.ai/usage).

---

In [None]:
# Install the notebook's third-party dependencies quietly (-q):
#   mistralai - Mistral SDK, used for file upload and signed URLs
#   lakefs    - lakeFS SDK, used to read the source documents
#   docx2pdf  - imported below, but `convert` is not called in the visible cells
!pip install mistralai lakefs docx2pdf -q

In [None]:
from mistralai import Mistral
import requests
import base64
import os
from lakefs.client import Client
from lakefs import Repository
from docx2pdf import convert
import io

In [None]:
# ── Configuration ─────────────────────────────────────────────────────────────
# Every setting is read from the process environment. A variable that is not
# set yields None unless a fallback is given.

# Mistral: API key and the HTTP endpoint the OCR request is POSTed to.
API_KEY = os.getenv("MISTRAL_AI_API_KEY")
URL = os.getenv("MISTRAL_AI_URL")

# lakeFS: connection settings.
LAKEFS_HOST = os.getenv("LAKEFS_HOST")
LAKEFS_USERNAME = os.getenv("LAKEFS_USERNAME")
LAKEFS_PASSWORD = os.getenv("LAKEFS_PASSWORD")

# lakeFS: which repository and commit to read documents from.
# NOTE(review): these two are looked up under *lowercase* environment names,
# unlike every other setting above - confirm this is intentional.
LAKEFS_REPOSITORY = os.getenv("lakefs_repository", "vital-oceans")
LAKEFS_COMMIT = os.getenv(
    "lakefs_commit",
    "92c647c3f1ac9bb79fdaae5860da56ad93b5eb4c522d5a328c30f7f7c700f7c3",
)

# SDK client used for file uploads and signed-URL generation.
ocr_client = Mistral(API_KEY)

In [None]:
# Build the lakeFS client from the connection settings in the configuration
# cell.
# NOTE(review): username/password are presumably the lakeFS access key id and
# secret access key - confirm against the deployment.
client = Client(
    host=LAKEFS_HOST,
    username=LAKEFS_USERNAME,
    password=LAKEFS_PASSWORD,
)
# Handle on the configured repository, bound to the client above.
repo = Repository(repository_id=LAKEFS_REPOSITORY, client=client)
# Reference built from LAKEFS_COMMIT; all object listing and reading below
# goes through this reference.
ref  = repo.ref(LAKEFS_COMMIT)

In [None]:
# ── Helpers ───────────────────────────────────────────────────────────────────
def get_signed_url_from_bytes(file_bytes: bytes, file_name: str) -> str:
    """Upload a document's raw bytes to Mistral and return a signed URL for it.

    The bytes are sent through ``ocr_client.files.upload`` with purpose
    ``"ocr"``; the id of the resulting upload is then exchanged for a signed
    URL, which is what the OCR request consumes.

    Args:
        file_bytes: Raw content of the document.
        file_name: Name to register the upload under.

    Returns:
        The ``url`` attribute of the signed-URL response.
    """
    upload_payload = {
        "file_name": file_name,
        "content": file_bytes,
    }
    uploaded_file = ocr_client.files.upload(file=upload_payload, purpose="ocr")

    # The file id is passed by keyword (file_id=...), not positionally.
    return ocr_client.files.get_signed_url(file_id=uploaded_file.id).url


def run_mistral_ocr(file_name: str, signed_url: str, timeout=(10, 300)) -> dict:
    """Send the document URL to Mistral OCR and return the parsed response.

    POSTs a JSON request to the module-level ``URL``, authenticated with the
    module-level ``API_KEY``, and decodes the JSON reply. HTTP 429 (rate
    limit) gets its own error message; any other non-200 status is reported
    with the status code and response body.

    Args:
        file_name: Document name; sent as ``document_name`` and used in the
            progress output.
        signed_url: URL of the uploaded document, sent as ``document_url``.
        timeout: Passed straight to ``requests.post``. Either a single number
            of seconds or a ``(connect, read)`` tuple; ``None`` disables the
            timeout (the previous behaviour). The default allows 10 s to
            connect and 300 s for the server to answer.

    Returns:
        The decoded JSON response body as a dict.

    Raises:
        RuntimeError: On HTTP 429, or on any other non-200 status.
        requests.exceptions.Timeout: If the server does not connect or answer
            within ``timeout``.
    """
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": "mistral-ocr-2503",
        "id": "mistral-ocr-latest",
        "document": {
            "type": "document_url",
            "document_url": signed_url,
            "document_name": file_name,
        },
        # Ask for extracted images inline as base64 so they can be saved.
        "include_image_base64": True
    }

    print(f"\033[1mProcessing {file_name}...\033[0m")
    # Without a timeout, requests waits forever on a stalled connection and
    # the whole notebook run hangs; always bound the call.
    resp = requests.post(URL, json=payload, headers=headers, timeout=timeout)

    if resp.status_code == 429:
        # rate limit hit
        raise RuntimeError(
            "Mistral AI API key rate limit exceeded (HTTP 429). "
            "Please wait and retry after some time."
        )
    if resp.status_code != 200:
        # other errors
        raise RuntimeError(
            f"OCR failed (HTTP {resp.status_code}): {resp.text}"
        )

    info = resp.json()
    # `or {}` also covers a response where "usage_info" is present but null.
    pages = (info.get("usage_info") or {}).get("pages_processed")
    print(f" → processed {pages} pages.")
    return info

def process_ocr_response(file_name: str, ocr_response: dict) -> None:
    """Save an OCR result as one Markdown file plus its extracted images.

    Output goes to ``ocr-output/<stem>/`` relative to the current working
    directory, where ``<stem>`` is ``file_name`` without its extension:

    * ``<stem>.md`` - the Markdown of every page, pages separated by a blank
      line so the end of one page never fuses with the start of the next.
    * ``img-<n>.jpeg`` - each image that carries base64 data, numbered from 0
      in document order across all pages.

    Args:
        file_name: Bare file name of the source document, e.g. ``"mydoc.pdf"``.
        ocr_response: Decoded OCR response. Reads ``pages``, and per page
            ``markdown`` and ``images[*].image_base64``. Missing *or null*
            values are treated as empty.
    """
    stem = os.path.splitext(file_name)[0]  # "mydoc.pdf" => "mydoc"
    output_dir = os.path.join("ocr-output", stem)
    os.makedirs(output_dir, exist_ok=True)

    # `.get(key, default)` only covers a missing key; `or` also covers a key
    # that is present with a null value.
    pages = ocr_response.get("pages") or []

    md_path = os.path.join(output_dir, f"{stem}.md")
    with open(md_path, "w", encoding="utf-8") as md:
        md.write("\n\n".join(page.get("markdown") or "" for page in pages))

    img_idx = 0
    for page in pages:
        for image in page.get("images") or []:
            # Accept both a bare base64 string and a data URI
            # ("data:image/jpeg;base64,<payload>"): keep what follows the
            # last comma.
            b64 = (image.get("image_base64") or "").split(",")[-1]
            if not b64:
                continue  # image entry without inline data: nothing to save
            img_path = os.path.join(output_dir, f"img-{img_idx}.jpeg")
            with open(img_path, "wb") as f:
                f.write(base64.b64decode(b64))
            img_idx += 1

In [None]:
# Driver: OCR every .pdf / .docx object found under the lakeFS reference.
# Objects are listed with an empty prefix, i.e. no path filtering at listing
# time; the extension filter below does the selection.
for obj_info in ref.objects(prefix=""):
    path = obj_info.path
    # Case-insensitive extension filter; everything else is skipped.
    if not path.lower().endswith((".pdf", ".docx")):
        continue

    # Use only the filename (no directories) for all downstream naming.
    # NOTE(review): the output directory is derived from this name without its
    # extension, so objects sharing a basename (different folders, or
    # "x.pdf" and "x.docx") write to the same output directory.
    file_name = os.path.basename(path)

    # Read the whole object from lakeFS into memory (a single read() call).
    with ref.object(path=path).reader(mode="rb") as reader:
        file_bytes = reader.read()

    # Upload bytes, OCR, and write outputs. .docx files are uploaded as-is;
    # no conversion step is applied here.
    signed_url = get_signed_url_from_bytes(file_bytes, file_name)
    ocr_resp   = run_mistral_ocr(file_name, signed_url)
    process_ocr_response(file_name, ocr_resp)

---