# [**Mistral OCR**](https://mistral.ai/news/mistral-ocr)

Check API usage [here](https://console.mistral.ai/usage).

---

In [1]:
!pip install mistralai lakefs docx2pdf slack-sdk -q


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
from mistralai import Mistral
from lakefs.client import Client
from lakefs import Repository
from docx2pdf import convert
import os, io, json, base64, requests
from slack_sdk import WebClient
from slack_sdk.errors import SlackApiError

In [4]:
# ── Configuration ─────────────────────────────────────────────────────────────
# All settings come from environment variables so that no secret is stored in
# the notebook itself.

# Mistral: API key and the HTTP endpoint that run_mistral_ocr() posts to.
API_KEY = os.getenv("MISTRAL_AI_API_KEY")
URL = os.getenv("MISTRAL_AI_URL")

# lakeFS: connection credentials plus the repository / commit to read from.
# Note the repository and commit variables are looked up under lowercase names
# and fall back to the defaults below when unset.
LAKEFS_HOST = os.getenv("LAKEFS_HOST")
LAKEFS_USERNAME = os.getenv("LAKEFS_USERNAME")
LAKEFS_PASSWORD = os.getenv("LAKEFS_PASSWORD")
LAKEFS_REPOSITORY = os.getenv("lakefs_repository", "vital-oceans")
LAKEFS_COMMIT = os.getenv(
    "lakefs_commit",
    "92c647c3f1ac9bb79fdaae5860da56ad93b5eb4c522d5a328c30f7f7c700f7c3",
)

# Slack: the bot token is mandatory (a missing variable raises KeyError here);
# the channel is optional.
SLACK_TOKEN = os.environ["SLACK_BOT_TOKEN"]
SLACK_CHANNEL = os.getenv("slack_channel", "#ocr-reports")

# Shared API clients used by the helper functions defined below.
ocr_client = Mistral(api_key=API_KEY)
slack = WebClient(token=SLACK_TOKEN)

In [5]:
# Connect to lakeFS with the credentials from the configuration cell, then pin
# the reference all reads go through to the configured commit.
lakefs_credentials = {
    "host": LAKEFS_HOST,
    "username": LAKEFS_USERNAME,
    "password": LAKEFS_PASSWORD,
}
client = Client(**lakefs_credentials)
repo = Repository(repository_id=LAKEFS_REPOSITORY, client=client)
ref = repo.ref(LAKEFS_COMMIT)

In [6]:
def get_signed_url_from_bytes(file_bytes: bytes, file_name: str) -> str:
    """Upload a document to Mistral and return a signed URL for it.

    Args:
        file_bytes: Raw content of the document.
        file_name: Name the upload is registered under.

    Returns:
        The ``url`` attribute of the signed-URL response for the uploaded file.
    """
    upload_spec = {"file_name": file_name, "content": file_bytes}
    uploaded_file = ocr_client.files.upload(file=upload_spec, purpose="ocr")
    return ocr_client.files.get_signed_url(file_id=uploaded_file.id).url

def run_mistral_ocr(file_name: str, signed_url: str, timeout=(10, 300)) -> dict:
    """POST a document to the Mistral OCR endpoint and return the decoded JSON.

    Args:
        file_name: Sent as ``document_name`` in the request body.
        signed_url: Sent as ``document_url`` in the request body.
        timeout: Passed straight to ``requests.post``; either a single number
            of seconds or a ``(connect, read)`` tuple. Without a timeout a
            stalled connection would block the whole batch indefinitely.

    Returns:
        The JSON body of the response, as a dict.

    Raises:
        RuntimeError: On HTTP 429 (rate limit) or any other non-200 status.
        requests.RequestException: On connection failures or when ``timeout``
            is exceeded.
    """
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": "mistral-ocr-2503",
        "id": "mistral-ocr-latest",
        "document": {
            "type": "document_url",
            "document_url": signed_url,
            "document_name": file_name,
        },
        # Ask for the extracted images inline so they can be written to disk.
        "include_image_base64": True
    }
    resp = requests.post(URL, json=payload, headers=headers, timeout=timeout)
    # Rate limiting gets its own message so it is easy to spot in the reports.
    if resp.status_code == 429:
        raise RuntimeError(
            "Mistral AI API key rate limit exceeded (HTTP 429)."
        )
    if resp.status_code != 200:
        raise RuntimeError(f"OCR falló (HTTP {resp.status_code}): {resp.text}")
    return resp.json()

def process_ocr_response(file_name: str, ocr_response: dict):
    """Write the OCR result of one document under ``ocr-output/<stem>/``.

    The markdown of all pages is written to ``<stem>.md`` (pages separated by a
    blank line so the end of one page does not run into the start of the next),
    and every image that carries base64 data is decoded into the same folder.

    Args:
        file_name: Original document name; its extension-less stem names both
            the output folder and the markdown file.
        ocr_response: Decoded OCR response. Expected to hold a ``pages`` list
            whose items may have ``markdown`` and ``images`` entries; missing
            or null entries are treated as empty.

    Raises:
        binascii.Error: If an image's base64 payload cannot be decoded.
        OSError: If the output folder or files cannot be written.
    """
    stem = os.path.splitext(file_name)[0]
    output_dir = os.path.join("ocr-output", stem)
    os.makedirs(output_dir, exist_ok=True)

    # `or []` also covers a key that is present but null.
    pages = ocr_response.get("pages") or []

    md_path = os.path.join(output_dir, f"{stem}.md")
    with open(md_path, "w", encoding="utf-8") as md:
        md.write("\n\n".join(page.get("markdown") or "" for page in pages))

    img_idx = 0
    for page in pages:
        for image in page.get("images") or []:
            # The payload may be a data URI ("data:...;base64,<data>"); keep only
            # the part after the last comma. A null payload is skipped instead of
            # raising AttributeError on None.
            b64 = (image.get("image_base64") or "").split(",")[-1]
            if not b64:
                continue
            # Prefer the image's own id as file name so it matches the reference
            # in the markdown (assumes the markdown links images by id — TODO
            # confirm against the API docs). basename() keeps the write inside
            # output_dir; fall back to a running counter when no usable id exists.
            img_file = os.path.basename(str(image.get("id") or ""))
            if img_file in ("", ".", ".."):
                img_file = f"img-{img_idx}.jpeg"
            with open(os.path.join(output_dir, img_file), "wb") as f:
                f.write(base64.b64decode(b64))
            img_idx += 1

def send_error_report(file_name: str, error_msg: str):
    """Post an OCR failure notice for one file to the configured Slack channel.

    Args:
        file_name: Name of the document that failed.
        error_msg: Error text, sent inside a code block.

    A ``SlackApiError`` is caught and printed instead of being raised; any
    other exception propagates to the caller.
    """
    headline = f":warning: *Error OCR* procesando `{file_name}`:\n"
    details = f"```{error_msg}```"
    try:
        slack.chat_postMessage(channel=SLACK_CHANNEL, text=headline + details)
    except SlackApiError as exc:
        print(f"Error enviando Slack: {exc.response['error']}")

In [7]:
# Run summary; it is filled in by the loop below and serialized to status.json
# at the end of this cell.
status = {
    "repository": LAKEFS_REPOSITORY,
    "commit":     LAKEFS_COMMIT,
    "processed_files": [],
    "errors":          []
}

# Iterate over the objects of the pinned ref (empty prefix; presumably this
# lists the whole tree — confirm against the lakeFS SDK).
for obj_info in ref.objects(prefix=""):
    key = obj_info.path
    # Only .pdf and .docx objects are processed; the suffix check ignores case.
    if not key.lower().endswith((".pdf", ".docx")):
        continue

    # Only the base name is kept, so objects with the same name in different
    # directories share one output folder and one entry name in the status.
    file_name = os.path.basename(key)
    try:
        # 1) Read the object's bytes from lakeFS
        with ref.object(path=key).reader(mode="rb") as r:
            file_bytes = r.read()

        # 2) OCR: upload the bytes, run OCR on the signed URL, write the output
        signed    = get_signed_url_from_bytes(file_bytes, file_name)
        ocr_resp  = run_mistral_ocr(file_name, signed)
        process_ocr_response(file_name, ocr_resp)

        # 3) Mark as processed
        status["processed_files"].append(file_name)

    except Exception as e:
        # A failure on one file is recorded and reported, then the loop moves
        # on to the next object instead of aborting the batch.
        err = str(e)
        status["errors"].append({"file": file_name, "error": err})
        send_error_report(file_name, err)

# 4) Write status.json
with open("status.json", "w", encoding="utf-8") as f:
    json.dump(status, f, indent=2)

# Final one-line summary of the run.
print(
    f"✅ status.json escrito: "
    f"{len(status['processed_files'])} procesados, "
    f"{len(status['errors'])} errores."
)

✅ status.json escrito: 40 procesados, 0 errores.


---