# MISTRAL OCR PDF to MD Translator

## 來源說明

本 Notebook 修改自 Mistral 官方範例：

- 原始文件說明：[https://docs.mistral.ai/capabilities/document/](https://docs.mistral.ai/capabilities/document/)
- 原始 Colab Notebook：[https://colab.research.google.com/github/mistralai/cookbook/blob/main/mistral/ocr/structured_ocr.ipynb](https://colab.research.google.com/github/mistralai/cookbook/blob/main/mistral/ocr/structured_ocr.ipynb)

> 本 Notebook 為個人學習與實驗用途所做修改，非官方版本。


## PDF Mistral OCR 匯出工具

本 Notebook 可將 PDF 文件自動轉換為 Markdown 格式，流程如下：

1. 使用 **Mistral OCR** 模型辨識 PDF 內文與圖片
2. 將辨識結果組成含圖片的 Markdown 檔，並在每張圖片下方附上該圖片的結構化 OCR 結果（JSON）
3. 使用 **Gemini** 模型將英文內容翻譯為**台灣繁體中文**
4. 匯出 Markdown 檔（原文版 + 翻譯版）與對應圖片

In [None]:
from mistralai import Mistral
from mistralai.models import OCRResponse, ImageURLChunk, DocumentURLChunk
from IPython.display import Markdown, display
from pathlib import Path
import base64, os, json
import os

from dotenv import load_dotenv  # ✅ 要安裝 python-dotenv

# Pull the variables defined in a local .env file into the process environment.
load_dotenv()

# The Mistral client needs an API key; stop early with a clear message when the
# variable is missing or empty.
api_key = os.environ.get("MISTRAL_API_KEY")
if api_key is None or api_key == "":
    raise ValueError("❌ 找不到 MISTRAL_API_KEY，請檢查 .env 是否正確設置。")

client = Mistral(api_key=api_key)

# Scratch image folder; the export step further down writes to a folder named
# after the selected PDF instead.
os.makedirs("images", exist_ok=True)


In [None]:
# Directory scanned for PDFs; point this at another folder if needed.
pdf_dir = Path(".")
pdf_files = sorted(pdf_dir.glob("*.pdf"))

if not pdf_files:
    raise FileNotFoundError("❌ 沒有找到任何 PDF 檔案，請確認放在正確資料夾中。")

# Show a 1-based menu of the candidate files.
print("📚 可用的 PDF 檔案：")
for i, f in enumerate(pdf_files):
    print(f"  [{i+1}] {f.name}")

choice = input("👉 請輸入要處理的檔案編號： ").strip()

# Validate the selection explicitly. Without the range check, "0" or a negative
# number would be accepted by Python's negative indexing and silently select a
# file counted from the end of the list.
try:
    selected_number = int(choice)
except ValueError:
    selected_number = 0
if not 1 <= selected_number <= len(pdf_files):
    raise ValueError(
        f"❌ 無效的檔案編號：{choice!r}（請輸入 1 到 {len(pdf_files)} 之間的數字）"
    )

pdf_file = pdf_files[selected_number - 1]
filename_stem = pdf_file.stem

print(f"✅ 已選擇：{pdf_file}")

# Upload the PDF to Mistral so the OCR endpoint can read it.
uploaded_file = client.files.upload(
    file={
        "file_name": pdf_file.stem,
        "content": pdf_file.read_bytes(),
    },
    purpose="ocr"
)

# The OCR call takes a URL, so request a signed URL for the uploaded file.
signed_url = client.files.get_signed_url(file_id=uploaded_file.id, expiry=1)

# Run OCR on the PDF; include_image_base64=True asks for the extracted images
# to be returned inline so later cells can save them.
pdf_response = client.ocr.process(
    document=DocumentURLChunk(document_url=signed_url.url),
    model="mistral-ocr-latest",
    include_image_base64=True
)

# Convert the response to plain JSON-compatible data.
response_dict = json.loads(pdf_response.model_dump_json())

# Sanity check: print only the first 1000 characters of the pretty-printed JSON.
print(json.dumps(response_dict, indent=4)[0:1000])

In [None]:
from mistralai.models import OCRResponse
from IPython.display import Markdown, display

def replace_images_in_markdown(markdown_str: str, images_dict: dict) -> str:
    """
    Inline image data into a markdown string.

    Every ``![<id>](<id>)`` placeholder whose id is a key of ``images_dict``
    keeps its alt text and gets the mapped value as its new link target.

    Args:
        markdown_str: Markdown text containing image placeholders
        images_dict: Dictionary mapping image IDs to base64 strings

    Returns:
        The markdown text with all matching placeholders rewritten
    """
    rendered = markdown_str
    for image_id in images_dict:
        placeholder = f"![{image_id}]({image_id})"
        inlined = f"![{image_id}]({images_dict[image_id]})"
        rendered = rendered.replace(placeholder, inlined)
    return rendered

def get_combined_markdown(ocr_response: OCRResponse) -> str:
    """
    Build one markdown document from all pages of an OCR response.

    Args:
        ocr_response: Response from OCR processing containing text and images

    Returns:
        The per-page markdown, with image placeholders replaced by the
        page's base64 image data, joined by blank lines
    """
    def render_page(page) -> str:
        # Map each image id on this page to its base64 payload, then inline it.
        images_by_id = {img.id: img.image_base64 for img in page.images}
        return replace_images_in_markdown(page.markdown, images_by_id)

    return "\n\n".join(render_page(page) for page in ocr_response.pages)

# Render the whole OCR result inline in the notebook: the per-page markdown is
# combined and its image placeholders are replaced by the base64 image data.
display(Markdown(get_combined_markdown(pdf_response)))

In [None]:
from pydantic import BaseModel
from mistralai.models import TextChunk
import time

# Schema handed to the chat `parse` call as `response_format` for the per-image
# structured OCR step. Of the parsed fields, this notebook only reads
# `ocr_contents`.
# NOTE: documented with `#` comments rather than a class docstring on purpose;
# pydantic copies a model's docstring into its generated JSON schema, which
# would change what is sent as the response format.
class StructuredOCR(BaseModel):
    file_name: str  # a string name for the image; not read back in this notebook
    topics: list[str]  # list of topic strings; not read back in this notebook
    languages: str  # a single string despite the plural name; not read back here
    ocr_contents: dict  # the structured OCR payload; serialized with json.dumps by the caller

def retry_with_backoff(func, retries=5, base_delay=1.5):
    """
    Call ``func`` and retry with exponential backoff on rate-limit errors.

    An exception counts as a rate-limit error when its string form contains
    "429". Any other exception propagates immediately and unchanged.

    Args:
        func: Zero-argument callable to invoke
        retries: Maximum number of attempts
        base_delay: Seconds to wait after the first failed attempt; the wait
            doubles after each further failed attempt

    Returns:
        Whatever ``func`` returns on its first successful call

    Raises:
        RuntimeError: If every attempt hit the rate limit. The last rate-limit
            exception is attached as ``__cause__``.
    """
    last_error = None
    for attempt in range(retries):
        try:
            return func()
        except Exception as e:
            if "429" not in str(e):
                # Not a rate-limit problem: re-raise with the original traceback.
                raise
            last_error = e
            if attempt == retries - 1:
                # No attempt follows, so waiting (and announcing a retry)
                # would only delay the failure.
                break
            wait_time = base_delay * (2 ** attempt)
            print(f"⚠️ API rate limit hit. Retrying in {wait_time:.1f}s...")
            time.sleep(wait_time)
    # Chain the last rate-limit error so the underlying cause stays visible.
    raise RuntimeError("❌ Failed after multiple retries.") from last_error

# Structured OCR text per extracted image, keyed by (page index, image id).
image_ocr_results = {}

for page_idx, page in enumerate(pdf_response.pages):
    for i, img in enumerate(page.images):
        base64_data_url = img.image_base64
        file_name = f"page_{page_idx+1}_img_{i+1}.png"

        def run_ocr_and_parse():
            """OCR the current image, then return its structured contents as pretty JSON text."""
            # Step 1: plain OCR of the image on its own.
            ocr_pass = client.ocr.process(
                document=ImageURLChunk(image_url=base64_data_url),
                model="mistral-ocr-latest"
            )
            image_ocr_markdown = ocr_pass.pages[0].markdown

            # Step 2: ask the chat model to structure that OCR markdown, giving
            # it both the image and the OCR text.
            image_chunk = ImageURLChunk(image_url=base64_data_url)
            prompt_chunk = TextChunk(text=(
                f"This is the image's OCR in markdown:\n{image_ocr_markdown}\n. "
                "Convert this into a structured JSON response with the OCR contents in a sensible dictionary."
            ))
            user_message = {
                "role": "user",
                "content": [image_chunk, prompt_chunk]
            }
            parsed_reply = client.chat.parse(
                model="pixtral-12b-latest",
                messages=[user_message],
                response_format=StructuredOCR,
                temperature=0
            )

            contents = parsed_reply.choices[0].message.parsed.ocr_contents
            return json.dumps(contents, indent=2, ensure_ascii=False)

        # A failure on one image is reported and skipped so the remaining
        # images are still processed.
        try:
            result = retry_with_backoff(run_ocr_and_parse, retries=4)
            image_ocr_results[(page_idx, img.id)] = result
        except Exception as e:
            print(f"❌ Failed at page {page_idx+1}, image {i+1}: {e}")


In [None]:
def insert_ocr_below_images(markdown_str, ocr_img_map, page_idx):
    """
    Append each image's OCR result directly below its markdown placeholder.

    Args:
        markdown_str: Markdown of one page, containing ``![id](id)`` placeholders
        ocr_img_map: Mapping of page index -> {image id: OCR text}
        page_idx: Index of the page that ``markdown_str`` belongs to

    Returns:
        The markdown with a quoted caption and a fenced ``json`` block added
        after every placeholder that has an OCR result for this page; pages
        without an entry in ``ocr_img_map`` come back unchanged
    """
    page_results = ocr_img_map.get(page_idx, {})
    annotated = markdown_str
    for img_id in page_results:
        image_tag = f"![{img_id}]({img_id})"
        ocr_block = f"\n\n> 📄 Image OCR Result：\n\n```json\n{page_results[img_id]}\n```"
        annotated = annotated.replace(image_tag, image_tag + ocr_block)
    return annotated

# Regroup the flat {(page index, image id): OCR text} results into
# {page index: {image id: OCR text}}, the shape insert_ocr_below_images expects.
ocr_by_page = {}
for (page_idx, img_id), ocr_text in image_ocr_results.items():
    if page_idx not in ocr_by_page:
        ocr_by_page[page_idx] = {}
    ocr_by_page[page_idx][img_id] = ocr_text


In [None]:
def save_images_and_replace_links(markdown_str, images_dict, page_idx, image_folder="images"):
    """
    Write a page's images to disk and point the markdown placeholders at them.

    Each image is decoded from base64 (anything up to and including the last
    comma, e.g. a ``data:...;base64,`` prefix, is dropped) and written to
    ``<image_folder>/page_<page>_img_<n>.png``, both numbers 1-based.

    Args:
        markdown_str: Markdown of one page, containing ``![id](id)`` placeholders
        images_dict: Mapping of image id -> base64 string for this page
        page_idx: Zero-based page index, used in the file names
        image_folder: Folder to write into; created if it does not exist

    Returns:
        The markdown with every saved image's placeholder rewritten to link to
        the file on disk. Images without base64 data are skipped and their
        placeholders are left untouched.
    """
    os.makedirs(image_folder, exist_ok=True)

    for i, (img_id, base64_str) in enumerate(images_dict.items()):
        if not base64_str:
            # Nothing to decode for this image; skip it rather than crash.
            continue

        # NOTE(review): the file is always named ".png" whatever the encoded
        # format actually is — confirm this is acceptable for downstream tools.
        img_path = f"{image_folder}/page_{page_idx+1}_img_{i+1}.png"
        with open(img_path, "wb") as f:
            f.write(base64.b64decode(base64_str.split(",")[-1]))

        # A markdown inline link destination may not contain raw spaces, and
        # the folder name often comes from a PDF file name that has them.
        # Percent-encode spaces in the link only; the path on disk is unchanged.
        link_target = img_path.replace(" ", "%20")
        markdown_str = markdown_str.replace(
            f"![{img_id}]({img_id})", f"![{img_id}]({link_target})"
        )

    return markdown_str


In [None]:
from google import genai
from google.genai import types


# Load variables from the .env file into the environment.
load_dotenv()

# Read the Gemini API key and fail early when it is missing or empty.
gemini_api_key = os.getenv("GEMINI_API_KEY")
if not gemini_api_key:
    raise ValueError("❌ 未在 .env 找到 GEMINI_API_KEY，請確認已正確設置。")

# Keep the Gemini client under its own name. Binding it to `client` would
# overwrite the Mistral client that the OCR cells use, so re-running any of
# those cells afterwards would call the wrong SDK.
gemini_client = genai.Client(api_key=gemini_api_key)

# System prompt for the translation model (runtime text, sent verbatim).
SYSTEM_INSTRUCTION = """
你是一位專業的技術文件翻譯者。請將我提供的英文 Markdown 內容翻譯成**台灣繁體中文**。

**核心要求：**
1.  **翻譯所有英文文字：** 你的主要工作是翻譯內容中的英文敘述性文字（段落、列表、表格等）。
2.  **保持結構與程式碼不變：**
    * **不要**更改任何 Markdown 標記（如 `#`, `*`, `-`, `[]()`, `![]()`, ``` ```, ` `` `, `---`）。
    * **不要**翻譯或修改程式碼區塊 (``` ... ```) 和行內程式碼 (`code`) 裡的任何內容。
    * 若有 JSON，**不要**更改鍵（key），僅翻譯字串值（value）。
3.  **處理專有名詞：** 對於普遍接受的英文技術術語、縮寫或專有名詞（例如 API, SDK, CPU, Google, Python 等），傾向於**保留英文原文**。但請確保翻譯了其他所有非術語的常規英文文字。
4.  **直接輸出結果：** 請直接回傳翻譯後的完整 Markdown 文件，不要添加任何額外說明。
"""


def translate_markdown_pages(pages):
    """
    Translate markdown pages one by one with Gemini.

    Each page is sent as the request contents together with
    SYSTEM_INSTRUCTION. Translation is best-effort: when a page fails for any
    reason, the failure is printed and the untranslated page is kept in its
    place, so the result always lines up with the input.

    Args:
        pages: Iterable of markdown strings, one per page

    Returns:
        List of translated (or, on failure, original) markdown strings, in
        input order
    """
    translated_pages = []

    for idx, page in enumerate(pages):
        try:
            print(f"🔁 正在翻譯第 {idx+1} 頁...")

            response = gemini_client.models.generate_content(
                model="gemini-2.0-flash",
                config=types.GenerateContentConfig(
                    system_instruction=SYSTEM_INSTRUCTION
                ),
                contents=page
            )

            translated_md = response.text.strip()
            translated_pages.append(translated_md)

        except Exception as e:
            # Deliberate fallback: keep the original page rather than abort
            # the whole document over one failed request.
            print(f"⚠️ 翻譯第 {idx+1} 頁失敗：{e}")
            translated_pages.append(page)

    return translated_pages


In [None]:
from pathlib import Path

filename_stem = pdf_file.stem

# Final markdown for each page, in page order (original language).
markdown_pages = []

for page_idx, page in enumerate(pdf_response.pages):
    images_dict = {img.id: img.image_base64 for img in page.images}
    image_folder_name = f"images_{filename_stem}"

    # Add the per-image OCR blocks first, while the `![id](id)` placeholders
    # are still intact, then save the images and rewrite the placeholders to
    # point at the saved files.
    md = insert_ocr_below_images(page.markdown, ocr_by_page, page_idx)
    md = save_images_and_replace_links(
        md,
        images_dict,
        page_idx,
        image_folder=image_folder_name,
    )
    markdown_pages.append(md)

# Run the translation over all pages.
translated_markdown_pages = translate_markdown_pages(markdown_pages)

# Join the pages into complete documents, with a horizontal rule as the page
# separator.
final_markdown_original = "\n\n---\n\n".join(markdown_pages)
final_markdown_translated = "\n\n---\n\n".join(translated_markdown_pages)

# Preview the translated version inline.
display(Markdown(final_markdown_translated))

In [None]:
# Output names, all derived from the selected PDF's stem.
translated_md_name = f"{filename_stem}_translated.md"
original_md_name = f"{filename_stem}_original.md"
image_folder_name = f"images_{filename_stem}"

# Write the translated document.
Path(translated_md_name).write_text(final_markdown_translated, encoding="utf-8")

# Write the untranslated OCR document.
Path(original_md_name).write_text(final_markdown_original, encoding="utf-8")

# Report where everything was written (nothing is downloaded here).
print(f"✅ 已儲存翻譯版：{translated_md_name}")
print(f"✅ 已儲存原始英文版：{original_md_name}")
print(f"✅ 圖片資料夾：{image_folder_name}")
