In [None]:
import os
# Folder (relative path) that holds the input documents to convert.
SOURCE_DIR = 'data'
# List the folder's entries as a quick look at what is available for parsing.
print(os.listdir(SOURCE_DIR))

In [None]:
import fitz  # pip install PyMuPDF
import pdfplumber  # pip install pdfplumber
import re

# Output folder for the extracted .txt files; it is created if missing and
# left as-is when it already exists (exist_ok=True).
DEST_DIR = 'parsed_txt'
os.makedirs(DEST_DIR, exist_ok=True)

SYMBOL_MAPPING = {
    "": "A. ", "": "B. ", "": "C. ", "": "D. "
}

# Single alternation regex that matches any key of SYMBOL_MAPPING literally
# (every key is regex-escaped before being joined with "|").
# NOTE(review): the keys are presumably special option-marker glyphs taken
# from the PDFs. If any key is an empty string, this regex matches at every
# position and the replacement gets inserted between all characters --
# verify the keys in SYMBOL_MAPPING are intact.
pattern = re.compile("|".join(re.escape(symbol) for symbol in SYMBOL_MAPPING))

def replace_symbol(match):
    """Return the SYMBOL_MAPPING replacement for a regex match.

    Intended as the ``repl`` callback of ``pattern.sub``: the full matched
    text is used as the lookup key.

    Args:
        match: an ``re.Match`` whose whole match is a key of SYMBOL_MAPPING.

    Returns:
        The replacement string stored under that key.

    Raises:
        KeyError: if the matched text is not a key of SYMBOL_MAPPING.
    """
    matched_symbol = match[0]
    return SYMBOL_MAPPING[matched_symbol]

def pdf2txt_fitz(source_file_path):
    """Extract the plain text of a document with PyMuPDF (fitz).

    Every page is read with ``get_text("text")``, run through
    ``pattern.sub(replace_symbol, ...)`` to swap the mapped symbols, and the
    per-page results are joined with a blank line between pages.

    Args:
        source_file_path: path of the file to open with ``fitz.open``.

    Returns:
        str: the processed text of all pages, separated by "\\n\\n".
    """
    with fitz.open(source_file_path) as pdf:
        # get_text("text") yields the page's plain text
        pages = [
            pattern.sub(replace_symbol, page.get_text("text"))
            for page in pdf
        ]
    return "\n\n".join(pages)

def pdf2txt_pdfplumber(source_file_path):
    """Extract the plain text of a PDF with pdfplumber.

    Every page is read with ``extract_text()``, run through
    ``pattern.sub(replace_symbol, ...)`` to swap the mapped symbols, and the
    per-page results are joined with a blank line between pages.

    Args:
        source_file_path: path of the PDF to open with ``pdfplumber.open``.

    Returns:
        str: the processed text of all pages, separated by "\\n\\n". A page
        without extractable text contributes an empty string.
    """
    text = []
    with pdfplumber.open(source_file_path) as doc:
        for page in doc.pages:
            # Depending on the pdfplumber version, extract_text() can return
            # None for a page with no text layer (e.g. a scanned image).
            # Treat that as an empty page instead of letting pattern.sub()
            # raise TypeError on None.
            page_txt = page.extract_text() or ""
            text.append(pattern.sub(replace_symbol, page_txt))
    return "\n\n".join(text)

def txt_dump(file_path, data):
    """Write ``data`` to ``file_path`` as UTF-8 text.

    Prints the destination path first, then creates/overwrites the file.

    Args:
        file_path: path of the text file to write (str).
        data: the text to store.
    """
    print("write result to: " + file_path)
    # Explicit UTF-8: without it open() falls back to the locale encoding
    # (e.g. cp1252 on Windows), which cannot represent the CJK text being
    # extracted and would raise UnicodeEncodeError or write a mis-encoded file.
    with open(file_path, 'w', encoding="utf-8") as f:
        f.write(data)

# Convert every PDF in SOURCE_DIR with both extractors and store the results
# side by side in DEST_DIR as "<name>_fitz.txt" and "<name>_pdfplumber.txt".
# Sorted so the processing order (and the final value of `file_path`) is
# deterministic; os.listdir() returns entries in arbitrary order.
file_names = sorted(os.listdir(SOURCE_DIR))

for file_name in file_names:
    base_name, ext = os.path.splitext(file_name)
    candidate_path = os.path.join(SOURCE_DIR, file_name)
    # Skip anything that is not a regular .pdf file (sub-folders, hidden
    # files such as .DS_Store, other formats); the PDF parsers would fail
    # on them and abort the whole run.
    if ext.lower() != ".pdf" or not os.path.isfile(candidate_path):
        continue
    file_path = candidate_path
    txt_fitz = pdf2txt_fitz(file_path)
    txt_pdfplumber = pdf2txt_pdfplumber(file_path)
    txt_dump(os.path.join(DEST_DIR, f"{base_name}_fitz.txt"), txt_fitz)
    txt_dump(os.path.join(DEST_DIR, f"{base_name}_pdfplumber.txt"), txt_pdfplumber)

In [None]:
# Display the value left in `file_path` by the conversion loop above,
# i.e. the path of the last file it handled.
file_path

# llama-parse

In [None]:
from dotenv import find_dotenv, load_dotenv
# Load environment variables from the .env file located by find_dotenv().
# The return value is bound to `_` so it is not echoed as the cell's output.
_ = load_dotenv(find_dotenv())

In [None]:
from llama_parse import LlamaParse

# Parse one PDF with LlamaParse and save the extracted text under DEST_DIR.
parser = LlamaParse(
    # api_key="llx-...",  # if you did not create an environmental variable you can set the API key here
    result_type="text",  # "markdown" and "text" are available
    language='ch_tra',
)

file_name = 'data/104-2_內外科護理學.pdf'
extra_info = {"file_name": file_name}

with open(file_name, "rb") as f:
    # must provide extra_info with file_name key with passing file object
    documents = parser.load_data(f, extra_info=extra_info)

# Derive the output name from the input file instead of repeating it by hand,
# so changing `file_name` cannot leave a stale, mismatched output path.
llama_base_name = os.path.splitext(os.path.basename(file_name))[0]
llama_out_path = os.path.join(DEST_DIR, f"{llama_base_name}_llama-parse.txt")

with open(llama_out_path, "w", encoding="utf-8") as f:
    # Separate documents with a blank line (same convention as the fitz and
    # pdfplumber extractors) so the end of one document is not fused with
    # the start of the next.
    f.write("\n\n".join(doc.text for doc in documents))

In [None]:
from llama_parse import LlamaParse

# Parse one PDF with LlamaParse and save the extracted text under DEST_DIR.
# NOTE(review): this cell repeats the preceding llama-parse cell -- same input
# file, same output file -- so running both performs the remote parse twice.
# Confirm whether a different input was intended or the cell can be removed.
parser = LlamaParse(
    # api_key="llx-...",  # if you did not create an environmental variable you can set the API key here
    result_type="text",  # "markdown" and "text" are available
    language='ch_tra',
)

file_name = 'data/104-2_內外科護理學.pdf'
extra_info = {"file_name": file_name}

with open(file_name, "rb") as f:
    # must provide extra_info with file_name key with passing file object
    documents = parser.load_data(f, extra_info=extra_info)

# Derive the output name from the input file instead of repeating it by hand,
# so changing `file_name` cannot leave a stale, mismatched output path.
llama_base_name = os.path.splitext(os.path.basename(file_name))[0]
llama_out_path = os.path.join(DEST_DIR, f"{llama_base_name}_llama-parse.txt")

with open(llama_out_path, "w", encoding="utf-8") as f:
    # Separate documents with a blank line (same convention as the fitz and
    # pdfplumber extractors) so the end of one document is not fused with
    # the start of the next.
    f.write("\n\n".join(doc.text for doc in documents))

In [None]:
# Select the PDF for the OCR cells below, which read it via Path(file_path).
# This overrides whatever value the conversion loop left in `file_path`.
file_path = 'data/108-1_精神科與社區衛生護理.pdf'

# mistral ocr

In [None]:
from mistralai import Mistral
from dotenv import load_dotenv

# Load variables from a .env file (if one is found) into the environment.
load_dotenv()

api_key = os.getenv("MISTRAL_API_KEY")
if not api_key:
    # Fail here with an actionable message rather than building a client
    # with api_key=None that only errors out on the first request.
    raise RuntimeError(
        "MISTRAL_API_KEY is not set; define it in the environment or in a .env file"
    )
client = Mistral(api_key=api_key)

In [None]:
from pathlib import Path
from mistralai import DocumentURLChunk, ImageURLChunk, TextChunk
import json

# Verify the PDF selected in `file_path` exists as a regular file before any
# upload is attempted; the message names the path so a typo is easy to spot.
pdf_file = Path(file_path)
assert pdf_file.is_file(), f"PDF not found: {pdf_file}"

In [None]:
# Send the PDF's bytes to Mistral's file API, tagged for OCR use.
# The upload is named after the file's stem (file name without extension).
uploaded_file = client.files.upload(
    file=dict(file_name=pdf_file.stem, content=pdf_file.read_bytes()),
    purpose="ocr",
)

# Ask for a signed URL pointing at the uploaded file; the OCR request
# references the document through this URL.
# NOTE(review): expiry=1 is presumably expressed in hours -- confirm in the API docs.
signed_url = client.files.get_signed_url(
    file_id=uploaded_file.id,
    expiry=1,
)


In [None]:
# Run OCR on the uploaded PDF (referenced by its signed URL), asking for
# embedded images to be returned as base64 alongside the text.
ocr_source = DocumentURLChunk(document_url=signed_url.url)
pdf_response = client.ocr.process(
    model="mistral-ocr-latest",
    document=ocr_source,
    include_image_base64=True,
)

# Round-trip through the model's JSON dump to get plain dicts/lists.
response_dict = json.loads(pdf_response.model_dump_json())

# Persist the response as indented JSON; ensure_ascii=False keeps non-ASCII
# text readable in the file instead of \u escapes.
serialized = json.dumps(response_dict, indent=2, ensure_ascii=False)
with open('mistral.json', 'w', encoding="utf-8") as f:
    f.write(serialized)