In [1]:
from docling.document_converter import DocumentConverter

import os
import json
import logging
import time
from pathlib import Path

from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.base_models import FigureElement, Table
from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
    PdfPipelineOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
#from docling.models.ocr_mac_model import OcrMacOptions
#from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions
from docling.models.tesseract_ocr_model import TesseractOcrOptions
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
import tesserocr

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#print(os.environ.get('TESSDATA_PREFIX'))
#os.environ['TESSDATA_PREFIX'] = '/opt/homebrew/share/tessdata/'
print(tesserocr.tesseract_version())
print(tesserocr.get_languages())

tesseract 5.4.1
 leptonica-1.83.1
  libgif 5.2.1 : libjpeg 8d (libjpeg-turbo 3.0.0) : libpng 1.6.46 : libtiff 4.7.0 : zlib 1.2.13 : libwebp 1.5.0 : libopenjp2 2.5.2
('/Users/yoyo/opt/anaconda3/envs/cathay_llm/share/tessdata/', ['afr', 'amh', 'ara', 'asm', 'aze', 'aze_cyrl', 'bel', 'ben', 'bod', 'bos', 'bre', 'bul', 'cat', 'ceb', 'ces', 'chi_sim', 'chi_sim_vert', 'chi_tra', 'chi_tra_vert', 'chr', 'cos', 'cym', 'dan', 'deu', 'div', 'dzo', 'ell', 'eng', 'enm', 'epo', 'equ', 'est', 'eus', 'fao', 'fas', 'fil', 'fin', 'fra', 'frk', 'frm', 'fry', 'gla', 'gle', 'glg', 'grc', 'guj', 'hat', 'heb', 'hin', 'hrv', 'hun', 'hye', 'iku', 'ind', 'isl', 'ita', 'ita_old', 'jav', 'jpn', 'jpn_vert', 'kan', 'kat', 'kat_old', 'kaz', 'khm', 'kir', 'kmr', 'kor', 'kor_vert', 'lao', 'lat', 'lav', 'lit', 'ltz', 'mal', 'mar', 'mkd', 'mlt', 'mon', 'mri', 'msa', 'mya', 'nep', 'nld', 'nor', 'oci', 'ori', 'osd', 'pan', 'pol', 'por', 'pus', 'que', 'ron', 'rus', 'san', 'sin', 'slk', 'slv', 'snd', 'spa', 'spa_old', 'sqi'

In [3]:
# document local path or URL
doc = "./test_doc/test_textMore.pdf" # simple text with a image and table
doc1 = "./test_doc/吉美利101利率變動型美元終身壽險(定期給付型)DM.pdf" # complex text with simple aesthetic table
doc2 = "./test_doc/美利雙寶利率變動型美元終身保險 (定期給付型)DM.pdf" # complex text with complex aesthetic table
doc3 = "./test_doc/國泰金控出勤管理須知(修正後).pdf" # simple text with a simple table
doc4 = "./test_doc/國泰金控員工國內出差要點.pdf" # simple text with a simple table
doc5 = "./test_doc/國泰金控員工國外出差要點.pdf" # simple text with complex table
doc6 = "./test_doc/國泰金融控股股份有限公司資訊安全管理辦法.pdf" # simple text

# DOCLING TESTING


converter = DocumentConverter()
result = converter.convert(doc)

# print result in markdown
print(result.document.export_to_markdown())

# print result in JSON
#print(result.document.export_to_dict())

# save file as MD
file_name = os.path.splitext(os.path.basename(doc))[0]

with open('docling_result_' + file_name + '.txt', 'w', encoding='utf-8') as f:
    f.write(result.document.export_to_markdown())

In [4]:
_log = logging.getLogger(__name__)

In [12]:
# custom conversion

logging.basicConfig(level=logging.INFO)
    
###########################################################################

# The following sections contain a combination of PipelineOptions
# and PDF Backends for various configurations.
# Uncomment one section at the time to see the differences in the output.

# PyPdfium without EasyOCR
# --------------------

pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = False
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = False
#pipeline_options.table_structure_options.do_cell_matching = True
###
pipeline_options.table_structure_options.mode = 'accurate'

pipeline_options.images_scale = 2
pipeline_options.generate_page_images = True
pipeline_options.generate_picture_images = True

doc_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
        pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend
        )
    }
)


# PyPdfium with EasyOCR
# -----------------
# pipeline_options = PdfPipelineOptions()
# pipeline_options.do_ocr = True
# pipeline_options.do_table_structure = True
# pipeline_options.table_structure_options.do_cell_matching = True

# doc_converter = DocumentConverter(
#     format_options={
#         InputFormat.PDF: PdfFormatOption(
#             pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend
#         )
#     }
# )

# PyPdfium with Tesseract
# ----------------------
'''
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
#pipeline_options.table_structure_options.do_cell_matching = False
pipeline_options.ocr_options.lang = ["chi_tra"]
pipeline_options.table_structure_options.mode = 'accurate'

pipeline_options.images_scale = 2
pipeline_options.generate_page_images = True
pipeline_options.generate_picture_images = True

pipeline_options.ocr_options = TesseractOcrOptions()

doc_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend)
   }
)
'''

# Docling Parse without EasyOCR
# -------------------------
'''
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = False
pipeline_options.do_table_structure = True
#pipeline_options.table_structure_options.do_cell_matching = True
pipeline_options.table_structure_options.do_cell_matching = False

pipeline_options.images_scale = 2
pipeline_options.generate_page_images = True
pipeline_options.generate_picture_images = True

doc_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    }
)
'''
# Docling Parse with EasyOCR
# ----------------------
'''
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
pipeline_options.ocr_options.lang = ["en"]
pipeline_options.accelerator_options = AcceleratorOptions(
    num_threads=4, device=AcceleratorDevice.AUTO
)

pipeline_options.images_scale = 2
pipeline_options.generate_page_images = True
pipeline_options.generate_picture_images = True

doc_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    }
)
'''
# Docling Parse with EasyOCR (CPU only)
# ----------------------
# pipeline_options = PdfPipelineOptions()
# pipeline_options.do_ocr = True
# pipeline_options.ocr_options.use_gpu = False  # <-- set this.
# pipeline_options.do_table_structure = True
# pipeline_options.table_structure_options.do_cell_matching = True

# doc_converter = DocumentConverter(
#     format_options={
#         InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
#     }
# )

# Docling Parse with Tesseract
# ----------------------
'''
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True
#pipeline_options.table_structure_options.do_cell_matching = True
pipeline_options.table_structure_options.do_cell_matching = False
pipeline_options.ocr_options.lang = ["chi_tra"]
pipeline_options.table_structure_options.mode = 'accurate'

pipeline_options.images_scale = 2
pipeline_options.generate_page_images = True
pipeline_options.generate_picture_images = True

pipeline_options.ocr_options = TesseractOcrOptions()

doc_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
   }
)
'''


# Docling Parse with Tesseract CLI
# ----------------------
# pipeline_options = PdfPipelineOptions()
# pipeline_options.do_ocr = True
# pipeline_options.do_table_structure = True
# pipeline_options.table_structure_options.do_cell_matching = True
# pipeline_options.ocr_options = TesseractCliOcrOptions()

# doc_converter = DocumentConverter(
#     format_options={
#         InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
#     }
# )

# Docling Parse with ocrmac(Mac only)
# ----------------------
# pipeline_options = PdfPipelineOptions()
# pipeline_options.do_ocr = True
# pipeline_options.do_table_structure = True
# pipeline_options.table_structure_options.do_cell_matching = True
# pipeline_options.ocr_options = OcrMacOptions()

# doc_converter = DocumentConverter(
#     format_options={
#         InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
#     }
# )

###########################################################################

start_time = time.time()
conv_result = doc_converter.convert(doc4)
end_time = time.time() - start_time

_log.info(f"Document converted in {end_time:.2f} seconds.")

## Export results
output_dir = Path("scratch")
output_dir.mkdir(parents=True, exist_ok=True)
doc_filename = conv_result.input.file.stem


# Export Deep Search document JSON format:
with (output_dir / f"{doc_filename}.json").open("w", encoding="utf-8") as fp:
    fp.write(json.dumps(conv_result.document.export_to_dict()))

# Export Text format:
with (output_dir / f"{doc_filename}.txt").open("w", encoding="utf-8") as fp:
    fp.write(conv_result.document.export_to_text())

'''
# Export Markdown format:
with (output_dir / f"{doc_filename}.md").open("w", encoding="utf-8") as fp:
    fp.write(conv_result.document.export_to_markdown())
'''

# Export Document Tags format:
with (output_dir / f"{doc_filename}.doctags").open("w", encoding="utf-8") as fp:
    fp.write(conv_result.document.export_to_document_tokens())

'''
# Save page images
    for page_no, page in conv_result.document.pages.items():
        page_no = page.page_no
        page_image_filename = output_dir / f"{doc_filename}-{page_no}.png"
        with page_image_filename.open("wb") as fp:
            page.image.pil_image.save(fp, format="PNG")
'''

# Save images of figures and tables
table_counter = 0
picture_counter = 0
for element, _level in conv_result.document.iterate_items():
    if isinstance(element, TableItem):
        table_counter += 1
        element_image_filename = (
            output_dir / f"{doc_filename}-table-{table_counter}.png"
        )
        with element_image_filename.open("wb") as fp:
            try:
                element.get_image(conv_result.document).save(fp, "PNG")
            except:
                pass

    if isinstance(element, PictureItem):
        picture_counter += 1
        element_image_filename = (
            output_dir / f"{doc_filename}-picture-{picture_counter}.png"
        )
        with element_image_filename.open("wb") as fp:
            element.get_image(conv_result.document).save(fp, "PNG")

# Save markdown with embedded pictures
md_filename = output_dir / f"{doc_filename}-with-images.md"
conv_result.document.save_as_markdown(md_filename, image_mode=ImageRefMode.EMBEDDED)

# Save markdown with externally referenced pictures
md_filename = output_dir / f"{doc_filename}-with-image-refs.md"
conv_result.document.save_as_markdown(md_filename, image_mode=ImageRefMode.REFERENCED)

# Save HTML with externally referenced pictures
html_filename = output_dir / f"{doc_filename}-with-image-refs.html"
conv_result.document.save_as_html(html_filename, image_mode=ImageRefMode.REFERENCED)

INFO:docling.document_converter:Going to convert document batch...
INFO:docling.utils.accelerator_utils:Accelerator device: 'mps'
INFO:docling.utils.accelerator_utils:Accelerator device: 'mps'
INFO:docling.pipeline.base_pipeline:Processing document 國泰金控員工國內出差要點.pdf
INFO:docling.document_converter:Finished converting document 國泰金控員工國內出差要點.pdf in 4.96 sec.
INFO:__main__:Document converted in 4.96 seconds.


# UNSTRUCTURE TESTING

In [9]:
from unstructured.partition.pdf import partition_pdf
from unstructured_client.models import operations, shared
from unstructured.staging.base import elements_from_dicts, elements_to_json
from unstructured_client import UnstructuredClient

import base64
from PIL import Image
import io
import webbrowser
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())
#print(os.getenv("unstructured_api_key"))


In [10]:
# Returns a List[Element] present in the pages of the parsed pdf document

elements = partition_pdf(
    filename=doc,                                          # mandatory
    strategy="hi_res",                                     # mandatory to use ``hi_res`` strategy
    languages=['chi_tra'],
    extract_images_in_pdf=True,                            # mandatory to set as ``True``
    extract_image_block_types=["Image", "Table"],          # optional
    extract_image_block_to_payload=False,                  # optional
    extract_image_block_output_dir="./scratch_unstructure",  # optional - only works when ``extract_image_block_to_payload=False``
    )
print("\n\n".join([str(el) for el in elements]))

INFO:pikepdf._core:pikepdf C++ to Python logger bridge initialized
INFO:unstructured_inference:Reading PDF for file: ./test_doc/test_textMore.pdf ...


國泰人壽祿美富利率變動型美元終身壽險（定期給付型） （給付項目：祝壽保險金、身故保險金或喪葬費用保險金、完全失能保險金） （本保險提供身故保險金分期定期給付） （本保險為不分紅保險單，不參加紅利分配，並無紅利給付項目） （本契約與以新臺幣收付之人身保險契約間，不得辦理契約轉換） （本商品部分年齡可能發生累積所繳保險費超出身故保險金給付之情形） （申訴電話：市話免費撥打0800-036-599、付費撥打02-2162-6201；傳真：0800-211-568；電子信箱（E-mail）： service@cathaylife.com.tw）

110.08.12國壽字第1100080120號函備查 111.12.01國壽字第1110120022號函備查 112.02.23依111.11.29金管保壽字第1110462568號函修正 113.07.01 依 113.06.27 金管保壽字第 11304921171 號令修正 113.12.31 依 113.09.23 金管保壽字第 1130427324 號函修正

號 令 修正 113042g324 號 函 修 正 批 計 及 其 他 約 定 書 , 均 為 本 你 險 事 人 的 真 意 , 不 得 拘泥 於 所 用 的 文 契約 保險 金 額 保險 費 、 各 項 保險 金 、 貨 幣 , 並 經 記載 於 保險 單 上 者 (本 匯款 銀行 所 收取 之 匯 出 費用 ( 含 所 收取 之 轉 匯 費 用 , 本 項 費用 以 匯款 本 公司 於 本 契約 有 效 期 間 內 之 每 一 利率 (百 分 之 二 點 二 五 ) 之 差 值 每 月 第 一 個 營業 日 窗 竺 ” 適用 於 本 所 累積 資產 的 實際 狀況 而 新 定 。 如 宣告 利率 裝 公 告 於 本 公司 網 站 (www. 平均 值 」 : 指 本 軒 約 保險 單 週年 奉 約 之 預 窟 利率 ss 自 究 補 三 點 二 五 單 所 載 本 奉 約 (不 含 其 他 附 約 、 後 之 金 額 為 準 。 指 就 每 一 保險 單 週年 日 依 第 店 五 條 本 保險 金 額 」 與 上 累計 增加 保險 金

第一條 保險契約的構成

本保險單條款、附著之要保書、批註及其他約定書，均為本保險契約（以下簡稱本契約）的構成部分。 本

In [11]:
from collections import Counter

display(Counter(type(element) for element in elements))

Counter({unstructured.documents.elements.NarrativeText: 89,
         unstructured.documents.elements.ListItem: 66,
         unstructured.documents.elements.Title: 58,
         unstructured.documents.elements.Image: 13,
         unstructured.documents.elements.FigureCaption: 5,
         unstructured.documents.elements.Table: 2,
         unstructured.documents.elements.Header: 1})

In [12]:
display(*[(type(element), element.text) for element in elements[10:13]])

(unstructured.documents.elements.ListItem,
 '四、「宣告利率」：指本公司每月第一個營業日宣告，適用於本契約之當月利率，該利率係參考市場利 率及本公司運用此類商品所累積資產的實際狀況而訂定。如當月未宣告者，以前一月之宣告利率為 當月之宣告利率。本契約宣告利率將公告於本公司網站（www.cathayholdings.com/life）。')

(unstructured.documents.elements.ListItem,
 '五、「前一保險單年度宣告利率平均值」：指本契約保險單週年日當月（不含）起算，往前推算十二個 月之宣告利率平均值或本契約之預定利率（百分之二點二五），兩者較高者為準。')

(unstructured.documents.elements.ListItem,
 '六、「基本保險金額」：指保險單所載本契約（不含其他附約、附加條款、批註條款）之保險金額，如 該金額有所變更時，以變更後之金額為準。')

In [13]:
tables = [el for el in elements if el.category == "Table"]

print(tables[1].text)
print(tables[1].metadata.text_as_html)

保險單年度 可借成數 可借金額上限 第 1 年 60% 第 2~6 年 70% 可借金額上限＝借款當日保單價值準備金 × 可借成數 第 7 年及以後 85%
None


# API

In [14]:
# extract table

if __name__ == "__main__":
    client = UnstructuredClient(
        api_key_auth=os.getenv("unstructured_api_key")
    )

    # Where to get the local file, relative to this .py file.
    local_input_filepath = doc

    # Where to store the retrieved HTML (and the processed JSON), relative to this .py file.
    local_output_filepath = "./scratch_unstructure"

    with open(local_input_filepath, "rb") as f:
        files = shared.Files(
            content=f.read(),
            file_name=local_input_filepath
        )

    request = operations.PartitionRequest(
        partition_parameters=shared.PartitionParameters(
            files=files,
            strategy=shared.Strategy.HI_RES,
            # specify ocr language, or you'll get poor result
            languages=["chi_tra"],
            split_pdf_page=True,
            split_pdf_allow_failed=True,
            split_pdf_concurrency_level=15
        )
    )

    try:
        result = await client.general.partition_async(
            request=request,
            server_url=os.getenv("unstructured_url")
        )

        ###
        if not result.elements:
            print("No elements were returned.")
        ###

        # Provide some minimal CSS for better table readability.
        table_css = "<head><style>table, th, td { border: 1px solid; }</style></head>"

        for element in result.elements:
            if "text_as_html" in element["metadata"]:
                # Surround the element's HTML with basic <html> and <body> tags, and add the minimal CSS.
                html_string = f"<!DOCTYPE html><html>{table_css}<body>{element['metadata']['text_as_html']}</body></html>"

                # Save the element's HTML to a local file.
                save_path = f"{local_output_filepath}/{element['element_id']}.html"
                file = open(save_path, 'w')
                file.write(html_string)
                file.close()

                # View the locally saved file in the local default web browser.
                webbrowser.open_new(f"file:///{os.getcwd()}/{save_path}")

        # Also get the elements for inspection and validation.
        dict_elements = elements_from_dicts(
            element_dicts=result.elements
        )

        # Save the elements as JSON.
        elements_to_json(
            elements=dict_elements,
            indent=2,
            filename=f"{local_output_filepath}/embedded-images-tables.json"
        )
    except Exception as e:
        print(e)

INFO:httpx:HTTP Request: GET https://api.unstructuredapp.io/general/docs "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.unstructured.io/general/v0/general "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.unstructured.io/general/v0/general "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.unstructured.io/general/v0/general "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.unstructured.io/general/v0/general "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.unstructured.io/general/v0/general "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.unstructured.io/general/v0/general "HTTP/1.1 200 OK"
0:108: execution error: 找不到檔案「部分物件」。 (-43)
0:108: execution error: 找不到檔案「部分物件」。 (-43)
0:108: execution error: 找不到檔案「部分物件」。 (-43)


In [15]:
# extract table and image as png

if __name__ == "__main__":
    client = UnstructuredClient(
        api_key_auth=os.getenv("unstructured_api_key")
    )

    # Where to get the local file, relative to this .py file.
    local_input_filepath = doc

    # Where to store the retrieved HTML (and the processed JSON), relative to this .py file.
    local_output_filepath = "./scratch_unstructure"

    with open(local_input_filepath, "rb") as f:
        files = shared.Files(
            content=f.read(),
            file_name=local_input_filepath
        )

    request = operations.PartitionRequest(
        partition_parameters=shared.PartitionParameters(
            files=files,
            strategy=shared.Strategy.HI_RES,
            languages=["chi_tra"],
            split_pdf_page=True,
            split_pdf_allow_failed=True,
            split_pdf_concurrency_level=15,
            # Extract the Base64-encoded representation of each
            # processed "Image" and "Table" element. Extract each into
            # an "image_base64" object, as a child of the
            # "metadata" object, for that element in the result.
            # Element type names, such as "Image" and "Table" here,
            # are case-insensitive.
            # Any available Unstructured element type is allowed.
            extract_image_block_types=["Image", "Table"]
        )
    )

    try:
        result = await client.general.partition_async(
            request=request,
            server_url=os.getenv("unstructured_url")
        )

        for element in result.elements:
            if "image_base64" in element["metadata"]:
                # Decode the Base64-encoded representation of the 
                # processed "Image" or "Table" element into its original
                # visual representation, and then show it.
                image_data = base64.b64decode(element["metadata"]["image_base64"])
                image = Image.open(io.BytesIO(image_data))
                image.show()
                image_filename = f"{local_output_filepath}/{element['element_id']}.png"
                image.save(image_filename)

    except Exception as e:
        print(e)

INFO:httpx:HTTP Request: GET https://api.unstructuredapp.io/general/docs "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.unstructured.io/general/v0/general "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.unstructured.io/general/v0/general "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.unstructured.io/general/v0/general "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.unstructured.io/general/v0/general "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.unstructured.io/general/v0/general "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.unstructured.io/general/v0/general "HTTP/1.1 200 OK"


In [16]:
# extract the whole file and output as html

if __name__ == "__main__":
    client = UnstructuredClient(
        api_key_auth=os.getenv("unstructured_api_key")
    )

    # Where to get the local file, relative to this .py file.
    local_input_filepath = doc

    # Where to store the retrieved HTML (and the processed JSON), relative to this .py file.
    local_output_filepath = "./scratch_unstructure"

    with open(local_input_filepath, "rb") as f:
        files = shared.Files(
            content=f.read(),
            file_name=local_input_filepath
        )

    request = operations.PartitionRequest(
        partition_parameters=shared.PartitionParameters(
            files=files,
            strategy=shared.Strategy.HI_RES,
            languages=["chi_tra"],
            split_pdf_page=True,
            split_pdf_allow_failed=True,
            split_pdf_concurrency_level=15,
            extract_image_block_types=["Image", "Table"]
        )
    )

    try:
        result = await client.general.partition_async(
            request=request,
            server_url=os.getenv("unstructured_url")
        )

        if result.elements:
            # Provide some minimal CSS for better table readability.
            table_css = "<head><style>table, th, td { border: 1px solid; }</style></head>"
            
            # Create an HTML string with all elements
            html_content = f"<!DOCTYPE html><html>{table_css}<body>"
            
            for element in result.elements:
                if "text_as_html" in element["metadata"]:
                    html_content += element["metadata"]["text_as_html"]
                elif element["type"] == "Image":
                    # For images, create an img tag with the base64 encoded image data
                    img_data = element["metadata"].get("image_base64", "")
                    html_content += f'<img src="data:image/png;base64,{img_data}" />'
                else:
                    # For other elements, just add the text content
                    html_content += f"<p>{element['text']}</p>"
            
            html_content += "</body></html>"
            
            # Save the HTML content to a file
            save_path = f"{local_output_filepath}/output.html"
            with open(save_path, 'w', encoding='utf-8') as file:
                file.write(html_content)
            
            # Open the saved HTML file in the default web browser
            webbrowser.open_new(f"file:///{os.path.abspath(save_path)}")
        else:
            print("No elements were returned.")
    except Exception as e:
        print(e)

INFO:httpx:HTTP Request: GET https://api.unstructuredapp.io/general/docs "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.unstructured.io/general/v0/general "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.unstructured.io/general/v0/general "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.unstructured.io/general/v0/general "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.unstructured.io/general/v0/general "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.unstructured.io/general/v0/general "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.unstructured.io/general/v0/general "HTTP/1.1 200 OK"
0:80: execution error: 找不到檔案「部分物件」。 (-43)


# PyMUPDF Testing

In [17]:
import pymupdf4llm

In [18]:
# Extract file as MD
md_text = pymupdf4llm.to_markdown(doc5)
Path("output.md").write_bytes(md_text.encode())

Processing ./test_doc/國泰金控員工國外出差要點.pdf...


13849

In [19]:
# Extract images & Chunking with metadata

md_text_images = pymupdf4llm.to_markdown(
    doc = doc5,
    #pages = list(range(0,7)),
    page_chunks = True,
    write_images = True,
    image_path = "images",
    image_format = "png",
    dpi=300
)

Processing ./test_doc/國泰金控員工國外出差要點.pdf...
