In [1]:
import os

from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod

# args
pdf_file_name = "abc.pdf"  # replace with the real pdf path


def pdf_stem(path):
    """Return the file name of ``path`` without directories or its last extension.

    A plain ``path.split(".")[0]`` cuts at the first dot anywhere in the
    string, so ``"./abc.pdf"`` yields ``""``, ``"my.report.pdf"`` yields
    ``"my"`` and ``"/data/abc.pdf"`` keeps the ``/data/`` prefix. Using
    ``os.path`` keeps only the final path component and strips only the
    trailing extension.

    Args:
        path: Path (relative or absolute) to the input file.

    Returns:
        The base file name with its final extension removed.
    """
    return os.path.splitext(os.path.basename(path))[0]


# base name shared by every output file written further down
name_without_suff = pdf_stem(pdf_file_name)

# prepare env
local_md_dir = "output"
local_image_dir = "output/images"

# last path component of the image directory ("images"); handed to the
# markdown / content-list calls further down
image_dir = os.path.basename(local_image_dir)

# creating the image directory also creates its parent output directory
os.makedirs(local_image_dir, exist_ok=True)

# one writer per output directory
image_writer = FileBasedDataWriter(local_image_dir)
md_writer = FileBasedDataWriter(local_md_dir)

# read bytes
reader1 = FileBasedDataReader("")
pdf_bytes = reader1.read(pdf_file_name)  # read the pdf content

# proc
## Create Dataset Instance
ds = PymuDocDataset(pdf_bytes)

## inference
# a single flag, taken from the dataset's own classification, selects both
# the `ocr` argument of the analysis call and the pipeline mode below
use_ocr = ds.classify() == SupportedPdfParseMethod.OCR
infer_result = ds.apply(doc_analyze, ocr=use_ocr)

## pipeline
run_pipeline = infer_result.pipe_ocr_mode if use_ocr else infer_result.pipe_txt_mode
pipe_result = run_pipeline(image_writer)

### output locations, all derived from the pdf's base name
model_pdf_path = os.path.join(local_md_dir, f"{name_without_suff}_model.pdf")
layout_pdf_path = os.path.join(local_md_dir, f"{name_without_suff}_layout.pdf")
spans_pdf_path = os.path.join(local_md_dir, f"{name_without_suff}_spans.pdf")
md_file_name = f"{name_without_suff}.md"
content_list_file_name = f"{name_without_suff}_content_list.json"
middle_json_file_name = f"{name_without_suff}_middle.json"

### model result: drawn on each page, then fetched as an object
infer_result.draw_model(model_pdf_path)
model_inference_result = infer_result.get_infer_res()

### layout and spans results drawn on each page
pipe_result.draw_layout(layout_pdf_path)
pipe_result.draw_span(spans_pdf_path)

### markdown: get the content, then dump it through md_writer
md_content = pipe_result.get_markdown(image_dir)
pipe_result.dump_md(md_writer, md_file_name, image_dir)

### content list: get the content, then dump it through md_writer
content_list_content = pipe_result.get_content_list(image_dir)
pipe_result.dump_content_list(md_writer, content_list_file_name, image_dir)

### middle json: get the content, then dump it through md_writer
middle_json_content = pipe_result.get_middle_json()
pipe_result.dump_middle_json(md_writer, middle_json_file_name)

  from .autonotebook import tqdm as notebook_tqdm
[32m2025-03-21 09:26:52.075[0m | [1mINFO    [0m | [36mmagic_pdf.data.dataset[0m:[36m__init__[0m:[36m156[0m - [1mlang: None[0m


[93mimport tensorrt_llm failed, if do not use tensorrt, ignore this message[0m
[93mimport lmdeploy failed, if do not use lmdeploy, ignore this message[0m


[32m2025-03-21 09:26:52.352[0m | [1mINFO    [0m | [36mmagic_pdf.libs.pdf_check[0m:[36mdetect_invalid_chars[0m:[36m67[0m - [1mcid_count: 0, text_len: 8, cid_chars_radio: 0.0[0m
[32m2025-03-21 09:26:52.354[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__init__[0m:[36m78[0m - [1mDocAnalysis init, this may take some times, layout_model: doclayout_yolo, apply_formula: True, apply_ocr: True, apply_table: True, table_model: rapid_table, lang: None[0m
[32m2025-03-21 09:26:52.354[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__init__[0m:[36m92[0m - [1musing device: cpu[0m
[32m2025-03-21 09:26:52.355[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__init__[0m:[36m96[0m - [1musing models_dir: /home/du/.cache/modelscope/hub/models/opendatalab/PDF-Extract-Kit-1___0/models[0m


CustomVisionEncoderDecoderModel init
VariableUnimerNetModel init
VariableUnimerNetPatchEmbeddings init
VariableUnimerNetModel init
VariableUnimerNetPatchEmbeddings init
CustomMBartForCausalLM init
CustomMBartDecoder init


2025-03-21 09:26:56,289 - DownloadModel - DEBUG: /home/du/anaconda3/envs/mineru/lib/python3.10/site-packages/rapid_table/models/slanet-plus.onnx already exists
[2025-03-21 09:26:56,289] [   DEBUG] download_model.py:34 - /home/du/anaconda3/envs/mineru/lib/python3.10/site-packages/rapid_table/models/slanet-plus.onnx already exists
E0321 09:26:56.426005 12853 analysis_config.cc:169] Please use PaddlePaddle with GPU version.
E0321 09:26:56.538704 12853 analysis_config.cc:169] Please use PaddlePaddle with GPU version.
E0321 09:26:56.633754 12853 analysis_config.cc:169] Please use PaddlePaddle with GPU version.
[32m2025-03-21 09:26:56.735[0m | [1mINFO    [0m | [36mmagic_pdf.model.pdf_extract_kit[0m:[36m__init__[0m:[36m174[0m - [1mDocAnalysis init done![0m
[32m2025-03-21 09:26:56.735[0m | [1mINFO    [0m | [36mmagic_pdf.model.doc_analyze_by_custom_model[0m:[36mcustom_model_init[0m:[36m128[0m - [1mmodel init cost: 4.381706237792969[0m
[32m2025-03-21 09:26:58.285[0m | 