In [1]:
!pip list

Package                   Version
------------------------- --------------------
absl-py                   2.1.0
accelerate                1.4.0
aiohappyeyeballs          2.5.0
aiohttp                   3.11.13
aiosignal                 1.3.2
albucore                  0.0.23
albumentations            1.4.24
annotated-types           0.7.0
antlr4-python3-runtime    4.9.3
anyio                     4.6.2
argon2-cffi               23.1.0
argon2-cffi-bindings      21.2.0
arrow                     1.3.0
astor                     0.8.1
asttokens                 3.0.0
async-lru                 2.0.5
async-timeout             5.0.1
attrdict                  2.0.1
attrs                     24.3.0
babel                     2.16.0
bce-python-sdk            0.9.29
beautifulsoup4            4.12.3
black                     25.1.0
bleach                    6.2.0
blinker                   1.9.0
boto3                     1.37.9
botocore                  1.37.9
braceexpand               0.1.7
Brotli    

## 安装 MinerU

## MinerU 使用

In [2]:
import os
import logging
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod
from tqdm import tqdm

[93mimport tensorrt_llm failed, if do not use tensorrt, ignore this message[0m
[93mimport lmdeploy failed, if do not use lmdeploy, ignore this message[0m


In [8]:
# Configure the root logger for this notebook.
# `force=True` removes handlers already attached to the root logger before
# applying this configuration. Without it, `basicConfig` is a silent no-op when
# a handler already exists (e.g. the cell is re-run, or an imported library has
# configured logging), and the messages emitted via `logger` would not appear
# in the chosen format/level.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    force=True,
)

# Notebook-wide logger used by the PDF processing helpers.
logger = logging.getLogger('pdf_processor')

In [9]:
def process_pdf(pdf_path, output_dir) -> None:
    """
    Parse a single PDF with magic_pdf and write every artifact to ``output_dir``.

    The PDF is classified first. If the classifier reports
    ``SupportedPdfParseMethod.OCR`` the OCR pipeline is used, otherwise the
    text pipeline is used.

    Artifacts requested from the pipeline (names are passed to writers rooted
    at ``output_dir``; presumably resolved relative to it - confirm against
    ``FileBasedDataWriter``):

    * ``images/``                - directory handed to the image writer
    * ``model.pdf``              - model-output visualization
    * ``layout.pdf``             - layout visualization
    * ``spans.pdf``              - span visualization
    * ``<pdf basename>.md``      - markdown rendering
    * ``content_list.json``      - content list
    * ``middle.json``            - intermediate JSON

    Note that every file name except the markdown one is fixed, so processing
    a second PDF into the same ``output_dir`` targets the same paths.

    Args:
        pdf_path: Path to the PDF file to process.
        output_dir: Directory where outputs will be saved; it (and its
            ``images`` sub-directory) is created if missing.

    Returns:
        None. All results are written to disk.
    """
    pdf_filename = os.path.basename(pdf_path)
    base_filename = os.path.splitext(pdf_filename)[0]

    logger.info(f"Processing PDF: {pdf_filename}")

    # Prepare directory structure. makedirs on the nested path also creates
    # output_dir itself.
    images_dir_path = os.path.join(output_dir, "images")
    # Only the directory *name* is passed to the markdown/content-list
    # renderers, so image references in the outputs are relative.
    images_dir_name = os.path.basename(images_dir_path)

    os.makedirs(images_dir_path, exist_ok=True)
    logger.debug(f"Created images directory: {images_dir_path}")

    # Initialize file writers
    image_writer = FileBasedDataWriter(images_dir_path)
    md_writer = FileBasedDataWriter(output_dir)

    # Read PDF content
    pdf_reader = FileBasedDataReader("")
    pdf_bytes = pdf_reader.read(pdf_path)
    logger.debug(f"Read {len(pdf_bytes)} bytes from {pdf_filename}")

    # Classify the document so the matching pipeline can be chosen
    dataset = PymuDocDataset(pdf_bytes)
    pdf_type = dataset.classify()
    logger.info(f"Detected PDF type: {pdf_type}")

    # Apply appropriate processing based on PDF type
    if pdf_type == SupportedPdfParseMethod.OCR:
        logger.info(f"Using OCR mode for {pdf_filename}")
        inference_result = dataset.apply(doc_analyze, ocr=True)
        processing_result = inference_result.pipe_ocr_mode(image_writer)
    else:
        logger.info(f"Using text mode for {pdf_filename}")
        inference_result = dataset.apply(doc_analyze, ocr=False)
        processing_result = inference_result.pipe_txt_mode(image_writer)

    # Generate visualization PDFs
    logger.debug("Generating output files")
    model_pdf_path = os.path.join(output_dir, "model.pdf")
    inference_result.draw_model(model_pdf_path)
    logger.debug(f"Created model visualization: {model_pdf_path}")

    layout_pdf_path = os.path.join(output_dir, "layout.pdf")
    processing_result.draw_layout(layout_pdf_path)
    logger.debug(f"Created layout visualization: {layout_pdf_path}")

    spans_pdf_path = os.path.join(output_dir, "spans.pdf")
    processing_result.draw_span(spans_pdf_path)
    logger.debug(f"Created spans visualization: {spans_pdf_path}")

    # Write the markdown rendering. The dump_* methods are called directly;
    # the in-memory get_* variants are not needed because nothing here
    # consumes the content.
    markdown_path = f"{base_filename}.md"
    processing_result.dump_md(md_writer, markdown_path, images_dir_name)
    logger.info(f"Created markdown file: {markdown_path}")

    # Write the content list
    processing_result.dump_content_list(md_writer, "content_list.json", images_dir_name)
    logger.debug("Created content list JSON")

    # Write the middle JSON
    processing_result.dump_middle_json(md_writer, "middle.json")
    logger.debug("Created middle JSON file")

    logger.info(f"Successfully processed {pdf_filename}")


In [10]:
# Input document and destination directory, both relative to the notebook's
# working directory.
pdf_path = "../data/docs/2503.04697v1.pdf"
output_dir = "../output/pdf_parse"

In [11]:
# Run the parsing pipeline on the configured PDF; results are written to output_dir.
process_pdf(pdf_path=pdf_path, output_dir=output_dir)

[32m2025-03-18 20:41:51.254[0m | [1mINFO    [0m | [36mmagic_pdf.data.dataset[0m:[36m__init__[0m:[36m156[0m - [1mlang: None[0m
[32m2025-03-18 20:41:52.967[0m | [1mINFO    [0m | [36mmagic_pdf.libs.pdf_check[0m:[36mdetect_invalid_chars[0m:[36m67[0m - [1mcid_count: 0, text_len: 30225, cid_chars_radio: 0.0[0m
[32m2025-03-18 20:41:52.979[0m | [1mINFO    [0m | [36mmagic_pdf.model.doc_analyze_by_custom_model[0m:[36mdoc_analyze[0m:[36m180[0m - [1mgpu_memory: 24 GB, batch_ratio: 8[0m
[32m2025-03-18 20:41:55.899[0m | [1mINFO    [0m | [36mmagic_pdf.model.batch_analyze[0m:[36m__call__[0m:[36m74[0m - [1mlayout time: 2.0, image num: 20[0m
[32m2025-03-18 20:41:57.888[0m | [1mINFO    [0m | [36mmagic_pdf.model.batch_analyze[0m:[36m__call__[0m:[36m85[0m - [1mmfd time: 1.99, image num: 20[0m
[32m2025-03-18 20:42:05.186[0m | [1mINFO    [0m | [36mmagic_pdf.model.batch_analyze[0m:[36m__call__[0m:[36m100[0m - [1mmfr time: 7.3, image num: 1