In [1]:
import json
import os

from uniparser_tools.api.clients import UniParserClient
from uniparser_tools.common.constant import ParseMode, ParseModeTextual
from uniparser_tools.tools.caption_extraction.main import Dict, ImageWithCaption, main


###### The following is sample code; adapt it to your own setup.

# ==============================================================================================
# Parsing MUST go through one of these endpoints:
#   http://{UNIPARSER_PUBLIC_IP}:30001   [public network]
#   http://{UNIPARSER_PRIVATE_IP}:30001  [private network]
# Fetch the result for the token and post-process it yourself; results parsed
# through other endpoints do not support image-caption pair extraction.

UNIPARSER_PRIVATE_IP = os.getenv('UNIPARSER_PRIVATE_IP')
if not UNIPARSER_PRIVATE_IP:
    # Fail fast: os.getenv returns None when the variable is unset, and the
    # f-string below would then silently build the host "http://None:30001".
    raise RuntimeError("environment variable UNIPARSER_PRIVATE_IP is not set")
parser = UniParserClient(user="admin", host=f"http://{UNIPARSER_PRIVATE_IP}:30001")

# Sample task: the token identifies the parse job, input_file is the PDF that is
# submitted, and save_dir receives every artifact written by the cells below.
token = "aeb129da-4a36-4627-9cb2-c721def237f0"
input_file = "./tasks/aeb129da-4a36-4627-9cb2-c721def237f0.pdf"
save_dir = "./outputs/caption_extraction"
os.makedirs(save_dir, exist_ok=True)

## Step.01 提交pdf进行解析并获取解析结果

In [2]:
# Per-content-type parse modes forwarded to the parser for this submission.
parse_options = {
    "textual": ParseModeTextual.DigitalExported,
    "table": ParseMode.OCRFast,
    "molecule": ParseMode.OCRFast,
    "chart": ParseMode.DumpBase64,
    "figure": ParseMode.DumpBase64,
    "expression": ParseMode.DumpBase64,
    "equation": ParseMode.OCRFast,
}

# Submit the PDF under the chosen token.
trigger_result = parser.trigger_file(file_path=input_file, token=token, **parse_options)

# Any status other than "success" aborts the notebook after dumping the full response.
if not trigger_result["status"] == "success":
    print(json.dumps(trigger_result, indent=4))
    raise Exception("trigger file failed")

print(f"trigger success, token: {trigger_result['token']}")

trigger success, token: aeb129da-4a36-4627-9cb2-c721def237f0


In [3]:
# Fetch the parse result for the token; the "pages_dict" entry of the response
# is what gets persisted for the extraction step.
result = parser.get_result(token, pages_dict=True)
if result["status"] != "success":
    # Dump the whole response so the failure reason is visible in the cell output.
    print(json.dumps(result, indent=4))
    raise Exception("get result failed")

# Write through a context manager so the file is guaranteed to be flushed and
# closed before a later cell reads it back (a bare open() leaves that to the GC).
with open(f"{save_dir}/{token}.json", "w", encoding="utf-8") as pages_file:
    json.dump(result["pages_dict"], pages_file, indent=4)


## Step.02 提取图文对

In [4]:
# ==============================================================================================

pdf_path = input_file
json_path = f"{save_dir}/{token}.json"  # pages dict obtained from get-results

# Directory handed to main(). `save_dir` itself is never rebound here, so the
# manual-save branch below still has a real output root and re-running this
# cell behaves the same every time.
auto_save_dir = save_dir  # auto-save, recommended
# auto_save_dir = None  # don't save; results are returned and must be saved manually

# This call may print a large amount of log output.
results = main(
    token=token,
    pdf_path=pdf_path,
    json_path=json_path,
    save_dir=auto_save_dir,
    dpi=300,  # try a low DPI for a test run first, then extract at a high DPI
    log_level="ERROR",  # debug (full output) | info | warning | error (fully silent)
)

# Manual save: only taken when auto-save was disabled and main() returned results.
if not auto_save_dir and results:
    result_token = results["token"]
    extract_ratio = results["extract_ratio"]  # not written to disk; kept for inspection
    extracted: Dict[str, ImageWithCaption] = results["extracted"]
    global_info = results["global_info"]

    # Build the target under the real output root. Formatting the (falsy)
    # auto-save value into the path would create a literal "None/..." directory.
    manual_dir = f"{save_dir}/{result_token}_manual"
    os.makedirs(manual_dir, exist_ok=True)

    for k, item in extracted.items():
        # Three images per extracted entry: the figure, its caption, and the group.
        # NOTE(review): presumably PIL images (they expose .save/.size) — confirm.
        item.main_image.save(os.path.join(manual_dir, f"{k}.image.png"))
        item.caption_image.save(os.path.join(manual_dir, f"{k}.caption.png"))
        item.group_image.save(os.path.join(manual_dir, f"{k}.group.png"))

        item_info = {
            "group_size": item.group_image.size,
            "image_size": item.main_image.size,
            "image_concat_type": item.image_concat_type.value,
            "captions": item.captions,
            "contexts": item.contexts,
            "keywords": item.keywords,
            "subfigures_info": item.subfigures_info,
            "task": item.task,
        }
        # Context manager closes each per-item file instead of leaking one
        # open handle per loop iteration.
        with open(os.path.join(manual_dir, f"{k}.json"), "w", encoding="utf-8") as f:
            json.dump(
                item_info,
                f,
                ensure_ascii=False,
                indent=4,
            )

    with open(os.path.join(manual_dir, f"{result_token}_global_info.json"), "w", encoding="utf-8") as f:
        json.dump(
            global_info,
            f,
            ensure_ascii=False,
            indent=4,
        )