本教程介绍UniParser Tools的基础用法，包括如何使用它来解析多种文件和格式化文本数据。

本文以 40001 端口提供的UniParser服务为例进行演示。

## 导入依赖

In [1]:
import json
import os

from uniparser_tools.api.clients import UniParserClient
from uniparser_tools.common.constant import FormatFlag, ParseMode, ParseModeTextual
from uniparser_tools.utils.convert import dict2obj


## 初始化

In [2]:
# Several hosts are currently available, but they do not all offer the same
# features, and parsing quality differs between them.
# Ask in the after-sales support group for details about each host.
# Keep concurrency low: the public Uni-Parser service allows at most
# 5 concurrent requests.

# Official site endpoint
host = "https://uniparser.dp.tech/"

# Authentication key, read from the environment; set UNIPARSER_API_KEY to
# your own API key before running this cell.
api_key = os.environ.get('UNIPARSER_API_KEY')

# Client object used by the cells below
parser = UniParserClient(host=host, api_key=api_key)

# Directory where the parse results are saved (created if it does not exist)
save_dir = "./outputs/quick_start"
os.makedirs(save_dir, exist_ok=True)

## 1. PDF文件解析

### 提交解析

In [3]:
# Path of the PDF file to parse
pdf_path = "./tasks/He_Deep_Residual_Learning_CVPR_2016_paper.pdf"

# Parse mode requested for each semantic type, passed as keyword arguments
file_parse_modes = {
    "textual": ParseModeTextual.DigitalExported,
    "table": ParseMode.OCRFast,
    "molecule": ParseMode.OCRFast,
    "chart": ParseMode.DumpBase64,
    "figure": ParseMode.DumpBase64,
    "expression": ParseMode.DumpBase64,
    "equation": ParseMode.OCRFast,
}

# Submit the parse task
trigger_file_result = parser.trigger_file(pdf_path, **file_parse_modes)

if trigger_file_result["status"] == "success":
    # The token is what the later cells use to fetch the parse result
    print(trigger_file_result['token'])
else:
    # Show the full response before aborting
    print(json.dumps(trigger_file_result, indent=4))
    raise Exception("trigger file failed")

41c7700e6fbf5896884ea2f28002a85f


### 获取结果
> 可以持有token多次进行获取
> 
> 可获取不同格式的数据，以供不同的场景使用
>
> formatted 只对objects和content产生作用，pages_dict和pages_tree不受影响

#### - Content (End2End) 格式输出
> 主要用于获取解析后的文本内容，返回内容为str格式的全文内容，可以用于LLM等场景
> 
> 在同一次格式化输出中，可以设置不同语义的输出模式，例如表格使用HTML进行输出，公式使用LaTeX进行输出，文本使用Markdown进行输出

In [4]:
# A successful submission returns a token, which is used to fetch the parse
# result. Here the full-text content is fetched once per output format and
# written to "<save_dir>/<token>.<suffix>".
assert trigger_file_result["status"] == "success"
token = trigger_file_result["token"]
for formatted in list(FormatFlag)[1:]:
    # content must be True
    # `formatted` only affects objects and content; pages_dict and pages_tree
    # are not affected by it.
    result = parser.get_formatted(
        token,
        content=True,
        objects=False,
        pages_dict=False,
        pages_tree=False,
        molecule_source=False,
        textual=formatted,
        chart=formatted,
        table=formatted,
        molecule=formatted,
        equation=formatted,
        figure=formatted,
        expression=formatted,
    )
    if result["status"] != "success":
        print(f"Get formatted failed for {formatted}, results is: {json.dumps(result, indent=4)}")
        continue

    # Optional wrapper text written before/after the content, plus the file
    # extension to use for this format.
    head, tail = "", ""
    suffix = ""
    if formatted == "latex":
        # Every backslash is escaped explicitly: "\d" and "\e" are not valid
        # escape sequences and trigger a warning on recent Python versions.
        head = "\\documentclass{article}\n\n\\usepackage{booktabs}\n\n\\begin{document}\n"
        tail = "\\end{document}"
        suffix = "tex"
    elif formatted == "html":
        head = "<html>\n\n<body>\n"
        tail = "</body>\n\n</html>"
        suffix = "html"
    elif formatted == "markdown":
        suffix = "md"
    elif formatted == "plain":
        suffix = "plain"
    elif formatted == "markup":
        suffix = "txt"
    else:
        print(f"Unknown format: {formatted}")
        continue

    # Write as UTF-8 explicitly so that non-ASCII content does not depend on
    # the platform's default encoding.
    with open(f"{save_dir}/{token}.{suffix}", "w", encoding="utf-8") as f:
        if head:
            f.write(head + "\n")
        try:
            f.write(result["content"])
        except (KeyError, TypeError, UnicodeError) as err:
            # Best effort: still close the document with its tail, but report
            # the problem instead of silently leaving an empty body.
            print(f"Failed to write content for {formatted}: {err!r}")
        if tail:
            f.write(tail + "\n")


#### - Objects 格式输出
> 主要用于获取解析后的语义块，返回内容为json格式的全文语义块，可以用于后续的语义分析等场景
> 
> 目前主要用于Uni-Miner标注平台等场景

In [5]:
# Fetch the parse result as a list of semantic blocks ("objects"), rendered
# in the Markup format.
assert trigger_file_result["status"] == "success"
token = trigger_file_result["token"]
formatted = FormatFlag.Markup
result = parser.get_formatted(
    token,
    content=False,
    objects=True,
    pages_dict=False,
    pages_tree=False,
    molecule_source=False,
    textual=formatted,
    chart=formatted,
    table=formatted,
    molecule=formatted,
    equation=formatted,
    figure=formatted,
    expression=formatted,
)
if result["status"] != "success":
    print(f"Get formatted failed for {formatted}, results is: {json.dumps(result, indent=4)}")
    # Stop here rather than continuing with a failed response: the following
    # cell indexes result["objects"].
    raise RuntimeError("get formatted failed")


In [6]:
result["objects"][:5]  # First 5 semantic blocks of the whole document (all pages merged)

[{'class': 'documenttitle',
  'confidence': 0.888671875,
  'float_xyxy': [0.24928193933823528,
   0.1336669921875,
   0.7216222426470589,
   0.1522216796875],
  'page': 0,
  'str': '\\begin{documenttitle}\nDeep Residual Learning for Image Recognition\n\\end{documenttitle}\n'},
 {'class': 'paragraph',
  'confidence': 0.533203125,
  'float_xyxy': [0.22179457720588236,
   0.1917724609375,
   0.7538488051470589,
   0.2474365234375],
  'page': 0,
  'str': 'Kaiming He Xiangyu Zhang Shaoqing Ren Jian Sun Microsoft Research {kahe, v-xiangz, v-shren, jiansun}@microsoft.com\n'},
 {'class': 'title',
  'confidence': 0.85595703125,
  'float_xyxy': [0.23743393841911764,
   0.284912109375,
   0.3119973575367647,
   0.297119140625],
  'page': 0,
  'str': '\\begin{title}\nAbstract\n\\end{title}\n'},
 {'class': 'paragraph',
  'confidence': 0.97998046875,
  'float_xyxy': [0.0800924862132353,
   0.310546875,
   0.4694967830882353,
   0.5341796875],
  'page': 0,
  'str': 'Deeper neural networks are more di

#### - Pages dict 格式输出
> Uni-Parser 的原始解析格式之一，未经过任何format化，返回内容为json格式的全文语义块，可以用于后续的语义分析等场景
> 
> 内容详尽充分，但是格式较为复杂，需要自行解析；可以自行进行重排序、去重等操作，是高阶玩法必备

In [7]:
# Fetch the raw per-page parse result ("pages_dict"). The format flag is
# still passed, but it only affects objects and content, not pages_dict.
assert trigger_file_result["status"] == "success"
token = trigger_file_result["token"]
formatted = FormatFlag.Plain
result = parser.get_formatted(
    token,
    content=False,
    objects=False,
    pages_dict=True,
    pages_tree=False,
    molecule_source=False,
    textual=formatted,
    chart=formatted,
    table=formatted,
    molecule=formatted,
    equation=formatted,
    figure=formatted,
    expression=formatted,
)
if result["status"] != "success":
    print(f"Get formatted failed for {formatted}, results is: {json.dumps(result, indent=4)}")
    # Stop here rather than continuing with a failed response: the following
    # cells index result["pages_dict"].
    raise RuntimeError("get formatted failed")


In [8]:
result["pages_dict"][0][:5]  # First 5 semantic blocks of the first page (blocks are split by page)

[{'token': '41c7700e6fbf5896884ea2f28002a85f',
  'page': 0,
  'block': 14,
  'bbox': {'x1': 0.24928193933823528,
   'y1': 0.1336669921875,
   'x2': 0.7216222426470589,
   'y2': 0.1522216796875},
  'conf': 0.888671875,
  'page_size': [1224, 1584],
  'type': 'documenttitle',
  'hidden': False,
  'order': 0,
  'lang': 'en',
  'direction': -1,
  'source': '',
  'bboxes': [{'x1': 0.25073202295241015,
    'y1': 0.13367483350965711,
    'x2': 0.30150634167241114,
    'y2': 0.15178872118092546},
   {'x1': 0.30736691344018074,
    'y1': 0.13367483350965711,
    'x2': 0.39463965721379696,
    'y2': 0.15178872118092546},
   {'x1': 0.40049970539566737,
    'y1': 0.13367483350965711,
    'x2': 0.4926251679464103,
    'y2': 0.15178872118092546},
   {'x1': 0.49848539065691383,
    'y1': 0.13367483350965711,
    'x2': 0.5278343250548917,
    'y2': 0.15178872118092546},
   {'x1': 0.5336946973613664,
    'y1': 0.13367483350965711,
    'x2': 0.5961897108289931,
    'y2': 0.15178872118092546},
   {'x1': 0

In [9]:
pages_dict = dict2obj(result["pages_dict"])  # pages_dict can be converted to objects, which is convenient for later processing

In [10]:
# First 5 entries of the first page after the dict2obj conversion
pages_dict[0][:5]

[TextualResult(token='41c7700e6fbf5896884ea2f28002a85f', page=0, block=14, bbox=BBox(x1=0.24928193933823528, y1=0.1336669921875, x2=0.7216222426470589, y2=0.1522216796875), conf=0.888671875, page_size=[1224, 1584], type='documenttitle', hidden=False, order=0, lang='en', direction=<Direction.Normal: -1>, source='', bboxes=[BBox(x1=0.25073202295241015, y1=0.13367483350965711, x2=0.30150634167241114, y2=0.15178872118092546), BBox(x1=0.30736691344018074, y1=0.13367483350965711, x2=0.39463965721379696, y2=0.15178872118092546), BBox(x1=0.40049970539566737, y1=0.13367483350965711, x2=0.4926251679464103, y2=0.15178872118092546), BBox(x1=0.49848539065691383, y1=0.13367483350965711, x2=0.5278343250548917, y2=0.15178872118092546), BBox(x1=0.5336946973613664, y1=0.13367483350965711, x2=0.5961897108289931, y2=0.15178872118092546), BBox(x1=0.6020501828661152, y1=0.13367483350965711, x2=0.721859801049326, y2=0.15178872118092546)], contents=['Deep', 'Residual', 'Learning', 'for', 'Image', 'Recognition

#### - Pages tree 格式输出
> Uni-Parser 的原始解析格式，比pages dict更加复杂，是带有嵌套关系的树结构
> 
> 未经过任何format化，返回内容为json格式的全文语义块，可以用于后续的语义分析等场景
> 
> 内容详尽充分，但是格式复杂，需要自行解析；可以自行进行重排序、去重等操作，是高阶玩法必备
>
> 目前40001端口不支持，放在advance中介绍

## 2. 图片文件解析

In [11]:
# Path of the image file to parse
snip_path = "./tasks/0711.2032v1_page20.png"

# Submit the parse task for the image, with one parse mode per semantic type
trigger_snip_result = parser.trigger_snip(
    snip_path,
    textual=ParseModeTextual.OCRFast,
    table=ParseMode.OCRFast,
    molecule=ParseMode.OCRFast,
    chart=ParseMode.DumpBase64,
    figure=ParseMode.DumpBase64,
    expression=ParseMode.DumpBase64,
    equation=ParseMode.OCRFast,
)
if trigger_snip_result["status"] != "success":
    # Show the full response before aborting
    print(json.dumps(trigger_snip_result, indent=4))
    # Name the call that actually failed (this cell submits a snip, not a file)
    raise Exception("trigger snip failed")
print(trigger_snip_result['token'])

632927fcb88c5b329c164b607d97d5af


In [12]:
# A successful submission returns a token, which is used to fetch the parse
# result. Here the content of the parsed image is fetched once per output
# format and written to "<save_dir>/<token>.<suffix>".
assert trigger_snip_result["status"] == "success"
token = trigger_snip_result["token"]
for formatted in list(FormatFlag)[1:]:
    # content must be True
    # `formatted` only affects objects and content; pages_dict and pages_tree
    # are not affected by it.
    result = parser.get_formatted(
        token,
        content=True,
        objects=False,
        pages_dict=False,
        pages_tree=False,
        molecule_source=False,
        textual=formatted,
        chart=formatted,
        table=formatted,
        molecule=formatted,
        equation=formatted,
        figure=formatted,
        expression=formatted,
    )
    if result["status"] != "success":
        print(f"Get formatted failed for {formatted}, results is: {json.dumps(result, indent=4)}")
        continue

    # Optional wrapper text written before/after the content, plus the file
    # extension to use for this format.
    head, tail = "", ""
    suffix = ""
    if formatted == "latex":
        # Every backslash is escaped explicitly: "\d" and "\e" are not valid
        # escape sequences and trigger a warning on recent Python versions.
        head = "\\documentclass{article}\n\n\\usepackage{booktabs}\n\n\\begin{document}\n"
        tail = "\\end{document}"
        suffix = "tex"
    elif formatted == "html":
        head = "<html>\n\n<body>\n"
        tail = "</body>\n\n</html>"
        suffix = "html"
    elif formatted == "markdown":
        suffix = "md"
    elif formatted == "plain":
        suffix = "plain"
    elif formatted == "markup":
        suffix = "txt"
    else:
        print(f"Unknown format: {formatted}")
        continue

    # Write as UTF-8 explicitly so that non-ASCII content does not depend on
    # the platform's default encoding.
    with open(f"{save_dir}/{token}.{suffix}", "w", encoding="utf-8") as f:
        if head:
            f.write(head + "\n")
        try:
            f.write(result["content"])
        except (KeyError, TypeError, UnicodeError) as err:
            # Best effort: still close the document with its tail, but report
            # the problem instead of silently leaving an empty body.
            print(f"Failed to write content for {formatted}: {err!r}")
        if tail:
            f.write(tail + "\n")


## 3. PDF URL 解析

In [13]:
# URL of the PDF to parse. PDFs hosted on external networks may run into
# network problems, so this is not recommended; set a proxy if necessary.
pdf_url = "https://www.cv-foundation.org/openaccess/content_cvpr_2016/papers/He_Deep_Residual_Learning_CVPR_2016_paper.pdf"


# Submit the parse task for the URL, with one parse mode per semantic type
trigger_url_result = parser.trigger_url(
    pdf_url,
    textual=ParseModeTextual.OCRFast,
    table=ParseMode.OCRFast,
    molecule=ParseMode.OCRFast,
    chart=ParseMode.DumpBase64,
    figure=ParseMode.DumpBase64,
    expression=ParseMode.DumpBase64,
    equation=ParseMode.OCRFast,
    proxy=None,
)
if trigger_url_result["status"] != "success":
    # Show the full response before aborting
    print(json.dumps(trigger_url_result, indent=4))
    # Print the traceback on its own only when the response carries one, so a
    # missing key cannot raise KeyError and hide the real failure below.
    if "traceback" in trigger_url_result:
        print(trigger_url_result['traceback'])
    # Name the call that actually failed (this cell submits a URL, not a file)
    raise Exception("trigger url failed")

In [14]:
# A successful submission returns a token, which is used to fetch the parse
# result. Here the content of the PDF fetched by URL is retrieved once per
# output format and written to "<save_dir>/<token>.<suffix>".
assert trigger_url_result["status"] == "success"
token = trigger_url_result["token"]
for formatted in list(FormatFlag)[1:]:
    # content must be True
    # `formatted` only affects objects and content; pages_dict and pages_tree
    # are not affected by it.
    result = parser.get_formatted(
        token,
        content=True,
        objects=False,
        pages_dict=False,
        pages_tree=False,
        molecule_source=False,
        textual=formatted,
        chart=formatted,
        table=formatted,
        molecule=formatted,
        equation=formatted,
        figure=formatted,
        expression=formatted,
    )
    if result["status"] != "success":
        print(f"Get formatted failed for {formatted}, results is: {json.dumps(result, indent=4)}")
        continue

    # Optional wrapper text written before/after the content, plus the file
    # extension to use for this format.
    head, tail = "", ""
    suffix = ""
    if formatted == "latex":
        # Every backslash is escaped explicitly: "\d" and "\e" are not valid
        # escape sequences and trigger a warning on recent Python versions.
        head = "\\documentclass{article}\n\n\\usepackage{booktabs}\n\n\\begin{document}\n"
        tail = "\\end{document}"
        suffix = "tex"
    elif formatted == "html":
        head = "<html>\n\n<body>\n"
        tail = "</body>\n\n</html>"
        suffix = "html"
    elif formatted == "markdown":
        suffix = "md"
    elif formatted == "plain":
        suffix = "plain"
    elif formatted == "markup":
        suffix = "txt"
    else:
        print(f"Unknown format: {formatted}")
        continue

    # Write as UTF-8 explicitly so that non-ASCII content does not depend on
    # the platform's default encoding.
    with open(f"{save_dir}/{token}.{suffix}", "w", encoding="utf-8") as f:
        if head:
            f.write(head + "\n")
        try:
            f.write(result["content"])
        except (KeyError, TypeError, UnicodeError) as err:
            # Best effort: still close the document with its tail, but report
            # the problem instead of silently leaving an empty body.
            print(f"Failed to write content for {formatted}: {err!r}")
        if tail:
            f.write(tail + "\n")
