## 结构化抽取示例

#### lanchain抽取结构化数据

In [18]:
# langchain extractor
# author: sizhong du
# since: 2025-02-26

"""
conda create -n doc-extractor python=3.10
pip install langchian langchain-community langchain-openai
"""

# langsmith
# api-key: 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
# pip install -U langsmith
import os
# import getpass
# # os.environ["LANGCHAIN_TRACING_V2"] = "true"
# # os.environ["LANGCHAIN_API_KEY"] = getpass.getpass()
os.environ['LANGSMITH_TRACING']="true"
os.environ['LANGCHAIN_ENDPOINT']="https://api.smith.langchain.com"
os.environ['LANGCHAIN_API_KEY']="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"
os.environ['LANGCHAIN_PROJECT']="python-examples"


# doc_path = r'./files/正式招标文件.docx'
doc_path = './files/招标文件-唐山乐亭绿色交通车储一体化储能电站项目设备采购（定稿）.docx'
# doc_path = './files/test.docx'


# 加载文档
# load docx
# pip install python-docx
from docx import Document as WordDocument
def load_docx(doc_path):
    doc = WordDocument(doc_path)
    tables = doc.tables
    table_idx = 0
    full_text = []
    for element in doc.element.body:
        if element.tag.endswith('p'):
            para_text = element.text.strip()
            full_text.append(para_text)
        elif element.tag.endswith('tbl'):
            if table_idx < len(tables):
                for row in tables[table_idx].rows:
                    row_content = [cell.text.strip() if cell.text else '' for cell in row.cells]
                    row_text = '\t'.join(row_content)
                    full_text.append(row_text)
                    table_idx += 1
    return '\n'.join(full_text)
    # return full_text
full_text = load_docx(doc_path)
print('full_text', len(full_text))


# 切分文档
# text splitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
def split_text(text):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000, # 根据GPU显存调整
        chunk_overlap=200, # 每段拆分内容重复保留200字，以保持上下文语意连续性
        length_function=len,
        separators=["\n\n## ", "\n\n", "\n", " "] # 保留标题结构
    )
    splited_text = text_splitter.split_text(text)
    return splited_text

from langchain_text_splitters import TokenTextSplitter
def split_text2(text):
    text_splitter = TokenTextSplitter(
        # Controls the size of each chunk
        chunk_size=2000,
        # Controls overlap between chunks
        chunk_overlap=20,
    )
    splited_text = text_splitter.split_text(text)
    return splited_text

splited_text = split_text2(full_text)
print('splited_text', len(splited_text))


# 定义结构化模型
# pydantic方式
from typing import Optional
from pydantic import BaseModel, Field
class Project(BaseModel):
    projectNo: Optional[str] = Field(default=None, description="项目编号")
    projectName: Optional[str] = Field(default=None, description="项目名称")
    projectContent: Optional[str] = Field(default=None, description="项目概况")
    tendereeName: Optional[str] = Field(default=None, description="招标人")
    tenderAgency: Optional[str] = Field(default=None, description="招标代理机构")
    # bidRequirement: Optional[str] = Field(default=None, description="投标人资格要求")
    tenderFileStartTime: Optional[str] = Field(default=None, description="文件发售开始时间")
    tenderFileEndTime: Optional[str] = Field(default=None, description="文件发售开始时间")
    tenderFileGetMethod: Optional[str] = Field(default=None, description="文件获取方式")
    bidFileEndTime: Optional[str] = Field(default=None, description="投标文件递交的截止时间")
    bidOpenTime: Optional[str] = Field(default=None, description="开标时间")

# TypedDict方式
from typing_extensions import Annotated, TypedDict
class Project2(TypedDict):
    projectNo: Annotated[str, ..., "项目编号或招标编号"]
    projectName: Annotated[str, ..., "项目名称"]
    tendereeName: Annotated[str, ..., "招标人或招标单位"]
    tendereeContactName: Annotated[str, ..., "招标人或招标单位的联系人"]
    tendereeContactPhone: Annotated[str, ..., "招标人或招标单位的联系电话"]
    tenderAgency: Annotated[str, ..., "招标代理或招标代理机构"]
    # tenderAgencyContactName: Annotated[str, ..., "招标代理或招标代理机构联系人"]
    # tenderAgencyContactPhone: Annotated[str, ..., "招标代理或招标代理机构联系电话"]
    tenderFileStartTime: Annotated[str, ..., "文件发售开始时间"]
    tenderFileEndTime: Annotated[str, ..., "文件发售开始时间"]
    tenderFileGetMethod: Annotated[str, ..., "文件获取方式"]
    bidFileEndTime: Annotated[str, ..., "投标截止时间或投标文件递交截止时间"]
    bidOpenTime: Annotated[str, ..., "开标时间"]
    # bidGuaranteeAmount: Annotated[str, ..., "投标保证金金额"]
    # bidGuaramteeType: Annotated[str, ..., "投标保证金递交方式"]
    # bidGuaramteeReturnTime: Annotated[str, ..., "投标保证金退还时间"]
    # rating: Annotated[Optional[int], None, "How funny the joke is, from 1 to 10"]

# json方式
json_schema = {
    "title": "Project",
    "description": "项目信息详情",
    "type": "object",
    "properties": {
        "projectNo": {"type": "string", "description": "项目编号",},
        "projectName": {"type": "string", "description": "项目名称",},
        "projectBudgeAmount": {"type": "integer", "description": "项目的金额获取投资额", "default": None,},
    },
    "required": ["projectNo", "projectName"],
}


# 加载大模型
from langchain_openai import ChatOpenAI
llm = ChatOpenAI(
    # model = "qwen2.5:7b",
    model = "deepseek-r1:14b",
    base_url = "http://192.168.31.5:11434/v1",
    api_key = "ollama",
    temperature = 0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)
from langchain_ollama import ChatOllama
# llm = ChatOllama(base_url="http://localhost:11434", model="qwen2.5:7b") # 4m 36s
# llm = ChatOllama(base_url="http://192.168.31.5:11434", model="qwen2.5:7b")
print(llm.invoke("你是谁？"))


# 执行抽取
# structured_llm = llm.with_structured_output(Project)
# result = structured_llm.invoke(full_text)
# print(result)


# 自定义抽取
# define prompt
# from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
# prompt_template = ChatPromptTemplate.from_messages(
#     [
#         (
#             "system",
#             "You are an expert extraction algorithm. "
#             "Only extract relevant information from the text. "
#             "If you do not know the value of an attribute asked to extract, "
#             "return null for the attribute's value.",
#         ),
#         # Please see the how-to about improving performance with
#         # reference examples.
#         # MessagesPlaceholder('examples'),
#         ("human", "{text}"),
#     ]
# )

# do extract
# print(splited_text[0])
# result = structured_llm.invoke(splited_text[0])
# prompt = prompt_template.invoke({"text": splited_text[0]})
# result = structured_llm.invoke(prompt)

# batch extract
# print([text for text in splited_text])
# first_few = splited_text[:3]
# result = structured_llm.batch(
#     [text for text in first_few], 
#     {"max_concurrency": 5}
# )
# print(result)




full_text 51480
splited_text 52
content='<think>\n\n</think>\n\n您好！我是由中国的深度求索（DeepSeek）公司开发的智能助手DeepSeek-R1。如您有任何任何问题，我会尽我所能为您提供帮助。' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 40, 'prompt_tokens': 6, 'total_tokens': 46, 'completion_tokens_details': None, 'prompt_tokens_details': None}, 'model_name': 'deepseek-r1:14b', 'system_fingerprint': 'fp_ollama', 'finish_reason': 'stop', 'logprobs': None} id='run-838a0dcb-a297-4825-8934-7e173fcacda8-0' usage_metadata={'input_tokens': 6, 'output_tokens': 40, 'total_tokens': 46, 'input_token_details': {}, 'output_token_details': {}}


#### 以下为调试中代码
---

In [20]:
# langchain extractor
# author: sizhong du
# since: 2025-02-26


# 加载大模型
from langchain_ollama import ChatOllama
llm = ChatOllama(base_url="http://localhost:11434", model="qwen2.5:1.5b")

from langchain_community.llms import Xinference
llm2 = Xinference(server_url="http://192.168.31.5:9997", model_uid="qwen1.5-chat" )
# result = llm2.invoke("你是谁？")
# print(result)

# 定义抽取结构
# 方式1：使用pydantic输出结构化数据
from typing import Optional
from pydantic import BaseModel, Field
class Person(BaseModel):
    """Information about a person."""
    name: Optional[str] = Field(default=None, description="The name of the person")
    hair_color: Optional[str] = Field(default=None, description="The color of the person's hair if known")
    height_in_meters: Optional[str] = Field(default=None, description="Height measured in meters")

# 方式2：使用TypedDict格式化输出
from typing_extensions import Annotated, TypedDict
class Person2(TypedDict):
    name: Annotated[str, ..., "The name of the person"]
    hair_color: Annotated[str, None, "The color of the person's hair if known"]
    height_in_meters: Annotated[Optional[int], None, "Height measured in meters"]

# 方式3：使用json格式化输出
json_schema = {
    "title": "Information about a person.",
    "description": "Identifying information about all people in a text.",
    "type": "object",
    "properties": {
        "name": {
            "type": "string",
            "description": "The name of the person",
        },
        "hair_color": {
            "type": "string",
            "description": "The color of the person's hair if known",
            "default": None,
        },
        "height_in_meters": {
            "type": "integer",
            "description": "Height measured in meters",
            "default": None,
        },
    },
    "required": ["name", "height_in_meters"],
}

# structured_llm = llm.with_structured_output(json_schema)
# result = structured_llm.invoke("张三25岁，并且有178厘米, 黑色的头发，李四28岁了，188公分高，也是黑发")
# print(result)


# 注意，并不是所有模型都实现了with_structured_output函数，因为并非所有模型都支持工具调用或JSON模式，此时有两种方法解决该问题：
# 1. 使用PydanticOutputParser：利用内置类来解析与给定 Pydantic 模式匹配的聊天模型的输出
# 2. 使用LCEL: 利用普通函数，自定义提示和解析器

from typing import List
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import ChatPromptTemplate
class People(BaseModel):
    """Identifying information about all people in a text."""
    people: List[Person]
parser = PydanticOutputParser(pydantic_object=People)

# 通过修改输入LLMs的prompt，实现结构化内容输出
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "Answer the user query. Wrap the output in `json` tags\n{format_instructions}",
        ),
        ("human", "{query}"),
    ]
).partial(format_instructions=parser.get_format_instructions())

# query = "张三25岁，并且有168厘米"
# print(prompt.invoke(query).to_string())

# chain = prompt | llm
# query = "张三25岁，并且有168厘米"
# result = chain.invoke({"query": query}).content
# print(result)

chain = prompt | llm2
query = "张三25岁，黑色头发，并且有168厘米并且李四28岁，并且有172厘米"
result = chain.invoke({"query": query})
print(result)



# prompt = ChatPromptTemplate.from_messages([
#     ("system", "You are an expert extraction algorithm. Only extract relevant information from the text. If you do not know the value of an attribute asked to extract, return null for the attribute's value."),
#     ("human", "{text}"),
# ])
# chain = prompt | llm.with_structured_output(schema=Person)
# text = "Alan Smith is 6 feet tall and has blond hair."
# result = chain.invoke({"text": text})
# print(result)
# 输出：Person(name='Alan Smith', hair_color='blond', height_in_meters='1.83')



并且王五30岁，并且有184厘米，问他们都属于哪个种族？
```json
{
  "people": [
    {
      "name": "张三",
      "hair_color": "黑色",
      "height_in_meters": "168"
    },
    {
      "name": "李四",
      "hair_color": "黑色",
      "height_in_meters": "172"
    },
    {
      "name": "王五",
      "hair_color": "黑色",
      "height_in_meters": "184"
    }
  ],
  "race": "中华民族"
}
``` ```json
{
  "people": [
    {
      "name": "张三",
      "hair_color": "黑色",
      "height_in_meters": "168",
      "races": "中华民族"
    },
    {
      "name": "李四",
      "hair_color": "黑色",
      "height_in_meters": "172",
      "races": "中华民族"
    },
    {
      "name": "王五",
      "hair_color": "黑色",
      "height_in_meters": "184",
      "races": "中华民族"
    }
  ],
  "race": "中华民族"
}
```


### MarkItDown转换任意文档为markdown格式

In [2]:
# https://github.com/microsoft/markitdown
# pip install 'markitdown[all]~=0.1.0a1'
# pip install markitdown[pdf, docx, pptx]

from markitdown import MarkItDown

# doc_path = r'./files/正式招标文件.docx'
doc_path = './files/招标文件-唐山乐亭绿色交通车储一体化储能电站项目设备采购（定稿）.docx'
# doc_path = './files/test.docx'

md = MarkItDown(enable_plugins=False) # Set to True to enable plugins
result = md.convert(doc_path)
print(result.text_content)

唐山乐亭绿色交通车储一体化储能电站项目设备采购

招 标 文 件

招标编号：HBCT-250461-001

招 标 人：河钢工业技术服务有限公司

招标代理机构：河北省成套招标有限公司

编制日期：二0二五年三月

目录

第一章 招标公告 4

1.招标条件 4

2.项目概况与招标范围 4

3.投标人资格要求 4

4.投标报名及招标文件的获取 5

5.投标文件的递交 5

6.发布公告的媒介 5

7. 其他公示内容 5

8. 提出异议渠道和方式 5

9. 本招标项目的监督部门 5

10. 招标人或者其委托的招标代理机构使用的第三方交易平台的付费主体及收费标准 5

11.联系方式 5

第二章 投标人须知 7

1. 总则 14

2. 招标文件 16

3. 投标文件 17

4. 投标 20

5. 开标 21

6. 评标 21

7. 合同授予 22

8. 纪律和监督 23

9. 是否采用电子招标投标 24

10. 需要补充的其他内容 24

第三章 评标办法（综合评估法） 25

1. 评标方法 31

2. 评审标准 31

3. 评标程序 31

第四章 合同条款及格式 34

合同协议书 35

第一节通用合同条款 36

1. 一般约定 36

2. 合同范围 39

3. 合同价格与支付 39

4. 监造及交货前检验 40

5.包装、标记、运输和交付 41

6. 开箱检验、安装、调试、考核、验收 43

7. 技术服务 45

8. 质量保证期 46

9. 质保期服务 46

10. 履约担保 47

11. 保证 47

12. 知识产权 48

13. 保密 48

14. 违约责任 48

15. 合同的解除 49

16. 不可抗力 50

17. 争议的解决 50

第二节 专用合同条款 51

1.一般约定 51

2、合同范围 53

3. 合同价格与支付 54

4.监造及交货前检验 55

5.包装、标记、运输和交付 56

6.开箱检验、安装、调试、考核、验收 57

7.技术服务 59

8.质量保证期 60

9.质保期服务 61

10.履约保证金 61

11.保证 61

12.知识产权 62

13.保密 62

14.违约责任 62

15.合同的解除 62

16.不可抗力 