In [19]:
# langchain rag
# author: sizhong du
# since: 2025-03-03


doc_path = './files/招标文件-唐山乐亭绿色交通车储一体化储能电站项目设备采购（定稿）.docx'

# load docx
# pip install python-docx
from docx import Document as WordDocument
def load_docx(doc_path):
    doc = WordDocument(doc_path)
    tables = doc.tables
    table_idx = 0
    full_text = []
    for element in doc.element.body:
        if element.tag.endswith('p'):
            para_text = element.text.strip()
            full_text.append(para_text)
        elif element.tag.endswith('tbl'):
            if table_idx < len(tables):
                for row in tables[table_idx].rows:
                    row_content = [cell.text.strip() if cell.text else '' for cell in row.cells]
                    row_text = '\t'.join(row_content)
                    full_text.append(row_text)
                    table_idx += 1
    return '\n'.join(full_text)
    # return full_text
full_text = load_docx(doc_path)
print('full_text', len(full_text))


# split text
from langchain.text_splitter import RecursiveCharacterTextSplitter
def split_text(text):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000, # 根据GPU显存调整
        chunk_overlap=200, # 每段拆分内容重复保留200字，以保持上下文语意连续性
        length_function=len,
        separators=["\n\n## ", "\n\n", "\n", " "] # 保留标题结构
    )
    splited_text = text_splitter.split_text(text)
    return splited_text
splited_text = split_text(full_text)
print('splited_text', len(splited_text))


# embedding
# pip install xinference_client
from langchain_community.embeddings import XinferenceEmbeddings
embeddings = XinferenceEmbeddings(
    server_url="http://192.168.31.5:9997",
    model_uid="bce-embedding-base_v1",
)

# vector store
from langchain_core.vectorstores import InMemoryVectorStore
vector_store = InMemoryVectorStore.from_texts(
    splited_text,
    embedding=embeddings,
)

# retrieve检索
retriever = vector_store.as_retriever(
    # search_type="mmr",
    # search_kwargs={"k": 1, "fetch_k": 2, "lambda_mult": 0.5},
)
retrieved_text = retriever.invoke("投标截止时间")
print('retrieved_text', len(retrieved_text))

# RAG
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

template = """基于以下上下文：
{context}

问题：{question}
"""
prompt = ChatPromptTemplate.from_template(template)

# pip install xinference_client
from langchain_community.llms import Xinference
llm = Xinference(
    server_url="http://192.168.31.5:9997",
    model_uid="qwen1.5-chat",
    stream=True
)

chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

response = chain.invoke("投标截止时间是多少？")
print(response)






full_text 51480
splited_text 69
retrieved_text 4
答案：

投标截止时间是2025年3月28日09时00分（北京时间）。
