In [1]:
import warnings
# Suppress every warning category so library warnings do not clutter cell output.
warnings.filterwarnings('ignore')
import sys
# Add "src" to the import path before the project imports below
# (local_models, data_loader, utils) — presumably these packages live under src/.
sys.path.append("src")

#from src.local_models.llm import MyLLM, LlamaCPPLLM
from llama_index.llms.openai import OpenAI
from local_models.embeddings import get_embed_model

from data_loader.splitting import split_by_md_headers
from data_loader.parsing import get_html, extract_md_tables

from llama_index.core import Document
from data_loader.chunking import chunk_docs_standalone
from llama_index.core import VectorStoreIndex
from llama_index.core import PromptTemplate

from data_loader.load_from_dir import rebuild_index
from utils import load_prompt
from llama_index.core import PromptTemplate


from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.agent.openai import OpenAIAgent


from dotenv import load_dotenv
import os

In [4]:
# Load variables from the .env file into the process environment; override=True lets
# values from .env replace variables that are already set. The returned bool is the cell output.
load_dotenv(override=True)

True

In [6]:
# Embedding model: location is read from the `embed_path` environment variable,
# inference is pinned to CPU, and normalized embeddings are requested.
# NOTE(review): embed_model is not referenced by any later cell visible here —
# confirm how (or whether) the indexes pick it up.
embed_model = get_embed_model(
    model_name=os.environ['embed_path'],
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': True},
)
#llm = MyLLM(pretrained_model_name_or_path=os.environ['llm_path'], device_map="mps", context_window=4096, num_output=512, model_name='chatglm3-6b')
# OpenAI LLM client constructed with the library's default arguments.
llm = OpenAI()

In [None]:
# First-time build, kept inside a bare string literal so the cell is a no-op when run.
# The disabled code splits the whitepaper markdown by headers, separates table sections
# from text sections, and builds one VectorStoreIndex for each; the persist calls are
# additionally commented out inside the string.
# NOTE(review): the text_nodes comprehension keeps only element [0] of each
# chunk_docs_standalone(...) result — confirm later chunks are not being dropped.
"""
df = split_by_md_headers('data/产品白皮书/KDP-WhitePaper.md')
df['text_html'] = df['content'].apply(lambda x: get_html(x))
df = extract_md_tables(df)

table_docs = [Document(text=tbl, metadata={}) for tbl in df[df['is_table']==1]['table'].values]

text_docs = [Document(text=tbl, metadata={}) for tbl in df[df['is_table']==0]['text'].values]
text_nodes = [chunk_docs_standalone(text_docs[i], chunk_size=512, chunk_overlap=50)[0] for i in range(len(text_docs))]

table_index = VectorStoreIndex.from_documents(table_docs)
text_index = VectorStoreIndex(text_nodes)

#persist to disk
#table_index.storage_context.persist(persist_dir="db_stores/table_index")
#text_index.storage_context.persist(persist_dir='db_stores/text_index')
"""

In [7]:
# Read the whitepaper QA prompt text from the prompt bank and wrap it in a
# PromptTemplate so it can be installed on a query engine.
prompt_str = load_prompt('prompt_bank/whitepaper.txt')
new_vector_tmpl = PromptTemplate(template=prompt_str)

In [8]:
# Restore the table index and the text index from their directories under db_stores/
# (table first, then text).
table_index, text_index = (
    rebuild_index("db_stores/table_index"),
    rebuild_index("db_stores/text_index"),
)


In [9]:
# One query engine per index, each created with default arguments.
text_engine = text_index.as_query_engine()
table_engine = table_index.as_query_engine()

# Only the text engine has its text-QA template replaced with the prompt loaded
# from the prompt bank; the table engine keeps whatever template it was created with.
text_engine.update_prompts(
    {'response_synthesizer:text_qa_template': new_vector_tmpl}
)

In [10]:
# define tools
# Wrap each query engine as a named tool for the agent. The description is the text
# handed to the LLM alongside the tool name, so it is kept free of typos
# ("retriving" -> "retrieving").
query_engine_tools = [
    QueryEngineTool(
        query_engine=table_engine,
        metadata=ToolMetadata(
            name="table_tool",
            description="Useful for retrieving specific context from tables",
        ),
    ),
    QueryEngineTool(
        query_engine=text_engine,
        metadata=ToolMetadata(
            name="text_tool",
            description="Useful for retrieving specific context from plain text",
        ),
    ),
]

In [11]:
# build agent
# - llm=llm wires in the OpenAI client constructed earlier in the notebook, so the
#   agent does not silently fall back to an implicit global default.
# - max_function_calls=3 and verbose=True are passed through unchanged.
# - The system prompt is a plain string: it contains no placeholders, so the
#   previous f-prefix was unnecessary. Its text is unchanged.
agent = OpenAIAgent.from_tools(
    query_engine_tools,
    llm=llm,
    max_function_calls=3,
    verbose=True,
    system_prompt="""\
        Respond in Chinese.
        
        If you inquire general-purpose questions, refer to the text_tool as a priority.
        For questions involving numbers and figures, refer to the table_tool first.
        If you need to analyze tech-savvy problems, which demands a higher degree of rigorous reasoning, such as step-by-step instructions, 
        please utilize both the text_tool and table_tool to synthesize your answer.
        
        You must ALWAYS use at least one of the tools provided when answering a question; do NOT rely on prior knowledge.\
        """,
)

In [15]:
# General-purpose question; in the recorded run below the agent called text_tool once.
agent.query('云原生相比传统大数据平台的优势在哪里')

Added user message to memory: 云原生相比传统大数据平台的优势在哪里
=== Calling Function ===
Calling function: text_tool with args: {"input":"云原生相比传统大数据平台的优势在哪里"}
Got output: 云原生相比传统大数据平台的优势在于可以显著提升运维效率，降低运维成本，解放技术团队的生产力。传统大数据平台因为技术扩展迭代流程比较慢，不能及时解决运维中碰到的性能瓶颈，而云原生技术以容器和Kubernetes为代表，能够更快速地部署、升级和管理大数据组件，提高运维效率，降低成本，让技术团队更专注于业务开发和数据价值的发现。



Response(response='云原生相比传统大数据平台的优势在于可以显著提升运维效率，降低运维成本，解放技术团队的生产力。传统大数据平台因为技术扩展迭代流程比较慢，不能及时解决运维中碰到的性能瓶颈，而云原生技术以容器和Kubernetes为代表，能够更快速地部署、升级和管理大数据组件，提高运维效率，降低成本，让技术团队更专注于业务开发和数据价值的发现。', source_nodes=[NodeWithScore(node=TextNode(id_='f0d61784-64ac-4a78-bc5f-dc7e720816c0', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='998da1a2-2895-4db2-827a-a6e40f06f238', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='a12c19a5dceb79caf9e922981d7145ee5f65f945205c0db59a6c08b70bb0428b')}, text='传统大数据平台因为技术扩展迭代流程比较慢，不能及时解决运维中碰到的性能瓶颈，同时大数据组件之间软件包依赖很复杂，导致组件升级困难，新的组件集成耗时费力。使用传统大数据平台的技术团队面对运维压力疲于奔命，没有精力专注于业务开发和数据价值的发现。传统大数据平台逐步迁移到云原生大数据平台后，可以显著提升运维效率，降低运维成本，解放技术团队的生产力\n。', start_char_idx=0, end_char_idx=173, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'), score=0.884591022601091), NodeWithScore(node=TextNode(id_='bfa90f68-f75d-4

In [16]:
# Question about a figure; in the recorded run below the agent called text_tool,
# which reported that the document could not answer it.
agent.query('kdp中hdfs的读写速度是多少')
#agent.chat('kdp中hdfs的读写速度如何')


Added user message to memory: kdp中hdfs的读写速度是多少
=== Calling Function ===
Calling function: text_tool with args: {"input":"kdp中hdfs的读写速度是多少"}
Got output: 根据文档内容无法回答该问题。



Response(response='根据文档内容，无法提供KDP中HDFS的读写速度信息。您可以参考KDP的官方文档或者性能测试报告来获取更详细的信息。', source_nodes=[NodeWithScore(node=TextNode(id_='b6e9eb80-3f22-4c29-894c-bffb18430510', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='eb8568f6-892c-4754-9d72-9f02d15668a3', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='4a6774007d6df8dd5542598a2f3968b644292796b3dafc020580e68041dcdc16'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='8acbd7f9-ef6e-47a5-abd9-9e14fa50fbcb', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='8f3e1b8362d5fb13b0383deea72263fbb9a060944eb70bffa795fda498b3991b')}, text='分布式文件系统HDFS是一个可以运行在普通硬件上的文件系统，它通过将文件分块保存到不同的节点并且每个块保存多个副本的方式来实现高容错性。HDFS存的每个数据块有3个备份，分布在不同的数据节点上，来实现高可用。KDP通过对开源的helm chart进行扩展将HDFS的非云原生特性进行了改造：  \n\n- 原来的基于本地硬盘的存储改造成了基于PV的云原生存储模式。\n- 将host网络改造成了pod的虚拟网络。\n- 实现了datanode的（在硬件环境支持的情况下）弹性扩容。  \n\n原本基于本地硬盘的存储方式被KDP改造成了基于PV的云原生存储模式。这一改

In [17]:
# Version lookup; in the recorded run below the agent called table_tool and got 3.1.1.
agent.query('kdp中hdfs的版本是多少')

Added user message to memory: kdp中hdfs的版本是多少
=== Calling Function ===
Calling function: table_tool with args: {"input":"KDP中HDFS的版本"}
Got output: 3.1.1



Response(response='KDP中HDFS的版本是3.1.1。', source_nodes=[NodeWithScore(node=TextNode(id_='9d947b6a-267a-4d13-9223-bb79fb6fe7d4', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='ddf132db-f2a5-431c-b29d-1042fc039c6d', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='54daf0932ee691d230bf064828f0ca9ab462a83f3c15001d3b9e724801cab10b'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='9160c668-2300-4e82-90f5-5a0e31e8544e', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='4a06b8f7474c879971bf5d03f0e86676eb279c84ff8fe94b96489361f7e28915')}, text='| 大数据组件 | 版本 |\n| --- | --- |\n| 分布式文件系统HDFS | 3.1.1 |\n| 分布式数据仓库Hive | 3.1.3 |\n| 分布式计算引擎Spark | 3.3.0 |\n| 分布式消息队列Kafka | 2.8.1 |\n| 分布式对象存储MinIO | RELEASE.2022-09-07 |\n| 批流一体计算引擎Flink | 1.14.6 |', start_char_idx=2, end_char_idx=193, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metad