In [31]:
import warnings
warnings.filterwarnings('ignore')
import os, sys
src_path = os.path.join(os.path.abspath('..'), 'src')
if src_path not in sys.path:
    sys.path.append(src_path)

from llama_index.llms.openai import OpenAI
from local_models.embeddings import get_embed_model
from llama_index.core import ServiceContext

from data_loader.splitting import split_by_md_headers, text_2_Document
from data_loader.parsing import MDDF
from data_loader.chunking import chunk_docs_standalone
from data_loader.load_from_dir import rebuild_index

#from llama_index.core import Document
import re
import pandas as pd

from typing import Dict, List
from dotenv import load_dotenv

In [32]:
from llama_index.core import VectorStoreIndex, SummaryIndex
from llama_index.core import PromptTemplate
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.schema import IndexNode

from llama_index.agent.openai import OpenAIAgent
#from llama_index.core.agent import ReActAgent

from llama_index.core.retrievers import RecursiveRetriever
from llama_index.core.response_synthesizers import get_response_synthesizer
from llama_index.core.query_engine import RetrieverQueryEngine
from utils import load_prompt

In [33]:
# Load settings from a .env file into the process environment before any model is built;
# the embedding setup in the next section reads `embed_path` from os.environ.
load_dotenv(override=True)

True

## LLM and Embeddings

In [34]:
# Embedding model: location taken from the `embed_path` environment variable,
# placed on CPU, with `normalize_embeddings` switched on.
embed_model = get_embed_model(
    model_name=os.environ['embed_path'],
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': True},
)

# Chat model constructed with the OpenAI wrapper's defaults.
llm = OpenAI()

# Bundle of both models; passed to rebuild_index when persisted indexes are loaded.
service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)


## Load data as a DataFrame and apply chunking to each content cell

In [35]:
#format md as DF
df = split_by_md_headers('../data/第三部分.md')
#construct node mappings
key_words, docs = MDDF(df, [1]).construct_node_mappings(show_progress=False)#remove useless contents

In [36]:
def get_embeddings_from_docs():
    """Build and persist a vector index and a summary index for every keyword.

    Reads the notebook-level ``key_words``, ``docs`` and ``embed_model``.
    For the keyword at position ``i`` the vector index is written to
    ``../db_stores/doc_agent_vector_index/idx_{i}`` and the summary index to
    ``../db_stores/doc_agent_summary_index/idx_{i}``. Nothing is returned.
    """

    def _persist(index, persist_dir):
        # Write the index's storage context to its own directory.
        index.storage_context.persist(persist_dir=persist_dir)

    for i, keyword in enumerate(key_words):
        _persist(
            VectorStoreIndex(docs[keyword], embed_model=embed_model),
            f"../db_stores/doc_agent_vector_index/idx_{i}",
        )
        _persist(
            SummaryIndex(docs[keyword], embed_model=embed_model),
            f"../db_stores/doc_agent_summary_index/idx_{i}",
        )
        

In [37]:
# Build and persist the per-keyword indexes under ../db_stores/; build_doc_agent_engine
# later loads them from the same idx_{i} directories.
get_embeddings_from_docs()

### Build Query Engine (Document Agents)

In [52]:
def build_doc_agent_engine(key_words, docs=None, similarity_top_k=None):
    """Build a top-level query engine that routes questions to per-document agents.

    For the keyword at position ``i`` the persisted indexes in
    ``../db_stores/doc_agent_vector_index/idx_{i}`` and
    ``../db_stores/doc_agent_summary_index/idx_{i}`` are loaded through
    ``rebuild_index``, exposed as two query-engine tools (``vector_tool`` and
    ``summary_tool``) and given to an ``OpenAIAgent``. Each agent is attached
    to an ``IndexNode`` whose text describes the keyword; the nodes are indexed
    in a ``VectorStoreIndex`` and that index's query engine is returned.

    Relies on the notebook-level ``llm``, ``embed_model`` and ``service_context``.

    Args:
        key_words: Iterable of document keywords. The position of a keyword
            selects its ``idx_{i}`` directories, so the order has to match the
            order in which the indexes were persisted.
        docs: Unused. Kept so existing call sites stay valid.
        similarity_top_k: How many agent nodes the top-level retriever should
            fetch per query. When ``None`` the argument is not forwarded, so
            the library default applies instead of an explicit ``None``.

    Returns:
        The query engine built on the top-level index of agent nodes.
    """
    nodes = []
    for i, kw in enumerate(key_words):
        # Load the indexes persisted for this keyword.
        vector_index = rebuild_index(
            persist_dir=f'../db_stores/doc_agent_vector_index/idx_{i}',
            service_context=service_context,
        )
        summary_index = rebuild_index(
            persist_dir=f'../db_stores/doc_agent_summary_index/idx_{i}',
            service_context=service_context,
        )

        # One query engine per index; both answer with the shared LLM.
        vector_query_engine = vector_index.as_query_engine(llm=llm)
        list_query_engine = summary_index.as_query_engine(llm=llm)

        # Tools offered to this document's agent.
        query_engine_tools = [
            QueryEngineTool(
                query_engine=vector_query_engine,
                metadata=ToolMetadata(
                    name="vector_tool",
                    description=(
                        f"Useful for retrieving specific context from {kw}"
                    )
                )
            ),
            QueryEngineTool(
                query_engine=list_query_engine,
                metadata=ToolMetadata(
                    name="summary_tool",
                    description=(
                        f"Useful for summarization-wise questions related to {kw}"
                    )
                )
            )
        ]

        # Document agent for this keyword. The prompt text (including its
        # indentation) is sent to the model as written.
        agent = OpenAIAgent.from_tools(
            query_engine_tools,
            llm=llm,
            embed_model=embed_model,
            verbose=True,
            system_prompt=f"""\
                Make sure to respond in Chinese.

                You must ALWAYS use at least one of the tools provided when answering a question; do NOT rely on prior knowledge.
                Please refer to the summary_tool first if you inquire summarization-wise questions about {kw}
                If you need to fetch details about {kw}, please refer to the vector_tool first.

                Respond only in Chinese.

                """,
        )

        # Text embedded by the top-level index to decide which agent to call.
        # Every fragment that mentions {kw} must be an f-string, and adjacent
        # fragments need explicit separators because they are concatenated.
        instru = (
            "This content contains some introduction to Electric Power Safety Work Procedures of China Southern Power Grid Co., Ltd., Part 3: Special Operations Content "
            f"on the following aspect {kw}, "
            f"Use this index if you need to look up specific facts about {kw}, "
            f"Do not use this index if you want to analyze aspects beyond {kw}. "
            "Respond only in Chinese."
        )
        nodes.append(IndexNode(text=instru, index_id=kw, obj=agent))

    # Top-level retriever over the agent nodes.
    top_vector_index = VectorStoreIndex(objects=nodes, embed_model=embed_model)
    engine_kwargs = {"verbose": True}
    if similarity_top_k is not None:
        # Forward only an explicit value so None does not override the default.
        engine_kwargs["similarity_top_k"] = similarity_top_k
    query_engine = top_vector_index.as_query_engine(**engine_kwargs)
    return query_engine


In [53]:
# Top-level query engine over the document agents; similarity_top_k=2 is forwarded
# to the top-level index's as_query_engine call.
t = build_doc_agent_engine(key_words, similarity_top_k=2)

In [54]:
# Sample question: precautions when measuring the current of low-voltage fuses
# and horizontally arranged low-voltage busbars.
t.query("测量低压熔断器和水平排列低压母线电流时有什么注意事项")
# Alternative test query: "动火区域分为几级？哪些情况下禁止动火作业"
# (How many levels are hot-work areas divided into? In which situations is hot work prohibited?)

[1;3;38;2;11;159;203mRetrieval entering 中国南方电网有限责任公司电力安全工作规程, 第 3 部分 专项作业, 电气测量作业, 使用携带型仪器测量: OpenAIAgent
[0m[1;3;38;2;237;90;200mRetrieving from object OpenAIAgent with query 测量低压熔断器和水平排列低压母线电流时有什么注意事项
[0mAdded user message to memory: 测量低压熔断器和水平排列低压母线电流时有什么注意事项
=== Calling Function ===
Calling function: vector_tool with args: {"input":"测量低压熔断器和水平排列低压母线电流时有什么注意事项"}

[1;3;38;2;11;159;203mRetrieval entering 中国南方电网有限责任公司电力安全工作规程, 第 3 部分 专项作业, 电气测量作业, 高压线路使用接地电阻表测量: OpenAIAgent
[0m[1;3;38;2;237;90;200mRetrieving from object OpenAIAgent with query 测量低压熔断器和水平排列低压母线电流时有什么注意事项
[0mAdded user message to memory: 测量低压熔断器和水平排列低压母线电流时有什么注意事项
=== Calling Function ===
Calling function: vector_tool with args: {"input":"测量低压熔断器和水平排列低压母线电流时有什么注意事项"}
Got output: When measuring low-voltage fuses and horizontal low-voltage busbar currents, it is important to wear insulated gloves. Avoid direct contact with the grounding leads that are disconnected from the ground potential.



Response(response='重要的注意事项包括确保所有测量都在电流互感器的次级侧进行，使用适合所测电流数值的导线截面，佩戴绝缘手套，避免直接接触与接地电位断开连接的接地引线，遵循适当的安全协议如使用绝缘工具、戴安全护目镜、站在绝缘垫上，并保持与带电部件的必要安全距离。', source_nodes=[NodeWithScore(node=TextNode(id_='c1ff7d03-bd0e-47b2-a805-7a9e11e2b97c', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='在测量低压熔断器和水平排列低压母线电流时，重要的是确保所有测量都在电流互感器的次级侧进行。此外，用于连接电流回路的导线截面应适合所测电流数值。必须遵循适当的安全协议，如使用绝缘工具、戴绝缘手套和安全护目镜、站在绝缘垫上，并有专人监督工作。记得保持与表1中规定的带电部件的必要安全距离，并在设置测量设备时使用适当的警示标志和屏障。', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'), score=0.7669828411637836), NodeWithScore(node=TextNode(id_='2cf77b11-a2cc-4421-a484-8659fb8d77b0', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='在测量低压熔断器和水平排列低压母线电流时，重要的注意事项包括佩戴绝缘手套，避免直接接触与接地电位断开连接的接地引线。', start_char_idx=None, end_char_idx=None, text_template='{metadata