In [0]:
# Core dependencies: .env loading, the LlamaIndex framework and the OpenAI client.
# NOTE(review): versions are unpinned, so a fresh install may resolve to different releases.
%pip install -qqq python-dotenv llama-index openai
# OpenAI embedding integration for LlamaIndex (imported later in this notebook),
# plus torch and transformers.
%pip install -qqq llama-index-embeddings-openai torch transformers
# Chroma vector-store integration, plus docx2txt, python-pptx and Pillow
# (presumably needed by SimpleDirectoryReader for Word/PowerPoint/image files — confirm).
%pip install -qqq llama-index-vector-stores-chroma docx2txt python-pptx Pillow

In [0]:
# Restart the Python process so the packages installed in the previous cell are importable.
# NOTE(review): %restart_python looks like a Databricks notebook magic — confirm the target runtime.
%restart_python

In [0]:
import warnings
from dotenv import load_dotenv
import mlflow
# Install a global "ignore" filter so no Python warnings are shown from here on.
warnings.filterwarnings("ignore")
# Turn MLflow autologging off for this session.
mlflow.autolog(disable=True)

In [0]:
# Directory whose files are turned into query tools further down in this notebook.
RAG_FILES_DIRECTORY= "/Volumes/development/fiaa_qa/rag_data/sample"
# Location intended for the on-disk Chroma database.
CHRONOS_DB = "./chronos_db"
# Load variables from a .env file into the process environment; the return value is discarded.
# presumably this is where the OpenAI credentials come from — confirm
_ = load_dotenv()

In [0]:
import nest_asyncio
# Patch asyncio so an event loop can be re-entered from inside the notebook's own loop.
# The summary query engine built later is created with use_async=True.
nest_asyncio.apply()

In [0]:
# Configure the embedding model and the chat LLM, and register BOTH as the
# LlamaIndex global defaults.
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings
from llama_index.llms.openai import OpenAI

embed_model = OpenAIEmbedding(model="text-embedding-ada-002")
Settings.embed_model = embed_model

llm = OpenAI(model='gpt-3.5-turbo')
# Without this assignment `Settings.llm` is resolved lazily to the library
# default, so components that do not receive `llm` explicitly (e.g. the
# summary query engine) would not use the model configured here, and the
# print below would not describe `llm` at all.
Settings.llm = llm

print(f"Current embedding model: {Settings.embed_model}")
print(f"Current chat model: {Settings.llm.model}")

In [0]:
import chromadb
import pickle
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext

# Use the CHRONOS_DB constant from the configuration cell instead of repeating
# the path literal, and define `persist_directory` before it is needed.
persist_directory = CHRONOS_DB

# Persistent (on-disk) Chroma client and the collection that backs the vector store.
db = chromadb.PersistentClient(path=persist_directory)
chroma_collection = db.get_or_create_collection("otc_fds")

# Wrap the collection for LlamaIndex; `storage_context` is shared by the
# indexes built in the following cells.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [0]:
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, SummaryIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.tools import QueryEngineTool, FunctionTool
from llama_index.core.vector_stores import MetadataFilters, FilterCondition
from typing import List, Optional
import re, pickle

def sanitize_name(name: str) -> str:
    '''Return ``name`` with every character outside ``a-z A-Z 0-9 _ -`` replaced by ``_``.'''
    disallowed = re.compile(r'[^a-zA-Z0-9_-]')
    return disallowed.sub('_', name)

def get_doc_tools(
    file_path: str,
    name: str
) -> tuple:
    '''Build a vector-query tool and a summary tool for a single document.

    The file is loaded, split into chunks of up to 1024 tokens, and indexed
    twice using the module-level ``storage_context``: once as a
    ``VectorStoreIndex`` and once as a ``SummaryIndex``.

    Args:
        file_path: Path of the document to load (a ``str`` or a ``pathlib.Path``;
            it is passed straight to ``SimpleDirectoryReader(input_files=...)``).
        name: Label for the document. It is sanitized with ``sanitize_name``
            and used as the suffix of both tool names.

    Returns:
        A ``(vector_query_tool, summary_tool)`` tuple: a ``FunctionTool``
        wrapping the page-filterable vector search and a ``QueryEngineTool``
        wrapping the tree-summarize engine.

    Side effects:
        Writes ``vector_index.pkl`` and ``summary_index.pkl`` to the current
        working directory.
    '''
    documents = SimpleDirectoryReader(input_files=[file_path]).load_data()
    splitter = SentenceSplitter(chunk_size=1024)
    nodes = splitter.get_nodes_from_documents(documents)
    vector_index = VectorStoreIndex(nodes, storage_context=storage_context)
    name = sanitize_name(name)

    # The parameter must be called `page_numbers`: the body and the docstring
    # both use that name (the previous `page_number` spelling made every call
    # fail with UnboundLocalError).
    # NOTE: FunctionTool.from_defaults is given no explicit description, so this
    # docstring is presumably what the LLM sees as the tool description; its
    # wording is kept unchanged on purpose.
    def vector_query(query:str,
                     page_numbers:Optional[List[str]] = None
                     )->str:
        """Use to answer questions over a given paper.

        Useful if you have specific questions over the paper.
        Always leave page_numbers as None UNLESS there is a specific page you want to search for.

        Args:
            query (str): the string query to be embedded.
            page_numbers (Optional[List[str]]): Filter by set of pages. Leave as NONE 
                if we want to perform a vector search
                over all pages. Otherwise, filter by the set of specified pages.

        """
        # Only build a metadata filter when pages were requested. With no pages
        # the search must cover the whole document, so no filter is passed
        # (a `page_label == None` filter would match nothing useful).
        filters = None
        if page_numbers:
            filters = MetadataFilters.from_dicts(
                [{"key": "page_label", "value": page} for page in page_numbers],
                condition=FilterCondition.OR,
            )
        query_engine = vector_index.as_query_engine(
            similarity_top_k=2,
            filters=filters,
        )
        response = query_engine.query(query)
        return response

    vector_query_tool = FunctionTool.from_defaults(
        name=f'vector_tool_{name}',
        fn=vector_query,
    )

    summary_index = SummaryIndex(nodes, storage_context=storage_context)
    summary_query_engine = summary_index.as_query_engine(
        response_mode="tree_summarize",
        use_async=True
    )
    summary_tool = QueryEngineTool.from_defaults(
        name=f'summary_tool_{name}',
        query_engine=summary_query_engine,
        description=(f'Useful for summarization questions related to {name}')
        )

    # NOTE(review): the file names below are fixed, so each call overwrites the
    # pickles written for the previous document — confirm whether per-document
    # files are wanted.
    # Save VectorStoreIndex to a file
    with open('vector_index.pkl', 'wb') as f:
        pickle.dump(vector_index, f)

    # Save SummaryIndex to a file
    with open('summary_index.pkl', 'wb') as f:
        pickle.dump(summary_index, f)

    return vector_query_tool, summary_tool


In [0]:
def create_tools_from_directory(directory_path: str):
    '''Create the vector and summary tools for every file directly inside a directory.

    Args:
        directory_path: Directory to scan (non-recursive).

    Returns:
        A flat list ``[vector_tool, summary_tool, ...]`` with two tools per
        file, in sorted path order. Empty when the directory has no files.
    '''
    all_tools = []
    # sorted() makes the tool order reproducible across runs and filesystems.
    for file_path in sorted(Path(directory_path).glob("*")):
        # glob("*") also yields sub-directories; those cannot be loaded as a
        # single input file, so skip them instead of failing mid-run.
        if not file_path.is_file():
            continue
        name = file_path.stem
        print (f'Getting tools for paper : {name}')
        vector_tool, summary_tool = get_doc_tools(file_path, name)
        all_tools.extend([vector_tool, summary_tool])
    return all_tools


In [0]:
# Path is used inside create_tools_from_directory, so this import has to run before the call below.
from pathlib  import Path
# Build the vector and summary tools for the documents under RAG_FILES_DIRECTORY.
directory = RAG_FILES_DIRECTORY
tools_list = create_tools_from_directory(directory)
print (tools_list)

In [0]:
# Print the tool list again (same content as printed when tools_list was built).
print (tools_list)

In [0]:
from llama_index.core import VectorStoreIndex
from llama_index.core.objects import ObjectIndex

# Index the tools themselves so that tools can later be retrieved per query.
# NOTE(review): this passes the same Chroma-backed storage_context that get_doc_tools
# uses for document chunks — confirm that tool entries and document chunks are meant
# to share one collection.
obj_index = ObjectIndex.from_objects(
    tools_list,
    index_cls=VectorStoreIndex,
    storage_context=storage_context
    )
# Save ObjectIndex to a file
# NOTE(review): the vector tools wrap a function defined inside get_doc_tools;
# verify that pickling this object actually succeeds.
with open('obj_index.pkl', 'wb') as f:
    pickle.dump(obj_index, f)  

In [0]:
# Display the ObjectIndex (it is created in an earlier cell; this only prints it).
print (obj_index)

In [0]:
# Retriever over the tool index; returns up to 5 entries per query (similarity_top_k=5).
# The variable name is spelled "retreiver" and is referenced under that spelling elsewhere.
obj_retreiver = obj_index.as_retriever(similarity_top_k=5)

In [0]:
# Sanity check: retrieve the tools matched for a sample question.
tools = obj_retreiver.retrieve("Tell me about FOH")


In [0]:
# Show the metadata of the second retrieved tool (index 1).
# assumes the retrieval above returned at least two tools — otherwise this raises IndexError
tools[1].metadata

In [0]:
from llama_index.core.agent import FunctionCallingAgentWorker, AgentRunner

# Build a function-calling agent worker. Tools are not passed as a fixed list;
# they come from obj_retreiver via tool_retriever.
# The system prompt tells the agent to always answer through the provided tools
# rather than from prior knowledge. verbose=True is set on the worker.
agent_worker = FunctionCallingAgentWorker.from_tools(
    llm=llm,
    tool_retriever=obj_retreiver,
    system_prompt=""" \
You are an agent designed to answer queries over a set of given papers.
Please always use the tools provided to answer a question. Do not rely on prior knowledge.\

""",
verbose=True)
# Wrap the worker in a runner, which is the object queried in the following cell.
agent = AgentRunner(agent_worker)

In [0]:
# Ask the agent a sample question; the result is kept in `response`.
response = agent.query(
    "Tell me about FOH"
)

In [0]:
# Print the agent's answer as plain text.
print (str(response))