<a href="https://colab.research.google.com/github/devanshu1204/Exp_Agentic-RAG/blob/main/Copy_of_Working_Agentic_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install python-dotenv==1.0.0
!pip install llama-index==0.10.27
!pip install llama-index-readers-file

In [2]:
# Allow nested asyncio event loops inside the notebook.
# NOTE(review): presumably required because the summary query engine further down
# is created with use_async=True — confirm before removing.
import nest_asyncio
nest_asyncio.apply()

In [3]:
import os
from google.colab import userdata
# Read the OpenAI key from Colab's user secrets and expose it through the
# environment, so the key itself is never written into the notebook.
os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')

In [4]:
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, SummaryIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.tools import FunctionTool, QueryEngineTool
from llama_index.core.vector_stores import MetadataFilters, FilterCondition
from llama_index.core import Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from typing import List, Optional
import numpy as np

### Chunking and creating Vector Embeddings

In [6]:
# load documents
documents = SimpleDirectoryReader(input_files=[
    "A_mon.pdf",
    "D_mon.pdf"
]).load_data()

# Split the loaded documents into nodes (chunk_size=1024).
splitter = SentenceSplitter(chunk_size=1024)
nodes = splitter.get_nodes_from_documents(documents)

# Fail fast: without nodes there is nothing to embed, and every later cell
# (persisting, tools, agent) would silently operate on an empty index.
if not nodes:
    raise ValueError("Failed to create nodes.")

print("Nodes created successfully.")
print(f"Total nodes created: {len(nodes)}")

# Print some nodes for debugging
print("Printing first 5 nodes:")
for i, node in enumerate(nodes[:5]):
    print(f"Node {i+1}: {node}")

print("starting vector indexing by creating embeddings using ada 002")

# Building the index computes and stores an embedding for every node.
vector_index = VectorStoreIndex(nodes)

print("finished vector indexing")

Nodes created successfully.
Total nodes created: 2
Printing first 5 nodes:
Node 1: Node ID: 61ceb9be-b6dd-4ccb-bc96-84507a1de1a2
Text: Anukalp’s monthly income is 25 thousand US $
Node 2: Node ID: 359df5e5-ee42-454e-9e6a-44087972464d
Text: Devanshu’s monthly income is 50 thousand US $
starting vector indexing by creating embeddings using ada 002
finished vector indexing


### Storing vector Embeddings

In [7]:
# Write the index to disk; the directory is spelled out so it visibly matches
# the persist_dir that the reload cell reads from.
vector_index.storage_context.persist(persist_dir="./storage")

In [9]:
from llama_index.core import StorageContext, load_index_from_storage

# Rebuild a storage context from the "./storage" directory and load the index from it.
# NOTE(review): presumably this is the directory the persist() cell above wrote to — confirm.
storage_context = StorageContext.from_defaults(persist_dir="./storage")
# NOTE(review): `index` is not referenced by any later cell in this notebook.
index = load_index_from_storage(storage_context=storage_context)

### Function for Creating Tools

In [10]:
def get_doc_tools(
    file_path: str,
    name: str,
) -> "tuple[FunctionTool, QueryEngineTool]":
    """Get vector query and summary query tools from a document.

    The document at ``file_path`` is loaded, split into nodes and indexed on
    its own, so the returned tools answer from that document only. (Previously
    ``file_path`` was ignored and the notebook-level ``vector_index`` / ``nodes``
    were used, which made the tools of every paper identical.)

    Args:
        file_path (str): path of the document to build the tools for.
        name (str): suffix used in the tool names (``vector_tool_<name>`` and
            ``summary_tool_<name>``) and in the summary tool's description.

    Returns:
        A ``(vector_query_tool, summary_tool)`` pair.

    Raises:
        ValueError: if no nodes could be created from ``file_path``.
    """
    # Load and chunk this one file. Every call builds a fresh vector index,
    # which means the file's nodes are embedded again on each call.
    documents = SimpleDirectoryReader(input_files=[file_path]).load_data()
    splitter = SentenceSplitter(chunk_size=1024)
    doc_nodes = splitter.get_nodes_from_documents(documents)
    if not doc_nodes:
        raise ValueError(f"No nodes could be created from {file_path!r}")

    doc_vector_index = VectorStoreIndex(doc_nodes)

    # NOTE: the signature and docstring of vector_query are handed to
    # FunctionTool.from_defaults below, so they are kept exactly as they were.
    def vector_query(
        query: str,
        page_numbers: Optional[List[str]] = None
    ) -> str:
        """Use to answer questions over a given paper.

        Useful if you have specific questions over the paper.
        Always leave page_numbers as None UNLESS there is a specific page you want to search for.

        Args:
            query (str): the string query to be embedded.
            page_numbers (Optional[List[str]]): Filter by set of pages. Leave as NONE
                if we want to perform a vector search
                over all pages. Otherwise, filter by the set of specified pages.

        """

        page_numbers = page_numbers or []
        metadata_dicts = [
            {"key": "page_label", "value": p} for p in page_numbers
        ]

        # Attach a page filter only when pages were actually requested; with no
        # pages an empty OR-filter would be passed, so search unfiltered instead.
        filters = (
            MetadataFilters.from_dicts(
                metadata_dicts,
                condition=FilterCondition.OR
            )
            if metadata_dicts
            else None
        )

        query_engine = doc_vector_index.as_query_engine(
            similarity_top_k=2,
            filters=filters,
        )
        # The response object (not a plain string) is returned, as before.
        return query_engine.query(query)

    vector_query_tool = FunctionTool.from_defaults(
        name=f"vector_tool_{name}",
        fn=vector_query
    )

    # Summary index over the same nodes, queried with tree summarization.
    summary_index = SummaryIndex(doc_nodes)
    summary_query_engine = summary_index.as_query_engine(
        response_mode="tree_summarize",
        use_async=True,
    )
    summary_tool = QueryEngineTool.from_defaults(
        name=f"summary_tool_{name}",
        query_engine=summary_query_engine,
        description=(
            f"Useful for summarization questions related to {name}"
        ),
    )

    return vector_query_tool, summary_tool

### Calling the Function for creating Tools

In [None]:
papers = ["A_mon.pdf", "D_mon.pdf"]

# Map each paper's file name to its [vector tool, summary tool] pair.
paper_to_tools_dict = {}
for paper in papers:
    print(f"Getting tools for paper: {paper}")
    # The tool-name suffix is the file name without its ".pdf" part.
    tool_suffix = paper.replace(".pdf", "")
    vector_tool, summary_tool = get_doc_tools(paper, tool_suffix)
    paper_to_tools_dict[paper] = [vector_tool, summary_tool]


In [20]:
# Flatten the per-paper tool lists into one list, keeping the order of `papers`.
initial_tools = []
for paper in papers:
    initial_tools.extend(paper_to_tools_dict[paper])

In [21]:
# Debug dump of the flattened tool list; this prints each tool object's default repr.
print(initial_tools)

[<llama_index.core.tools.function_tool.FunctionTool object at 0x7c95d7d3f4c0>, <llama_index.core.tools.query_engine.QueryEngineTool object at 0x7c95d4f24d00>, <llama_index.core.tools.function_tool.FunctionTool object at 0x7c95d4f265f0>, <llama_index.core.tools.query_engine.QueryEngineTool object at 0x7c95d4f25bd0>]


In [22]:
# NOTE(review): OpenAI is already imported in the imports cell at the top of the
# notebook; this re-import is redundant but harmless.
from llama_index.llms.openai import OpenAI
# LLM handed to the function-calling agent worker created further down.
llm = OpenAI(model="gpt-3.5-turbo")

In [23]:
# Sanity check: two tools (vector + summary) per paper, so 4 for the two papers.
len(initial_tools)

4

In [24]:
from llama_index.core.agent import AgentRunner, FunctionCallingAgentWorker

# The worker is given every tool plus the LLM; verbose=True is passed so the
# run is logged. The runner wraps the worker and is what the QnA cells query.
agent_worker = FunctionCallingAgentWorker.from_tools(initial_tools, llm=llm, verbose=True)
agent = AgentRunner(agent_worker)

### QnA

In [None]:
# Comparison question: answering it needs the income stated in both documents.
response = agent.query("Who earns more salary")

In [None]:
# Derived-value question: the documents state monthly incomes only, so a yearly
# figure has to be worked out from them (50 thousand a month is 600,000 a year).
response = agent.query("Who earns 600,000 yearly")

### Embeddings are stored here when we use VectorStoreIndex directly, since it handles the embeddings internally and stores them in memory

In [None]:
# Inspect the index's vector store by dumping its instance attributes.
if hasattr(vector_index, "vector_store"):
    print("Vector index vector_store attributes:")
    print(vars(vector_index.vector_store))