### RAG over multiple documents
TODO
- Add more documents
- Consider using a summarization tool to auto-generate the per-document summaries
- For every document, persist the index as well as the generated summary

In [225]:
import os
import nest_asyncio

# Patch asyncio via nest_asyncio before any query runs.
# NOTE(review): presumably needed because the notebook kernel already runs an
# event loop and the query engines use asyncio internally - confirm.
nest_asyncio.apply()

In [226]:
# Backend switch read by the cells below: True selects the OpenAI models,
# False selects the local Ollama models.
USE_OPENAI: bool = False

# Chunking parameters handed to SentenceSplitter when Settings is configured.
CHUNK_SIZE: int = 512
CHUNK_OVERLAP: int = 20

In [227]:
from llama_index.core import Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.core.node_parser import SentenceSplitter

# Configure the global llama_index Settings once: the LLM, the embedding model
# and the node parser used by the rest of the notebook.
if not USE_OPENAI:
    # Local backend: both the LLM and the embedding model are Ollama models.
    Settings.llm = Ollama(
        model="llama3:instruct",
        request_timeout=120.0,
    )
    Settings.embed_model = OllamaEmbedding(
        model_name="llama3:instruct",
        base_url="http://localhost:11434",
        ollama_additional_kwargs={"mirostat": 0},
    )
else:
    # Hosted backend: the LLM key is read from the OPENAI_API_KEY environment variable.
    Settings.llm = OpenAI(
        model="gpt-3.5-turbo",
        api_key=os.getenv("OPENAI_API_KEY"),
    )
    Settings.embed_model = OpenAIEmbedding(model="text-embedding-ada-002")

# The splitter uses the chunk size and overlap from the configuration cell.
Settings.node_parser = SentenceSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

In [228]:
# Each entry pairs a document path with the description the router uses to
# decide whether that document can answer a question. Keeping path and
# description side by side means the two derived lists below cannot drift out
# of sync when a document is added, removed or reordered.
document_specs = [
    (
        "../data/idpp.pdf",
        "Useful for retrieving specific context from the iDPP paper which is about predicting ALSFRS-R rating scores for ALS patients.",
    ),
    (
        "../data/metagpt.pdf",
        "Useful for retrieving specific context from the MetaGPT paper.",
    ),
    (
        "../data/state_of_the_union.txt",
        "Useful for retrieving specific context from the state of the union speech by the president.",
    ),
    (
        "../data/Federal Tax Return 2021.pdf",
        "Useful for retrieving specific context from the Federal Tax Return of 2021 detailing things like gross income, taxable income, tax paid and so on",
    ),
    (
        "../data/Financial Assessment.docx",
        "Useful for retrieving specific context from Financial Assessment detailing how much to spend per month on various things",
    ),
    (
        "../data/IELTS Result - March 2023.pdf",
        "Useful for retrieving specific context from my IELTS result I got in 2023 denoting my English speaking skills",
    ),
    (
        "../data/Shashank Verma - Resume.pdf",
        "Useful for retrieving specific context from my resume specifying what projects I've worked on, where I studied, what my qualifications are, etc",
    ),
    # Disabled (has no matching summary): "../data/Income Tax Return Transcript 2020.pdf"
]

# Positional lists consumed by the cells below; index i refers to the same
# document in both.
input_files = [path for path, _ in document_specs]
file_summaries = [summary for _, summary in document_specs]

In [229]:
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.core import StorageContext, load_index_from_storage

# Build one vector index per input file; vector_indices[i] belongs to input_files[i].
vector_indices = []
for file in input_files:
    # Cache key = file name minus its final extension. Cutting the name at the
    # first "." instead would give "report.v1.pdf" and "report.v2.pdf" the same
    # key, so the second file would load the first file's persisted index.
    filename = os.path.splitext(os.path.basename(file))[0]
    persist_dir = f"file_embeddings/openAI/{filename}"

    # Read from storage if using OPENAI and the embeddings exist
    if USE_OPENAI and os.path.exists(persist_dir):
        print (f"Retrieving vector_index for {filename} from storage")
        storage_context = StorageContext.from_defaults(persist_dir=persist_dir)
        vector_index = load_index_from_storage(storage_context=storage_context)
    # Otherwise create index using regular method
    else:
        print (f"Creating vector_index for {filename}")
        loader = SimpleDirectoryReader(input_files=[file])
        documents = loader.load_data()

        # Manual alternative (disabled): split the documents with
        # Settings.node_parser.get_nodes_from_documents(documents) and build
        # VectorStoreIndex(nodes) from the resulting nodes.
        vector_index = VectorStoreIndex.from_documents(documents)

    vector_indices.append(vector_index)

Retrieving vector_index for idpp from storage
Retrieving vector_index for metagpt from storage
Retrieving vector_index for state_of_the_union from storage
Creating vector_index for Federal Tax Return 2021
Creating vector_index for Financial Assessment
Creating vector_index for IELTS Result - March 2023
Creating vector_index for Shashank Verma - Resume


In [230]:
# Persist every index under file_embeddings/openAI/<name>; only done for the
# OpenAI backend.
if USE_OPENAI:
    for i, file in enumerate(input_files):
        # Directory name = file name minus its final extension. Cutting the
        # name at the first "." instead would make "report.v1.pdf" and
        # "report.v2.pdf" overwrite each other's persisted index.
        filename = os.path.splitext(os.path.basename(file))[0]
        print (f"Saving vector_index for {filename}")
        vector_indices[i].storage_context.persist(f"file_embeddings/openAI/{filename}")

Saving vector_index for idpp
Saving vector_index for metagpt
Saving vector_index for state_of_the_union
Saving vector_index for Federal Tax Return 2021
Saving vector_index for Financial Assessment
Saving vector_index for IELTS Result - March 2023
Saving vector_index for Shashank Verma - Resume


In [231]:
from llama_index.core.tools import QueryEngineTool

# The three lists are matched purely by position, so a length mismatch would
# either pair a document with the wrong description or silently drop entries.
# Fail early with a message that names the actual sizes.
if not (len(input_files) == len(vector_indices) == len(file_summaries)):
    raise ValueError(
        "input_files, vector_indices and file_summaries must have the same length, got "
        f"{len(input_files)}, {len(vector_indices)} and {len(file_summaries)}"
    )

# One tool per document: the index's query engine plus the description the
# router's selector reads when choosing a tool.
query_engine_tools = [
    QueryEngineTool.from_defaults(
        query_engine=vector_index.as_query_engine(),
        description=summary,
    )
    for vector_index, summary in zip(vector_indices, file_summaries)
]

len(query_engine_tools)

7

In [232]:
# Disabled alternative to the RouterQueryEngine cell below: instead of an LLM
# selector, the tools are placed in an ObjectIndex and fetched via its retriever.
# # Using ToolRetrieverRouterQueryEngine
# from llama_index.core import VectorStoreIndex
# from llama_index.core.objects import ObjectIndex
# from llama_index.core.query_engine import ToolRetrieverRouterQueryEngine

# obj_index = ObjectIndex.from_objects(query_engine_tools, index_cls=VectorStoreIndex)
# query_engine = ToolRetrieverRouterQueryEngine(obj_index.as_retriever())

# response = query_engine.query("What do the authors say in iDPP paper")
# print(str(response))

In [233]:
# Using RouterQueryEngine
from llama_index.core.query_engine.router_query_engine import RouterQueryEngine
from llama_index.core.selectors import LLMSingleSelector

# Router that picks one of the per-document tools for each question using an
# LLM-based single selector; verbose=True prints the selection.
query_engine = RouterQueryEngine(
    query_engine_tools=query_engine_tools,
    selector=LLMSingleSelector.from_defaults(),
    verbose=True,
)

In [234]:
# Send the question through the router and print the response.
response = query_engine.query(
    "What do the authors say in iDPP paper"
)
print(response)

[1;3;38;5;200mSelecting query engine 0: The iDPP paper is specifically mentioned in choice 1, indicating that it is relevant for retrieving specific context from the iDPP paper..
[0mThe authors in the iDPP paper discuss their approach to predicting ALSFRS-R scores using various techniques, starting with a naive model as a baseline and then exploring different Machine Learning algorithms for regression, along with a Long Short-Term Memory (LSTM) neural network to capture temporal dependencies in sequential sensor data. They evaluate the performance of these models using Root Mean Squared Error (RMSE) and Mean Absolute Error (MAE) metrics. The paper also covers related work in the field, the methodology employed, experimental results, implications of their findings, future work directions, and concludes the study.


In [235]:
# Send the question through the router and print the response.
response = query_engine.query(
    "What were the models tried for predicting ALS progression in the idpp paper?."
)
print(response)

[1;3;38;5;200mSelecting query engine 0: The iDPP paper is specifically mentioned in choice 1, which is about predicting ALSFRS-R rating scores for ALS patients. This makes it the most relevant choice for the question about models tried for predicting ALS progression in the iDPP paper..
[0mThe models tried for predicting ALS progression in the idpp paper included a naive model that carried the last observed value forward, various Machine Learning algorithms for regression, and a Long Short-Term Memory (LSTM) neural network to capture temporal dependencies in the sequential sensor data.


In [236]:
# Send the question through the router and print the response.
response = query_engine.query(
    "What was the validation strategy used by the authors in the idpp paper?."
)
print(response)

[1;3;38;5;200mSelecting query engine 0: The iDPP paper is specifically mentioned in choice 1, indicating that it is relevant to retrieving specific context from the paper, which would likely include details about the validation strategy used by the authors..
[0mGrid search using cross validation on the entire training data was the validation strategy used by the authors in the idpp paper.


In [237]:
# Send the question through the router and print the response.
response = query_engine.query(
    "Which model performed the best with lowest RMSE in the idpp paper?."
)
print(response)

[1;3;38;5;200mSelecting query engine 0: The iDPP paper is specifically mentioned in choice 1, making it the most relevant option for retrieving specific context about the model performance in the iDPP paper..
[0mThe ElasticNet model performed the best with the lowest RMSE in the idpp paper.


In [238]:
# Send the question through the router and print the response.
response = query_engine.query(
    "What did the president say about Justice Breyer"
)
print(response)

[1;3;38;5;200mSelecting query engine 2: The State of the Union speech by the president is likely to contain information about Justice Breyer.
[0mThe president honored Justice Breyer for his service to the country, acknowledging him as an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court.


In [239]:
# Send the question through the router and print the response.
response = query_engine.query(
    "How do MetaGPT agents share information with other agents?"
)
print(response)

[1;3;38;5;200mSelecting query engine 1: MetaGPT paper is likely to contain information on how MetaGPT agents share information with other agents..
[0mMetaGPT agents share information with other agents by utilizing a shared message pool. This pool allows agents to exchange messages directly, publish structured messages, and access information from other entities transparently. Agents can retrieve required information from the shared pool without needing to inquire about other agents or wait for their responses, thereby enhancing communication efficiency.


In [240]:
# Send the question through the router and print the response.
response = query_engine.query(
    "What was my reading score in IELTS exam?"
)
print(response)

[1;3;38;5;200mSelecting query engine 5: The IELTS exam is specifically mentioned in choice 6, which is about retrieving specific context from the IELTS result..
[0mYour reading score in the IELTS exam was 8.5.


In [241]:
# Send the question through the router and print the response.
response = query_engine.query(
    "What is my educational background according to my resume?"
)
print(response)

[1;3;38;5;200mSelecting query engine 6: My resume specifies where I studied, which is relevant to my educational background..
[0mYou have completed an Artificial Intelligence Professional Program at Stanford University from May 2021 to August 2022, and you also hold a Bachelor's degree in Computer Science and Engineering from the Indian Institute of Technology, Delhi, New Delhi, from July 2009 to May 2013.


In [242]:
# Send the question through the router and print the response.
response = query_engine.query(
    "How much do I expect to spend per month on Hobbies?"
)
print(response)

[1;3;38;5;200mSelecting query engine 4: Financial Assessment detailing how much to spend per month on various things.
[0mYou can expect to spend $500 per month on Hobbies.


In [243]:
# Send the question through the router and print the response.
response = query_engine.query(
    "What was my total tax amount to be paid in 2021?"
)
print(response)

[1;3;38;5;200mSelecting query engine 3: The Federal Tax Return of 2021 detailing things like gross income, taxable income, tax paid and so on would be the most relevant choice for retrieving information about the total tax amount to be paid in 2021..
[0mYour total tax amount to be paid in 2021 was $207,117.00.


In [246]:
# Send the question through the router and print the response.
response = query_engine.query(
    "What was my total gross income in 2021?"
)
print(response)

[1;3;38;5;200mSelecting query engine 3: The Federal Tax Return of 2021 would detail your gross income for that year..
[0mYour total gross income in 2021 was $746,812.00.
