### RAG over multiple documents
TODO
- Add more documents
- Maybe use summary tool to auto-generate summaries
- For every doc, persist both the index and the generated summary

In [140]:
# IPython autoreload: mode 2 re-imports modules before each cell runs, so edits
# to local helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import os
import nest_asyncio
from utils.gdrive_api import download_gfile

# Patch asyncio so nested event loops are allowed.
# NOTE(review): presumably needed because LlamaIndex runs async code inside the
# notebook's already-running event loop — confirm.
nest_asyncio.apply()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [141]:
# Backend switch: True configures OpenAI for the LLM and embeddings, False
# configures local Ollama models. Later cells also use it to gate loading and
# saving of persisted indices under file_embeddings/openAI/.
USE_OPENAI = True

# Passed to SentenceSplitter as chunk_size and chunk_overlap when the global
# node parser is configured.
CHUNK_SIZE = 512
CHUNK_OVERLAP = 20

In [142]:
# Download files from Drive - Uncomment and add filenames as required
# filenames = [
#     'Trippy Trip']
# for filename in filenames:
#     output_path = "../data/" + (filename if filename.endswith('.pdf') else filename + '.pdf')
#     download_gfile(filename, output_path)

In [143]:
from llama_index.core import Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.core.node_parser import SentenceSplitter

# Configure the global LlamaIndex Settings: choose the LLM and embedding model
# for the selected backend, then install the sentence splitter as node parser.
if USE_OPENAI:
    Settings.llm = OpenAI(
        model="gpt-3.5-turbo",
        api_key=os.getenv('OPENAI_API_KEY'),
    )
    Settings.embed_model = OpenAIEmbedding(model="text-embedding-ada-002")
else:
    Settings.llm = Ollama(model="llama3:instruct", request_timeout=120.0)
    # Extra options forwarded to Ollama for embedding requests.
    ollama_embedding_options = {"mirostat": 0}
    Settings.embed_model = OllamaEmbedding(
        model_name="llama3:instruct",
        base_url="http://localhost:11434",
        ollama_additional_kwargs=ollama_embedding_options,
    )

# The same splitter configuration is used regardless of the backend.
sentence_splitter = SentenceSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
Settings.node_parser = sentence_splitter

In [144]:
# Catalogue of documents to index. Each entry pairs a file name (relative to
# ../data/) with the description later handed to that document's query tool,
# so a file and its summary are always edited together.
document_catalog = [
    ("idpp.pdf",
     "Useful for retrieving specific context from the iDPP paper which is about predicting ALSFRS-R rating scores for ALS patients."),
    ("metagpt.pdf",
     "Useful for retrieving specific context from the MetaGPT paper."),
    ("state_of_the_union.txt",
     "Useful for retrieving specific context from the state of the union speech by the president."),
    ("Federal Tax Return 2021.pdf",
     "Useful for retrieving specific context from the Federal Tax Return of 2021 detailing things like gross income, taxable income, tax paid and so on"),
    ("Financial Assessment.docx",
     "Useful for retrieving specific context from Financial Assessment detailing how much to spend per month on various things"),
    ("IELTS Result - March 2023.pdf",
     "Useful for retrieving specific context from my IELTS result I got in 2023 denoting my English speaking skills"),
    ("Shashank Verma - Resume.pdf",
     "Useful for retrieving specific context from my resume of 2023 specifying what projects I've worked on, where I studied, what my qualifications are, etc"),
    ("Shashank Verma - Resume (2024).pdf",
     "Useful for retrieving specific context from my resume of 2024 specifying what projects I've worked on, where I studied, what my qualifications are, etc"),
    ("LLM Pointers.pdf",
     "Useful for retrieving specific context from some pointers I've gathered regarding LLMs"),
    ("Multi-class classification via Transfer Learning for classifying dog breeds from images.pdf",
     "Useful for retrieving specific context from a project I did where I had to do multi-class classification via transfer learning for classifying dog breeds from images"),
    ("Incorporating adaptive feedback capability to Interactive Videos tutor in Project Ivy.pdf",
     "Useful for retrieving specific context from my project proposal for 'Ivy' master's project with Professor Ashok Goel where we want to build an interactive tutor for incorporating adaptive feedback"),
    ("Spring 2024 Reflection Report - Shashank.pdf",
     "Useful for retrieving specific context from my reflection report at the end of Spring semester in 2024 for Project 'Ivy' - my master's project"),
    ("Shashank - Medical Log.pdf",
     "Useful for retrieving specific context from my log of medical tests and conditions over the years"),
    ("Pranjali - Medical Log.pdf",
     "Useful for retrieving specific context from Pranjali's log of medical tests and conditions over the years"),
    ("Shashank - 2023-06-16 Lab Results.pdf",
     "Useful for retrieving specific context from my bloodwork lab results from 2023"),
    ("Pranjali - 2023-06-16 Lab Results.pdf",
     "Useful for retrieving specific context from Pranjali's bloodwork lab results from 2023"),
    ("Trippy Trip.pdf",
     "Useful for retrieving specific context from all our trips that Pranjali and myself have taken over the years including trip plans and how much it cost on those trips"),
]

# Derive the two position-aligned lists that the later cells consume.
input_files = ["../data/" + file_name for file_name, _ in document_catalog]
file_summaries = [summary for _, summary in document_catalog]

assert len(input_files) == len(file_summaries)

In [145]:
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.core import StorageContext, load_index_from_storage

# Build one vector index per input file, in the same order as input_files.
# When running against OpenAI and a persisted copy exists on disk, the index is
# loaded from storage; otherwise it is created from the document.
vector_indices = []
for file_path in input_files:
    # Storage key is the base name up to its first '.'.
    # NOTE(review): a name containing extra dots (e.g. "report.v2.pdf") is cut
    # at the first one, so two such files could share a key. The persisting
    # cell derives its key the same way, so change both together.
    filename = os.path.basename(file_path).split('.')[0]
    persist_dir = f"file_embeddings/openAI/{filename}"
    has_persisted_copy = USE_OPENAI and os.path.exists(persist_dir)

    if not has_persisted_copy:
        print(f"Creating vector_index for {filename}")
        documents = SimpleDirectoryReader(input_files=[file_path]).load_data()
        # Alternative kept for reference: split the documents with
        # Settings.node_parser and build VectorStoreIndex(nodes) directly.
        vector_index = VectorStoreIndex.from_documents(documents)
    else:
        print(f"Retrieving vector_index for {filename} from storage")
        storage_context = StorageContext.from_defaults(persist_dir=persist_dir)
        vector_index = load_index_from_storage(storage_context=storage_context)

    vector_indices.append(vector_index)

Retrieving vector_index for idpp from storage
Retrieving vector_index for metagpt from storage
Retrieving vector_index for state_of_the_union from storage
Retrieving vector_index for Federal Tax Return 2021 from storage
Retrieving vector_index for Financial Assessment from storage
Retrieving vector_index for IELTS Result - March 2023 from storage
Retrieving vector_index for Shashank Verma - Resume from storage
Retrieving vector_index for Shashank Verma - Resume (2024) from storage
Retrieving vector_index for LLM Pointers from storage
Retrieving vector_index for Multi-class classification via Transfer Learning for classifying dog breeds from images from storage
Retrieving vector_index for Incorporating adaptive feedback capability to Interactive Videos tutor in Project Ivy from storage
Retrieving vector_index for Spring 2024 Reflection Report - Shashank from storage
Retrieving vector_index for Shashank - Medical Log from storage
Retrieving vector_index for Pranjali - Medical Log from storage

In [146]:
# Persist every index to disk, but only when the OpenAI backend is active.
if USE_OPENAI:
    for position, file_path in enumerate(input_files):
        # Same key derivation as the loading cell: base name up to its first '.'.
        # NOTE(review): names with extra dots are cut at the first one; keep
        # this in sync with the loading cell if it is ever changed.
        filename = os.path.basename(file_path).split('.')[0]
        print(f"Saving vector_index for {filename}")
        persist_dir = f"file_embeddings/openAI/{filename}"
        vector_indices[position].storage_context.persist(persist_dir)

Saving vector_index for idpp
Saving vector_index for metagpt
Saving vector_index for state_of_the_union
Saving vector_index for Federal Tax Return 2021
Saving vector_index for Financial Assessment
Saving vector_index for IELTS Result - March 2023
Saving vector_index for Shashank Verma - Resume
Saving vector_index for Shashank Verma - Resume (2024)
Saving vector_index for LLM Pointers
Saving vector_index for Multi-class classification via Transfer Learning for classifying dog breeds from images
Saving vector_index for Incorporating adaptive feedback capability to Interactive Videos tutor in Project Ivy
Saving vector_index for Spring 2024 Reflection Report - Shashank
Saving vector_index for Shashank - Medical Log
Saving vector_index for Pranjali - Medical Log
Saving vector_index for Shashank - 2023-06-16 Lab Results
Saving vector_index for Pranjali - 2023-06-16 Lab Results
Saving vector_index for Trippy Trip


In [147]:
from llama_index.core.tools import QueryEngineTool

# Wrap each document's index in a query-engine tool, described by the summary
# at the same position in file_summaries.
query_engine_tools = [
    QueryEngineTool.from_defaults(
        query_engine=vector_indices[position].as_query_engine(),
        description=file_summaries[position],
    )
    for position in range(len(input_files))
]

# Displayed as the cell output: the number of tools built.
len(query_engine_tools)

17

In [148]:
# # Using ToolRetrieverRouterQueryEngine
# from llama_index.core import VectorStoreIndex
# from llama_index.core.objects import ObjectIndex
# from llama_index.core.query_engine import ToolRetrieverRouterQueryEngine

# obj_index = ObjectIndex.from_objects(query_engine_tools, index_cls=VectorStoreIndex)
# query_engine = ToolRetrieverRouterQueryEngine(obj_index.as_retriever())

# response = query_engine.query("What do the authors say in iDPP paper")
# print(str(response))

In [149]:
# Using RouterQueryEngine
from llama_index.core.query_engine.router_query_engine import RouterQueryEngine
from llama_index.core.selectors import LLMSingleSelector

# Router that forwards each query to one of the per-document tools, chosen by
# an LLM-backed single selector; verbose mode prints the selection.
tool_selector = LLMSingleSelector.from_defaults()
query_engine = RouterQueryEngine(
    selector=tool_selector,
    query_engine_tools=query_engine_tools,
    verbose=True,
)

In [153]:
# Send one sample question through the router and print the answer.
query = "Where did we go in 2017?"
response = query_engine.query(query)
print(response)

Selecting query engine 16: This choice is relevant as it pertains to all the trips taken over the years, including trip plans and costs. By referring to this choice, you can retrieve specific context about the trips taken in 2017..
Euro 2.0


### Sample Questions
- What do the authors say in the iDPP paper?
- What were the models tried for predicting ALS progression in the idpp paper?
- What was the validation strategy used by the authors in the idpp paper?
- Which model performed the best with lowest RMSE in the idpp paper?
- What did the president say about Justice Breyer?
- How do MetaGPT agents share information with other agents?
- What was my reading score in IELTS exam?
- What is my educational background according to my resume?
- How much do I expect to spend per month on Hobbies?
- What was my total tax amount to be paid in 2021?
- What was my total gross income in 2021?
- What is my OpenAI secret key?
- What were my deliverables in Ivy?
- Are LLMs free to use?
- Describe my methodology for the dog breed classification project.
- Give me my medical history.
- What are some of the notable projects I've worked on in my professional career?
- What is Pranjali's medical history?
- What is the summary of my latest bloodwork?
- Where did we go in 2017?