In [1]:
import tqdm
import numpy as np
import utils

# API Setup

In [2]:
from dotenv import load_dotenv
# Load environment variables from the .env file one directory up; the LLM clients
# created later in this notebook rely on it for their API keys.
load_dotenv(dotenv_path="../.env")

True

# Dataset

We've abstracted away the code from the previous notebooks to focus on the concepts from this notebook.

In [3]:
# Load the dataset through the shared helper in utils.
# sample_size=100 presumably caps the number of records loaded (the following cells report 100 documents).
data = utils.load_data(sample_size=100)

Repo card metadata block was not found. Setting CardData to empty.


# Preprocessing
We've abstracted away the code from the previous notebooks to focus on the concepts from this notebook.

In [4]:
# Convert the loaded records into documents; the implementation lives in utils
# and was covered in the previous notebooks.
documents = utils.preprocess_data(data)

100%|██████████| 100/100 [00:00<00:00, 2821.58it/s]


# Chunking
We've abstracted away the code from the previous notebooks to focus on the concepts from this notebook.

In [5]:
# Split the documents into smaller chunks ("nodes") that will be embedded and indexed.
# The helper reports the document counts before and after chunking.
nodes = utils.chunk_documents(documents)

Parsing nodes:   0%|          | 0/100 [00:00<?, ?it/s]

Documents before chunking: 100
Documents after chunking: 837


# Embedding
We've abstracted away the code from the previous notebooks to focus on the concepts from this notebook.

In [6]:
# Embedding model shared by both indexes below (the flat in-memory index and the LanceDB-backed one).
embedding_model = utils.get_embedding_model(model_name="BAAI/bge-small-en-v1.5")



# Indexing

We use the same indexes as in the previous notebook.

In [7]:
# Recreate the flat in-memory index
from llama_index.core import VectorStoreIndex
# Building the index embeds the chunked nodes with embedding_model; show_progress
# displays a progress bar while the embeddings are generated.
index = VectorStoreIndex(nodes, embed_model=embedding_model, show_progress=True)

Generating embeddings:   0%|          | 0/837 [00:00<?, ?it/s]

In [8]:
# Load the ANN index we created in the previous notebook from disk
from llama_index.vector_stores.lancedb import LanceDBVectorStore
# Points at the local ./lancedb directory and its "test" table.
# NOTE(review): assumes the previous notebook has already been run so the table exists — no nodes are inserted here.
vector_store = LanceDBVectorStore(
    uri="./lancedb", table_name="test"
)
# Wrap the existing vector store in an index, using the same embedding model as the flat index.
vdb_index = VectorStoreIndex.from_vector_store(
    vector_store,
    embed_model=embedding_model,
)

# Basic RAG

So now we have an index and a way to query the index. To close the RAG loop here, we need to get an input from a user, pass it to the retriever, send the retrieved context to the LLM, and return the output. 

In [9]:
# Test question reused for every LLM / index combination below.
user_input = "How many points did Michael Jordan actually score in his final NBA game?"

In [10]:
from llama_index.core.llms import ChatMessage
# Build our prompt
# System message: tells the model to answer only from the provided context, cite
# document titles, quote the supporting snippet, and say 'I don't know' otherwise.
system_prompt = "You are a helpful AI assistant that answers questions after carefully reading all the provided context. You always cite sources (document titles) and also quote the relevant snippets. Information may be spread across multiple documents. If the information is not present in any of the contexts, you will say 'I don't know'."

# User-message template: {context_str} and {query_str} are filled in with str.format
# inside rag(); the trailing lines show the model the output layout we expect.
prompt = """Context: 
{context_str} 
-----
Question: {query_str} 
Answer: `answer`
Source: `source`
Relevant Snippet: `snippet`
"""

### LLM Choice

This demo supports two model APIs: OpenAI and Anyscale. Specifically, we use `gpt-4o` and `meta-llama/Meta-Llama-3-70B-Instruct`, but any chat model supported by either provider should work.

In [11]:
def retrieve(index, user_input, similarity_top_k=3):
    """Fetch the chunks most similar to ``user_input`` and format them as one context string.

    Args:
        index: Any object exposing ``as_retriever(similarity_top_k=...)`` whose
            retriever has a ``retrieve(query)`` method (both indexes in this
            notebook are used this way).
        user_input: The question to search the index with.
        similarity_top_k: How many chunks to retrieve. Defaults to 3, the value
            that was previously hard-coded.

    Returns:
        A string with one "Title: <title>" header plus chunk text per result,
        separated by a dashed divider line. Empty string when nothing is retrieved.

    Raises:
        KeyError: If a retrieved result has no ``'title'`` entry in its metadata.
    """
    retriever = index.as_retriever(similarity_top_k=similarity_top_k)
    results = retriever.retrieve(user_input)
    # Prefix every chunk with its document title so the LLM can cite its sources.
    return "\n----------------\n".join(
        f"Title: {result.metadata['title']}\n{result.text}" for result in results
    )

In [12]:
def rag(llm, index, system_prompt, prompt, user_input):
    """Run one retrieve -> augment -> generate pass and return the LLM's chat response.

    Args:
        llm: Chat-capable LLM client exposing ``chat(messages)``.
        index: Index handed to ``retrieve`` to look up supporting context.
        system_prompt: Text sent as the system message.
        prompt: Template containing ``{context_str}`` and ``{query_str}`` placeholders.
        user_input: The user's question.

    Returns:
        Whatever ``llm.chat`` returns for the two-message conversation.
    """
    # Retrieve: look up the passages that will ground the answer.
    retrieved_context = retrieve(index, user_input)

    # Augment: drop the context and the question into the template.
    user_text = prompt.format(context_str=retrieved_context, query_str=user_input)

    # Generate: system instructions first, then the filled-in user prompt.
    system_message = ChatMessage(content=system_prompt, role="system")
    user_message = ChatMessage(content=user_text, role="user")
    return llm.chat([system_message, user_message])

In [13]:
# OpenAI + Flat Index
from llama_index.llms.openai import OpenAI

# Set up OpenAI client - API key is handled in your .env file
llm = OpenAI(model="gpt-4o", temperature=0.1, max_tokens=2048)

# Generate output
# rag() returns the chat response object; .message.content is the assistant's text.
print(rag(llm, index, system_prompt, prompt, user_input).message.content)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Answer: Michael Jordan scored 15 points in his final NBA game.

Source: Michael Jordan

Relevant Snippet: "Jordan's final NBA game was on April 16, 2003 in Philadelphia. After scoring only 13 points in the game, Jordan went to the bench... Jordan finally rose from the bench and re-entered the game... At 1:45, Jordan was intentionally fouled by the 76ers' Eric Snow, and stepped to the line to make both free throws."


In [14]:
# OpenAI + ANN Index
# NOTE(review): relies on the OpenAI import from the "OpenAI + Flat Index" cell above.

# Set up OpenAI client - API key is handled in your .env file
llm = OpenAI(model="gpt-4o", temperature=0.1, max_tokens=2048)

# Generate output
# Same question and prompts as the flat-index run; only the index (vdb_index) differs.
print(rag(llm, vdb_index, system_prompt, prompt, user_input).message.content)

Answer: Michael Jordan scored 15 points in his final NBA game.

Source: Michael Jordan

Relevant Snippet: "After scoring only 13 points in the game, Jordan went to the bench with 4 minutes and 13 seconds remaining in the third quarter... At 1:45, Jordan was intentionally fouled by the 76ers' Eric Snow, and stepped to the line to make both free throws."


In [15]:
# AnyScale + Flat Index
from llama_index.llms.anyscale import Anyscale

# Set up AnyScale client - API key is handled in your .env file
llm = Anyscale(model="meta-llama/Meta-Llama-3-70B-Instruct", temperature=0.1, max_tokens=2048)

# Generate output
# Same question, prompts, and flat index as the OpenAI run; only the LLM differs.
print(rag(llm, index, system_prompt, prompt, user_input).message.content)

Answer: 13 points
Source: Michael Jordan
Relevant Snippet: "After scoring only 13 points in the game, Jordan went to the bench with 4 minutes and 13 seconds remaining in the third quarter and with his team trailing the Philadelphia 76ers, 75–56."


In [16]:
# AnyScale + ANN Index
# NOTE(review): duplicate of the import in the "AnyScale + Flat Index" cell; harmless on re-run.
from llama_index.llms.anyscale import Anyscale

# Set up AnyScale client - API key is handled in your .env file
llm = Anyscale(model="meta-llama/Meta-Llama-3-70B-Instruct", temperature=0.1, max_tokens=2048)

# Generate output
# Same question and prompts as the flat-index run; only the index (vdb_index) differs.
print(rag(llm, vdb_index, system_prompt, prompt, user_input).message.content)

Answer: 13 points
Source: Michael Jordan
Relevant Snippet: "After scoring only 13 points in the game, Jordan went to the bench with 4 minutes and 13 seconds remaining in the third quarter and with his team trailing the Philadelphia 76ers, 75–56."
