In [1]:
import tqdm
import numpy as np

# API Setup

In [2]:
from dotenv import load_dotenv
# Pull settings (e.g. the API keys used by the LLM clients) from the .env file one directory up.
ENV_PATH = "../.env"
load_dotenv(dotenv_path=ENV_PATH)

True

# Dataset

In [3]:
from datasets import load_dataset
# Fetch the document-level WikiText-103 (raw) configuration; remote dataset code stays disabled.
data = load_dataset(
    path="EleutherAI/wikitext_document_level",
    name="wikitext-103-raw-v1",
    trust_remote_code=False,
)

Repo card metadata block was not found. Setting CardData to empty.


In [4]:
# Seed NumPy's global RNG up front so any NumPy-based randomness is repeatable.
np.random.seed(42)

# Number of leading pages to keep from the train split.
sample_size = 100
# NOTE(review): data_index is not referenced by any other cell shown here - confirm it is still needed.
data_index = 38

# Rebinds `data` from the loaded dataset to the first `sample_size` rows of its
# "train" split; downstream cells read the pages from data["page"].
data = data["train"][:sample_size]

# Preprocessing

In [5]:
# Data cleaning
import re

# Matches one character wrapped as " @x@ " (spaces included); the replacement
# keeps only that character, e.g. "role @-@ playing" -> "role-playing".
pattern = re.compile(r" @(.)@ ")

# Clean every page, writing the results back into the existing list in place
data["page"][:] = [pattern.sub(r"\1", raw_page) for raw_page in data["page"]]

In [6]:
# Data enrichment
def extract_metadata(data):
    """Build the metadata dict for a single page of text.

    The title is the first span in ``data`` that looks like `` = heading = ``:
    1-50 characters containing no ``=``, with whitespace on both sides of each
    surrounding ``=``.

    Args:
        data: Page text to scan.

    Returns:
        ``{"title": <first matching heading>}``, or
        ``{"title": "Unknown Title"}`` when nothing matches.
    """
    # The pattern is loose and also hits later headings, so only the first
    # occurrence is trusted as the page title.
    first_heading = re.search(r"\s=\s([^=]{1,50})\s=\s", data)
    title = first_heading.group(1) if first_heading else "Unknown Title"
    return {"title": title}

In [7]:
# Load documents
from llama_index.core import Document

# One Document per cleaned page, tagged with the title pulled from its text.
documents = [
    Document(text=page_text, metadata=extract_metadata(page_text))
    for page_text in tqdm.tqdm(data["page"])
]

100%|██████████| 100/100 [00:00<00:00, 4140.23it/s]


# Chunking

In [8]:
from llama_index.core.node_parser import SentenceSplitter
# Splitter settings kept as named values so they are easy to tune.
CHUNK_SIZE = 512
CHUNK_OVERLAP = 20
chunker = SentenceSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

In [9]:
# Split every document into nodes (chunks), showing a progress bar while parsing.
nodes = chunker.get_nodes_from_documents(
    documents=documents,
    show_progress=True,
)

Parsing nodes:   0%|          | 0/100 [00:00<?, ?it/s]

In [10]:
# Compare the number of source documents with the number of chunks produced.
print(
    f"Documents before chunking: {len(documents)}",
    f"Documents after chunking: {len(nodes)}",
    sep="\n",
)
# Display the first chunk as the cell's output.
nodes[0]

Documents before chunking: 100
Documents after chunking: 837


TextNode(id_='e697c15b-1485-4fc9-9721-b93ce6b9a91e', embedding=None, metadata={'title': 'Valkyria Chronicles III'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='23b40453-88f9-4896-b836-02fa1bda18a9', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'title': 'Valkyria Chronicles III'}, hash='f7aadfb478d20e04be770cd882b5e6a44c185eb28a53810838586313c39ccc7c'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='48905c58-4673-464a-8f2d-d9be3d0604b9', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='32c32274359d6ec7e58a31e940b4b433c53354c2fb60611c7cc6bd2c324d075c')}, text='= Valkyria Chronicles III = \n \n Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role-playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in Jan

# Embedding

In [11]:
# Load embedding model
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
EMBED_MODEL_NAME = "BAAI/bge-small-en-v1.5"
EMBED_BATCH_SIZE = 32
embedding_model = HuggingFaceEmbedding(
    model_name=EMBED_MODEL_NAME,
    embed_batch_size=EMBED_BATCH_SIZE,
)



# Indexing

In [12]:
from llama_index.core import VectorStoreIndex
# Build the index straight from the chunked nodes, embedding them with the model loaded above.
index = VectorStoreIndex(
    nodes=nodes,
    embed_model=embedding_model,
    show_progress=True,
)

Generating embeddings:   0%|          | 0/837 [00:00<?, ?it/s]

# Vectorstore / Vector DB

In [13]:
# https://docs.llamaindex.ai/en/stable/examples/vector_stores/LanceDBIndexDemo/
from llama_index.vector_stores.lancedb import LanceDBVectorStore
from llama_index.core import StorageContext

# Open (or create) the local LanceDB database directory and point at its "test" table
vector_store = LanceDBVectorStore(table_name="test", uri="./lancedb")

# Wrap the store in a LlamaIndex storage context
# NOTE(review): storage_context is not used by any later cell shown here - confirm it is needed.
storage_context = StorageContext.from_defaults(
    vector_store=vector_store,
)

In [14]:
# Re-open the LanceDB table and expose it as an index.
# No cell shown here writes nodes into this table; it is expected to have been
# populated already by the previous notebook.
vector_store = LanceDBVectorStore(table_name="test", uri="./lancedb")
vdb_index = VectorStoreIndex.from_vector_store(
    vector_store=vector_store,
    embed_model=embedding_model,
)

# Basic RAG

We now have an index and a way to query it. To close the RAG loop, we take an input from the user, pass it to the retriever, send the retrieved context to the LLM, and return the LLM's output.

In [15]:
# Example question for the demo; the same query is sent through every LLM/index combination below.
user_input = "How many points did Michael Jordan actually score in his final NBA game?"

In [16]:
from llama_index.core.llms import ChatMessage
# Build our prompt
# System message: tells the model to cite sources and snippets, and to say
# 'I don't know' when the context does not contain the answer.
system_prompt = ChatMessage(
    role="system",
    content="You are a helpful AI assistant that answers questions and always cites sources (document titles & ID) as well as the relevant snippets. If the information is not present in the context, you will say 'I don't know'.",
)

# User-turn template; {context} and {question} are filled in with str.format.
# Written line by line so the trailing spaces in the template stay visible.
prompt = (
    "Context: \n"
    "{context} \n"
    "-----\n"
    "Question: {question} \n"
    "Answer: `answer`\n"
    "Source: `source`\n"
    "Relevant Snippet: `snippet`\n"
)

### LLM Choice

This demo supports two model APIs: OpenAI and Anyscale. Specifically, we use `gpt-4o` and `meta-llama/Meta-Llama-3-70B-Instruct`, but any supported model on either platform should work.

In [17]:
def retrieve(index, user_input, similarity_top_k=3):
    """Fetch the best-matching chunks for a query and render them as one context string.

    Args:
        index: Object exposing ``as_retriever(similarity_top_k=...)``; the
            notebook passes both ``index`` and ``vdb_index`` here.
        user_input: Query text handed to the retriever.
        similarity_top_k: How many chunks to request from the retriever.
            Defaults to 3, the value that used to be hard-coded.

    Returns:
        A single string with one section per retrieved chunk (title, document
        ID, then the chunk text), sections separated by a dashed line. Empty
        string when the retriever returns nothing.
    """
    retriever = index.as_retriever(similarity_top_k=similarity_top_k)
    sections = [
        # Fall back to the same placeholder extract_metadata uses, so a chunk
        # without a title does not abort the whole retrieval with a KeyError.
        f"Title: {hit.metadata.get('title', 'Unknown Title')}\n"
        f"Document ID: {hit.node_id}\n"
        f"{hit.text}"
        for hit in retriever.retrieve(user_input)
    ]
    return "\n----------------\n".join(sections)

In [22]:
# OpenAI + Flat Index
from llama_index.llms.openai import OpenAI

# OpenAI client; the API key comes from your .env file
llm = OpenAI(model="gpt-4o", max_tokens=2048, temperature=0.1)

# Retrieve supporting chunks from the index built directly from the nodes
context = retrieve(index, user_input)

# Fill the template, send system + user messages, and print the reply text
message = ChatMessage(
    role="user",
    content=prompt.format(question=user_input, context=context),
)
reply = llm.chat([system_prompt, message])
print(reply.message.content)

Michael Jordan scored 15 points in his final NBA game.

Source: Michael Jordan (Document ID: 908cf6b2-8c31-4219-baa2-50bbeb9a351a)
Relevant Snippet: "After scoring only 13 points in the game, Jordan went to the bench with 4 minutes and 13 seconds remaining in the third quarter... At 1:45, Jordan was intentionally fouled by the 76ers' Eric Snow, and stepped to the line to make both free throws."


In [19]:
# OpenAI + ANN Index

# OpenAI client; the API key comes from your .env file
llm = OpenAI(model="gpt-4o", max_tokens=2048, temperature=0.1)

# Retrieve supporting chunks from the LanceDB-backed index
context = retrieve(vdb_index, user_input)

# Fill the template, send system + user messages, and print the reply text
message = ChatMessage(
    role="user",
    content=prompt.format(question=user_input, context=context),
)
reply = llm.chat([system_prompt, message])
print(reply.message.content)

Michael Jordan scored 15 points in his final NBA game.

Source: Michael Jordan (Document ID: 5ab87d87-a790-4c80-b8bd-d3b9525e442d)
Relevant Snippet: "After scoring only 13 points in the game, Jordan went to the bench with 4 minutes and 13 seconds remaining in the third quarter... At 1:45, Jordan was intentionally fouled by the 76ers' Eric Snow, and stepped to the line to make both free throws."


In [20]:
# AnyScale + Flat Index
from llama_index.llms.anyscale import Anyscale

# Anyscale client; the API key comes from your .env file
llm = Anyscale(
    model="meta-llama/Meta-Llama-3-70B-Instruct",
    max_tokens=2048,
    temperature=0.1,
)

# Retrieve supporting chunks from the index built directly from the nodes
context = retrieve(index, user_input)

# Fill the template, send system + user messages, and print the reply text
message = ChatMessage(
    role="user",
    content=prompt.format(question=user_input, context=context),
)
reply = llm.chat([system_prompt, message])
print(reply.message.content)

Answer: 13 points
Source: Michael Jordan (Document ID: 908cf6b2-8c31-4219-baa2-50bbeb9a351a)
Relevant Snippet: "After scoring only 13 points in the game, Jordan went to the bench with 4 minutes and 13 seconds remaining in the third quarter and with his team trailing the Philadelphia 76ers, 75–56."


In [21]:
# AnyScale + ANN Index
from llama_index.llms.anyscale import Anyscale

# Anyscale client; the API key comes from your .env file
llm = Anyscale(
    model="meta-llama/Meta-Llama-3-70B-Instruct",
    max_tokens=2048,
    temperature=0.1,
)

# Retrieve supporting chunks from the LanceDB-backed index
context = retrieve(vdb_index, user_input)

# Fill the template, send system + user messages, and print the reply text
message = ChatMessage(
    role="user",
    content=prompt.format(question=user_input, context=context),
)
reply = llm.chat([system_prompt, message])
print(reply.message.content)

Answer: 13 points
Source: Michael Jordan (Document ID: 5ab87d87-a790-4c80-b8bd-d3b9525e442d)
Relevant Snippet: "After scoring only 13 points in the game, Jordan went to the bench with 4 minutes and 13 seconds remaining in the third quarter and with his team trailing the Philadelphia 76ers, 75–56."
