In [1]:
import tqdm
import numpy as np

# API Setup

In [2]:
# Load environment variables from the .env file at ../.env (a relative path).
from dotenv import load_dotenv
load_dotenv(dotenv_path="../.env")

True

# Dataset

In [3]:
# Load the "wikitext-103-raw-v1" configuration of the document-level WikiText dataset.
# trust_remote_code=False: do not allow dataset-provided code to run during loading.
from datasets import load_dataset
data = load_dataset("EleutherAI/wikitext_document_level", "wikitext-103-raw-v1", trust_remote_code=False)

Repo card metadata block was not found. Setting CardData to empty.


In [4]:
sample_size = 100  # number of rows taken from the "train" split below
data_index = 38  # NOTE(review): not referenced in any visible cell — confirm it is still needed
np.random.seed(42)  # seed NumPy's global random state
# NOTE(review): this rebinds `data` from the full dataset to a slice of its "train" split,
# so the cell is not idempotent — running it a second time without re-running the load
# cell presumably fails on data["train"]. Confirm and consider a separate variable name.
data = data["train"][:sample_size]

# Preprocessing

In [5]:
# Data cleaning
import re
# Matches " @x@ " (a single character wrapped in @ signs and spaces) and captures x
pattern = re.compile(r" @(.)@ ")

# Rewrite every page in place, keeping only the captured character for each match
data["page"][:] = [pattern.sub(r"\1", raw_page) for raw_page in data["page"]]

In [6]:
# Data enrichment
# Heading of the form " = Title = ". The opening "=" may sit at the very start of
# the text or follow any whitespace character. "[^=]" keeps the capture from running
# across several headings and {1,50} caps the captured length.
_TITLE_PATTERN = re.compile(r"(?:^|\s)=\s([^=]{1,50})\s=\s")


def extract_metadata(data):
    """Derive document-level metadata from the raw text of a page.

    Args:
        data: Raw page text.

    Returns:
        A dict with a single key, "title": the text captured from the first
        " = ... = " heading in the page, or "Unknown Title" when no heading
        matches.
    """
    # The pattern isn't perfect (it can also match inside deeper headings such as
    # " = = Section = = "), so the first match is taken as the title. search()
    # stops at that first match instead of collecting every heading in the page.
    match = _TITLE_PATTERN.search(data)
    return {"title": match.group(1) if match else "Unknown Title"}

In [7]:
# Load documents
from llama_index.core import Document

# One Document per page: the page text plus the metadata extracted from that text
documents = [
    Document(text=page_text, metadata=extract_metadata(page_text))
    for page_text in tqdm.tqdm(data["page"])
]

100%|██████████| 100/100 [00:00<00:00, 2436.76it/s]


# Chunking

In [8]:
from llama_index.core.node_parser import SentenceSplitter
# chunk_size / chunk_overlap are presumably measured in tokens — confirm against the
# SentenceSplitter documentation before tuning them.
chunker = SentenceSplitter(chunk_size=512, chunk_overlap=20)

In [9]:
# Split every document into nodes (chunks) with the splitter configured above;
# show_progress=True displays a progress bar while parsing.
nodes = chunker.get_nodes_from_documents(documents, show_progress=True)

Parsing nodes:   0%|          | 0/100 [00:00<?, ?it/s]

In [10]:
# Compare the number of source documents with the number of nodes produced by chunking.
# Note: the second printed count is len(nodes), i.e. chunks rather than documents.
print(f"Documents before chunking: {len(documents)}")
print(f"Documents after chunking: {len(nodes)}")
# Display the first node so its text, metadata and relationships can be inspected
nodes[0]

Documents before chunking: 100
Documents after chunking: 837


TextNode(id_='3fb827a4-0ef1-4d54-8979-0c726618f7ab', embedding=None, metadata={'title': 'Valkyria Chronicles III'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='a2d0c010-e2e1-446d-a35a-20203ca7dd54', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'title': 'Valkyria Chronicles III'}, hash='f7aadfb478d20e04be770cd882b5e6a44c185eb28a53810838586313c39ccc7c'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='9bcb34c8-5580-478b-8e75-81caa68613b8', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='32c32274359d6ec7e58a31e940b4b433c53354c2fb60611c7cc6bd2c324d075c')}, text='= Valkyria Chronicles III = \n \n Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role-playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in Jan

# Embedding

In [11]:
# Load embedding model
# Uses the "BAAI/bge-small-en-v1.5" model through LlamaIndex's HuggingFace integration;
# embed_batch_size=32 sets the embedding batch size.
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
embedding_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5", embed_batch_size=32)



# Indexing

LlamaIndex stores embeddings in a [VectorStoreIndex](https://docs.llamaindex.ai/en/stable/module_guides/indexing/vector_store_index/) object. This can be used with any vector store [supported](https://docs.llamaindex.ai/en/stable/module_guides/storing/vector_stores/) by LlamaIndex. By default, this is a [SimpleIndex](https://docs.llamaindex.ai/en/stable/api_reference/storage/vector_store/simple/), which is a flat index.

We can load all our chunks and embed them when creating a VectorStoreIndex:

In [12]:
from llama_index.core import VectorStoreIndex
# Passing the nodes to the constructor embeds them with `embedding_model` and builds
# the index in one step; no storage context is given, so the default store is used.
index = VectorStoreIndex(nodes, embed_model=embedding_model, show_progress=True)

Generating embeddings:   0%|          | 0/837 [00:00<?, ?it/s]

### Query the Index

In [13]:
query = "How many points did Michael Jordan actually score in his final NBA game?"
# Build a top-3 retriever on the in-memory index, then run the query through it
retriever = index.as_retriever(similarity_top_k=3)
results = retriever.retrieve(query)

# Same divider line is reused between every printed section
separator = "---" * 30
print(f"Query: {query}")
print(separator)
for i, result in enumerate(results):
    # Rank, source title and similarity score, followed by a 100-character preview
    print(f"Rank {i+1}: {result.metadata['title']} ({result.score})")
    print(result.text[:100] + "...")
    print(separator)

Query: How many points did Michael Jordan actually score in his final NBA game?
------------------------------------------------------------------------------------------
Rank 1: Michael Jordan (0.797218871120604)
With the recognition that 2002 – 03 would be Jordan 's final season , tributes were paid to him thro...
------------------------------------------------------------------------------------------
Rank 2: Michael Jordan (0.7971195354652177)
In an injury-plagued 2001 – 02 season , he led the team in scoring ( 22.9 ppg ) , assists ( 5.2 apg ...
------------------------------------------------------------------------------------------
Rank 3: Michael Jordan (0.7867602798002393)
Jordan led the league with 28.7 points per game , securing his fifth regular-season MVP award , plus...
------------------------------------------------------------------------------------------


In [14]:
%%timeit
# Time one full retrieval (retriever construction + top-3 query) on the in-memory index.
query = "How many points did Michael Jordan actually score in his final NBA game?"
results = index.as_retriever(similarity_top_k=3).retrieve(query)

68.2 ms ± 8.84 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


# Vectorstore / Vector DB

Here we use LanceDB instead of the default index used above. As described [here](https://lancedb.github.io/lancedb/ann_indexes/), LanceDB uses a disk-based IVF-PQ index. As noted on the same page, building such an index is usually only necessary when you have 100k+ samples.

In [15]:
# https://docs.llamaindex.ai/en/stable/examples/vector_stores/LanceDBIndexDemo/
from llama_index.vector_stores.lancedb import LanceDBVectorStore
from llama_index.core import StorageContext

# Create your DB locally
# The store points at table "test" under the relative folder ./lancedb.
vector_store = LanceDBVectorStore(
    uri="./lancedb", table_name="test"
)
# Link to the collection on llamaindex
# The storage context is what later hands this vector store to VectorStoreIndex.
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [16]:
# Embed and index
# Same nodes and embedding model as the in-memory index; the only difference is the
# storage_context, which supplies the LanceDB vector store configured above.
vdb_index = VectorStoreIndex(nodes, embed_model=embedding_model, storage_context=storage_context, show_progress=True)

Generating embeddings:   0%|          | 0/837 [00:00<?, ?it/s]

[2024-06-07T17:41:11Z WARN  lance::dataset] No existing dataset at /Users/akashsaravanan/Downloads/GenAI Bootcamp/genai-bootcamp/notebooks/lancedb/test.lance, it will be created


In [17]:
# Not necessary for this example but load the index from disk like so
# Re-open the same uri/table that was written above.
vector_store = LanceDBVectorStore(
    uri="./lancedb", table_name="test"
)
# No nodes are passed here: the index is built on top of the existing store.
# embed_model is still supplied — presumably to embed incoming queries (confirm).
vdb_index = VectorStoreIndex.from_vector_store(
    vector_store,
    embed_model=embedding_model,
)

### Query the Index

In [18]:
query = "How many points did Michael Jordan actually score in his final NBA game?"
# Build a top-3 retriever on the LanceDB-backed index, then run the query through it
retriever = vdb_index.as_retriever(similarity_top_k=3)
results = retriever.retrieve(query)

# Same divider line is reused between every printed section
separator = "---" * 30
print(f"Query: {query}")
print(separator)
for i, result in enumerate(results):
    # Rank, source title and similarity score, followed by a 100-character preview
    print(f"Rank {i+1}: {result.metadata['title']} ({result.score})")
    print(result.text[:100] + "...")
    print(separator)

Query: How many points did Michael Jordan actually score in his final NBA game?
------------------------------------------------------------------------------------------
Rank 1: Michael Jordan (0.6666018962860107)
With the recognition that 2002 – 03 would be Jordan 's final season , tributes were paid to him thro...
------------------------------------------------------------------------------------------
Rank 2: Michael Jordan (0.6664695143699646)
In an injury-plagued 2001 – 02 season , he led the team in scoring ( 22.9 ppg ) , assists ( 5.2 apg ...
------------------------------------------------------------------------------------------
Rank 3: Michael Jordan (0.6528033018112183)
Jordan led the league with 28.7 points per game , securing his fifth regular-season MVP award , plus...
------------------------------------------------------------------------------------------


In [19]:
%%timeit
query = "How many points did Michael Jordan actually score in his final NBA game?"
results = index.as_retriever(similarity_top_k=3).retrieve(query)

42.9 ms ± 2.49 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


### Check Index Disk Size

In [20]:
from pathlib import Path

def get_size(folder: str) -> int:
    """Return the total size in bytes of all files found under ``folder``.

    Args:
        folder: Path of the directory to measure; it is walked recursively.

    Returns:
        Sum of ``st_size`` over every file below ``folder``. A folder with no
        files (or one that does not exist) yields 0.
    """
    # rglob('*') also yields directories; their own st_size is filesystem
    # bookkeeping rather than stored data, so only files are counted.
    return sum(p.stat().st_size for p in Path(folder).rglob('*') if p.is_file())

In [25]:
# Report the on-disk size of the lancedb folder; bytes are divided by 1024**3.
# NOTE(review): the recorded 2.2392 GB looks large for 837 chunks — possibly data
# accumulated in the folder over repeated runs; confirm before quoting this number.
print(f"Index is {get_size('lancedb') / (1024 ** 3):.4f} GB")

Index is 2.2392 GB
