In [1]:
import tqdm
import numpy as np

# API Setup

In [2]:
from dotenv import load_dotenv
# Load environment variables from the .env file one directory up; the call's
# return value is left as the last expression so it shows as the cell output.
env_file = "../.env"
load_dotenv(dotenv_path=env_file)

True

# Dataset

In [3]:
from datasets import load_dataset
# Fetch the document-level WikiText dataset ("wikitext-103-raw-v1" config)
# with remote dataset code explicitly disabled.
data = load_dataset(
    path="EleutherAI/wikitext_document_level",
    name="wikitext-103-raw-v1",
    trust_remote_code=False,
)

Repo card metadata block was not found. Setting CardData to empty.


In [4]:
# Seed NumPy's global RNG.
np.random.seed(42)

data_index = 38
sample_size = 100

# Keep only the first `sample_size` rows of the "train" split.
# NOTE(review): this rebinds `data` to the sliced result, so re-running this
# cell presumably requires re-running the dataset load cell first — confirm.
data = data["train"][:sample_size]

# Preprocessing

In [5]:
# Data cleaning
import re
# Matches one character wrapped as " @x@ " (presumably WikiText's escaped
# in-word punctuation such as " @-@ " — verify against the dataset).
pattern = re.compile(r" @(.)@ ")

# Replace each wrapped character with the bare character on every page,
# writing the cleaned pages back into the same list.
data["page"][:] = [pattern.sub(r"\1", text) for text in data["page"]]

In [6]:
# Data enrichment
# Matches a heading of the form "= Title =" (title of 1-50 non-"=" characters).
# The heading may sit at the very start or very end of the text, where there is
# no surrounding whitespace to anchor on.
_TITLE_PATTERN = re.compile(r"(?:^|\s)=\s([^=]{1,50})\s=(?:\s|$)")


def extract_metadata(data):
    """Build the metadata dict for one page from its first heading.

    Args:
        data: Text of a single page.

    Returns:
        A dict ``{"title": <str>}``. The title is the text of the first
        "= ... =" heading found, or "Unknown Title" when there is none.
    """
    # The regex isn't perfect (sub-headings can match too), so only the first
    # match is used as the title; search() stops there instead of scanning the
    # whole page.
    first_heading = _TITLE_PATTERN.search(data)
    title = first_heading.group(1) if first_heading else "Unknown Title"
    return {"title": title}

In [7]:
# Load documents
from llama_index.core import Document

# One Document per page: the page text plus the metadata extracted from that
# same text. tqdm wraps the page list to show progress.
documents = [
    Document(text=page_text, metadata=extract_metadata(page_text))
    for page_text in tqdm.tqdm(data["page"])
]

100%|██████████| 100/100 [00:00<00:00, 3882.14it/s]


# Chunking

In [8]:
from llama_index.core.node_parser import SentenceSplitter
# Splitter configured with chunk_size=512 and chunk_overlap=20.
chunker = SentenceSplitter(
    chunk_size=512,
    chunk_overlap=20,
)

In [9]:
# Split every loaded document into nodes, with a progress bar while parsing.
nodes = chunker.get_nodes_from_documents(
    documents,
    show_progress=True,
)

Parsing nodes:   0%|          | 0/100 [00:00<?, ?it/s]

In [10]:
# Report the document count and the node count produced by chunking, one per
# line, then display the first node as the cell output.
print(
    f"Documents before chunking: {len(documents)}",
    f"Documents after chunking: {len(nodes)}",
    sep="\n",
)
nodes[0]

Documents before chunking: 100
Documents after chunking: 837


TextNode(id_='aa6fdd1d-6853-4929-8670-2ad0a5e88755', embedding=None, metadata={'title': 'Valkyria Chronicles III'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='adfe13ec-a653-44a6-9623-9f6ae10355db', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'title': 'Valkyria Chronicles III'}, hash='f7aadfb478d20e04be770cd882b5e6a44c185eb28a53810838586313c39ccc7c'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='8acad8ab-5180-4bbf-9ed6-ef904ba91e72', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='32c32274359d6ec7e58a31e940b4b433c53354c2fb60611c7cc6bd2c324d075c')}, text='= Valkyria Chronicles III = \n \n Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role-playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in Jan

# Embedding

For our embedding model we use [BAAI/bge-small-en-v1.5](https://huggingface.co/BAAI/bge-small-en-v1.5). This MIT-licensed model ranks 44th on the [MTEB Retrieval leaderboard](https://huggingface.co/spaces/mteb/leaderboard) as of June 1st, 2024. While better models are available (as seen on the leaderboard), we choose this one for the demo because it is very small (33M parameters / ~120MB) and hence very fast. The generated embeddings are 384-dimensional.

Embedding is usually performed while indexing, so we compute embeddings over our dataset in the next notebook.

In [11]:
# Load embedding model
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
# Instantiate the BAAI/bge-small-en-v1.5 embedding model with embed_batch_size=32.
embedding_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5",
    embed_batch_size=32,
)



In [12]:
# Example of how to get an embedding and what it looks like.
# Embed the first node's text once and reuse the vector for both the shape
# printout and the 20-value preview, rather than running the model twice.
example_embedding = np.array(embedding_model.get_text_embedding(nodes[0].text))
print(example_embedding.shape)
example_embedding[:20]

(384,)


array([-0.02077559, -0.03510703, -0.03307754,  0.02809924,  0.00985101,
       -0.03071311, -0.0626043 ,  0.03460966, -0.0053006 ,  0.00643425,
       -0.03491058, -0.01346424,  0.00973109,  0.01440061,  0.07204092,
       -0.05149863,  0.00747196, -0.01969269, -0.03814155,  0.00673186])