# Vector Embedding

The purpose of this worksheet is to gain familiarity with vector embeddings and
to build an intuition for vector space.
We use chromadb, a lightweight vector database, for these examples.


In [1]:
#initial setup
import sys
import os

# Put the repository root (two levels above the notebook's working directory)
# at the front of sys.path so that the `src` package can be imported below.
project_root = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir, os.pardir)
)
if project_root not in sys.path:
    sys.path.insert(0, project_root)

from src.utils import OPENAI_API_KEY

In [2]:
"""
For this worksheet we have created a wrapper around chromadb to keep the examples simple.
TODO: take a look at the source (src/chroma_db) to see how it is implemented
"""

from typing import List
from src.chroma_db import VectorCollection, OpenAIEmbeddingModel, get_chromadb_client, remove_collection

# Name handed to get_chromadb_client(); presumably it scopes the data stored
# for this worksheet — confirm against src/chroma_db.
SCHEMA_NAME = "worksheet_vector_embedding"
# Client handle reused by the later cells when creating and removing collections.
chroma_client = get_chromadb_client(SCHEMA_NAME)

def load_example_text()->str:
    """Return the full contents of ``example_document.txt`` as one string.

    The path is relative, so the file is looked up in the current working
    directory. The file is decoded explicitly as UTF-8 instead of relying on
    the platform's default encoding, so the same text is loaded on every
    machine (assumes the example document is stored as UTF-8).

    Raises:
        FileNotFoundError: if the file is not present in the working directory.
        UnicodeDecodeError: if the file content is not valid UTF-8.
    """
    with open("example_document.txt", "r", encoding="utf-8") as file:
        return file.read()


In [None]:
"""
TODO: implement a chunking approach of your choosing in this cell and run the embedding process.
This chunker will then be used to populate a vector collection.
In the subsequent cells we provide tools to analyse the collection.
"""

def chunk_text(text: str, chunk_size: int = 100)->List[str]:
    """Split *text* into consecutive fixed-length chunks.

    Every newline and carriage-return character is first replaced by a single
    space; the result is then cut every ``chunk_size`` characters, with no
    regard for word or sentence boundaries. The final chunk may be shorter.

    Args:
        text: the document to split.
        chunk_size: maximum number of characters per chunk (default 100,
            matching the original placeholder behaviour).

    Returns:
        The chunks in document order; an empty list for empty input.

    Raises:
        ValueError: if ``chunk_size`` is not a positive integer.
    """
    # TODO: replace the chunking function with one of your choosing - starting with a simple fixed-length chunker as a placeholder
    if chunk_size <= 0:
        # A negative step would silently yield no chunks at all; fail loudly instead.
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
    clean_text = text.replace("\n", " ").replace("\r", " ")
    return [
        clean_text[start:start + chunk_size]
        for start in range(0, len(clean_text), chunk_size)
    ]

# Chunk the example document, dropping any empty chunks in the same pass
# (a user-supplied chunker may produce them).
chunks = [piece for piece in chunk_text(load_example_text()) if piece]

print(f"split example into {len(chunks)} chunks")

# Show every chunk with its position so the chunk boundaries can be inspected.
for index, chunk in enumerate(chunks):
    print(f"chunk {index}: {chunk}")

In [None]:
"""
TODO: run cell -> to populate vector store with the chunks created in the cell above
"""

# Name of the collection used in this worksheet: passed to remove_collection()
# and to the VectorCollection constructor in this cell.
COLLECTION_USER_EXPERIMENTAL_NAME = "user_experimental"


def drop_and_populate_collection(collection: VectorCollection, chunks: List[str], chroma_client)->None:
    """Remove the experimental collection, then add every chunk to *collection*.

    The collection removed is always the one named
    COLLECTION_USER_EXPERIMENTAL_NAME, regardless of which collection object is
    passed in. Chunks are added one at a time under the ids ``id_0``, ``id_1``,
    ... following their position in *chunks*; progress is printed per chunk.

    NOTE(review): the removal runs after *collection* has already been
    constructed by the caller. Whether the wrapper still refers to a live
    collection afterwards depends on VectorCollection internals that are not
    visible here — confirm.
    """
    remove_collection(chroma_client, COLLECTION_USER_EXPERIMENTAL_NAME)

    print(f"going to embed {len(chunks)} chunks")

    for position, text in enumerate(chunks):
        print(f"adding chunk {position}")
        collection.add_item(text, f"id_{position}")

# Wrapper for the experimental collection, configured with the OpenAI API key
# loaded during setup and the ADA_002 embedding model.
collection_user_experimental = VectorCollection(
    COLLECTION_USER_EXPERIMENTAL_NAME,
    chroma_client,
    OPENAI_API_KEY,
    embedding_model=OpenAIEmbeddingModel.ADA_002,
)

# Start from a clean slate, then add the chunks produced by the chunking cell.
drop_and_populate_collection(
    collection_user_experimental,
    chunks,
    chroma_client,
)


In [None]:
"""
TODO: explore your vector database with different search terms, tweak your chunking function until you are happy with the retrieval
"""
# Natural-language query to look up in the collection; edit this to experiment.
search_term = "what are the typical foods eaten in belgium"

# Query the collection for matches to the search term, requesting n_results=5.
closest_items = collection_user_experimental.similar_items(search_term, n_results=5)

# Each result exposes .distance, .id and .text. Presumably a smaller distance
# means a closer match — confirm against the wrapper's distance metric.
for item in closest_items:
    print(f"distance_to_search_term: {item.distance}, id: {item.id} ,item: {item.text}")




In [None]:
# TODO (Optional)
# visualize the vector database contents