# Vector Embedding

The purpose of this worksheet is to gain familiarity with vector embeddings and
to build an intuition about vector space.
We make use of ChromaDB, a lightweight vector database, for these examples.


In [1]:
#initial setup
import sys
import os

# Put the directory two levels above the working directory on the import
# path so that the `src` package can be imported from this notebook.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)

from src.utils import OPENAI_API_KEY

In [3]:
"""
For this worksheet we have created a wrapper around chromadb to simplify the worksheet
TODO: take a look at the source to see how it is implemented
"""

from typing import List
from src.chroma_db import VectorCollection, OpenAIEmbeddingModel, get_chromadb_client, remove_collection

# Name handed to get_chromadb_client for this worksheet.
# NOTE(review): presumably selects where this worksheet's data is stored - confirm in src.chroma_db
SCHEMA_NAME = "worksheet_vector_embedding"
# Client object passed to the collection helpers in the cells below.
chroma_client = get_chromadb_client(SCHEMA_NAME)

def load_example_text(path: str = "example_document.txt")->str:
    """Read the worksheet's example document and return its full contents.

    Args:
        path: File to read. Defaults to ``example_document.txt``, which is
            resolved against the current working directory.

    Returns:
        The whole file as a single string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # Decode explicitly so the result does not depend on the platform's
    # default locale encoding.
    # NOTE(review): assumes the example document is stored as UTF-8 - confirm.
    with open(path, "r", encoding="utf-8") as file:
        return file.read()


In [4]:
"""
TODO: implement a chunking approach of your choosing in this cell and run the embedding process.
This chunker will then be used to populate a vector collection.
In the subsequent cells we provide tools to analyse the collection.
"""

def chunk_text(text: str, chunk_size: int = 100)->List[str]:
    """Split ``text`` into consecutive fixed-length chunks.

    Every newline and carriage-return character is first replaced by a single
    space, then the cleaned text is cut into slices of ``chunk_size``
    characters. The final chunk may be shorter. Chunks are cut purely by
    character count, so words and sentences can be split across chunks.

    Args:
        text: The document to split.
        chunk_size: Number of characters per chunk. Defaults to 100.

    Returns:
        The chunks in document order; an empty list for empty input.

    Raises:
        ValueError: If ``chunk_size`` is not a positive integer.
    """
    # TODO: replace the chunking function with one of your choosing - starting with a simple fixed length chunker as place holder
    if chunk_size <= 0:
        # A negative step would otherwise yield no chunks at all, silently.
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    clean_text = text.replace("\n", " ").replace("\r", " ")
    return [
        clean_text[start:start + chunk_size]
        for start in range(0, len(clean_text), chunk_size)
    ]

# chunk the example document, discarding any empty chunks
chunks = [chunk for chunk in chunk_text(load_example_text()) if chunk]

print(f"split example into {len(chunks)} chunks")

# list every chunk next to its position so the split points can be inspected
for index, chunk in enumerate(chunks):
    print(f"chunk {index}: {chunk}")

split example into 50 chunks
chunk 0: Alex arrived in Belgium with a sense of nervous excitement. The journey from the airport was a seaml
chunk 1: ess transition, a small comfort after the long flight. The train station beneath Brussels Airport wa
chunk 2: s a bustling hub of modern efficiency, a stark contrast to the medieval cities he was soon to explor
chunk 3: e. He found his platform and soon, the intercity train was gliding him past a landscape of neat bric
chunk 4: k houses and manicured fields, a prelude to the organized charm he was beginning to expect.  His fir
chunk 5: st afternoon in Brussels was a full-sensory experience. After settling into his guesthouse, he let t
chunk 6: he city's sounds guide him. The murmur of multilingual conversations and the distant chiming of bell
chunk 7: s led him through narrow, winding cobblestone streets. When he emerged into the Grand-Place, the air
chunk 8:  seemed to hum with history. The intricate golden facades of the guildhalls and th

In [5]:
"""
TODO: run this cell to define the helper that populates the vector store with the chunks created in the cell above (the helper is called in the next cell)
"""

# Name of the collection that is populated from the worksheet's chunks in the cells below.
COLLECTION_USER_EXPERIMENTAL_NAME = "user_experimental"

def drop_and_populate_collection(collection_name: str, documents: List[str],
                                 chroma_client) -> VectorCollection:
    """Recreate a collection from scratch and add every document to it.

    ``remove_collection`` is called for ``collection_name`` first, then a new
    ``VectorCollection`` is created under that same name and each document is
    added one at a time with the id ``id_<index>``.

    Args:
        collection_name: Name of the collection to remove and recreate.
        documents: Text chunks to add, in order.
        chroma_client: Client object passed through to ``remove_collection``
            and ``VectorCollection``.

    Returns:
        The newly populated ``VectorCollection``.
    """
    remove_collection(chroma_client, collection_name)

    # Create the collection under the name that was just removed. Using the
    # module-level constant here would ignore the caller's collection_name.
    collection = VectorCollection(
        collection_name,
        chroma_client,
        OPENAI_API_KEY,  #type: ignore
        embedding_model=OpenAIEmbeddingModel.ADA_002)

    print(f"going to embed {len(documents)} documents")
    for index, document in enumerate(documents):
        print(f"adding document {index}")
        # The positional index doubles as the item id.
        collection.add_item(document, f"id_{index}")

    return collection

In [6]:
"""
TODO: explore your vector database with different search terms, tweak your chunking function until you are happy with the retrieval
"""


# rebuild the experimental collection from the current chunks
collection_user_experimental = drop_and_populate_collection(
    COLLECTION_USER_EXPERIMENTAL_NAME,
    chunks,
    chroma_client,
)

# query the collection, asking for the 5 most similar items
search_term = "what are the typical foods eaten in belgium"
closest_items = collection_user_experimental.similar_items(search_term, n_results=5)

# show each hit with its distance so retrieval quality can be judged
for item in closest_items:
    print(f"distance_to_search_term: {item.distance}, id: {item.id} ,item: {item.text}")




Collection user_experimental does not exist nothing to remove
going to embed 50 documents
adding document 0
adding document 1
adding document 2
adding document 3
adding document 4
adding document 5
adding document 6
adding document 7
adding document 8
adding document 9
adding document 10
adding document 11
adding document 12
adding document 13
adding document 14
adding document 15
adding document 16
adding document 17
adding document 18
adding document 19
adding document 20
adding document 21
adding document 22
adding document 23
adding document 24
adding document 25
adding document 26
adding document 27
adding document 28
adding document 29
adding document 30
adding document 31
adding document 32
adding document 33
adding document 34
adding document 35
adding document 36
adding document 37
adding document 38
adding document 39
adding document 40
adding document 41
adding document 42
adding document 43
adding document 44
adding document 45
adding document 46
adding document 47
adding d

In [None]:
# TODO (Optional)
# visualize the vector database contents