In [1]:
# Use ChromaDB for storing some vectors
# see https://docs.trychroma.com/guides
# - embedding
# - similarity / distance function
# - metadata
# - collections

In [2]:
import chromadb
# Set up an in-memory Chroma client, which is convenient for prototyping.
client = chromadb.Client()

# Persistence is a one-line change: use a persistent client with a storage path instead.
# client = chromadb.PersistentClient(path="/path/to/save/to")

In [3]:
from chromadb.utils import embedding_functions
# Instantiate Chroma's default embedding function and display it to see
# which concrete implementation backs it.
embed_fn = embedding_functions.DefaultEmbeddingFunction()
embed_fn

<chromadb.utils.embedding_functions.onnx_mini_lm_l6_v2.ONNXMiniLM_L6_V2 at 0x11f864bd0>

In [4]:
inf_chunk_1 = """A vector database, vector store or vector search engine is a database that can store vectors 
(fixed-length lists of numbers) along with other data items. Vector databases typically implement one or more 
Approximate Nearest Neighbor algorithms,so that one can search the database with a query vector to retrieve the 
closest matching database records."""

inf_chunk_2 = """Vectors are mathematical representations of data in a high-dimensional space. In this space, 
each dimension corresponds to a feature of the data, with the number of dimensions ranging from a few hundred to 
tens of thousands, depending on the complexity of the data being represented. A vector's position in this space 
represents its characteristics. Words, phrases, or entire documents, as well as images, audio, and other types of data, 
can all be vectorized."""

inf_chunk_3 = """The Moon is Earth's only natural satellite. It orbits at an average distance of 384,400 km (238,900 mi), 
about 30 times the diameter of Earth. Tidal forces between Earth and the Moon have synchronized the Moon's orbital period 
(lunar month) with its rotation period (lunar day) at 29.5 Earth days, causing the same side of the Moon to always face Earth.  
"""

# Embed a single chunk and inspect what comes back: the container type,
# the type of one embedding, and that embedding's shape.
vector = embed_fn([inf_chunk_1])
first_embedding = vector[0]
for inspected in (type(vector), type(first_embedding), first_embedding.shape):
    print(inspected)

<class 'list'>
<class 'numpy.ndarray'>
(384,)


In [8]:
import numpy as np

# Embed each chunk on its own (one single-item call per chunk) and keep
# the first, and only, embedding returned by each call.
vector_1, vector_2, vector_3 = [
    embed_fn([chunk])[0]
    for chunk in (inf_chunk_1, inf_chunk_2, inf_chunk_3)
]

def cosine_similarity(v1, v2):
    """Return the cosine similarity of two vectors.

    Computed as the dot product of ``v1`` and ``v2`` divided by the product
    of their Euclidean norms. There is no guard for zero-norm inputs: in
    that case the division is by zero.
    """
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    return np.dot(v1, v2) / norm_product


# Sanity checks on the first two embeddings, printed in this order:
# norm of each vector, their dot product, their Euclidean distance,
# and their cosine similarity.
for value in (
    np.linalg.norm(vector_1),
    np.linalg.norm(vector_2),
    np.dot(vector_1, vector_2),
    np.linalg.norm(vector_1 - vector_2),
    cosine_similarity(vector_1, vector_2),
):
    print(value)

0.99999994
0.99999994
0.68767667
0.79034597
0.6876767


In [11]:
vectors = [vector_1, vector_2, vector_3]

# Compare every ordered pair of embeddings (including each vector with itself),
# printing 1-based pair indices, cosine similarity, and Euclidean distance.
for row, left in enumerate(vectors, start=1):
    for col, right in enumerate(vectors, start=1):
        similarity = cosine_similarity(left, right)
        distance = np.linalg.norm(left - right)
        print(f"({row},{col})", similarity, distance)

(1,1) 1.0000001 0.0
(1,2) 0.6876767 0.79034597
(1,3) -0.008222253 1.4200157
(2,1) 0.6876767 0.79034597
(2,2) 1.0000001 0.0
(2,3) 0.019836674 1.4001166
(3,1) -0.008222253 1.4200157
(3,2) 0.019836674 1.4001166
(3,3) 1.0 0.0


In [13]:
# Create a collection.
# First try to remove any existing collection with the same name; if the
# deletion raises, print the error and carry on.
try:
    client.delete_collection(name="my_collection")
except Exception as err:
    print(err)

# Use cosine distance for this collection (l2 is the default).
collection = client.create_collection(
    name="my_collection",
    embedding_function=embed_fn,
    metadata={"hnsw:space": "cosine"},
)
# Look the collection up again by name, passing the same embedding function.
collection = client.get_collection(
    name="my_collection",
    embedding_function=embed_fn,
)

In [14]:
import uuid

# Documents to store, with one metadata dict per document (same order).
documents = [inf_chunk_1, inf_chunk_2, inf_chunk_3]
metadatas = [
    {"wikipedia": "Vector database", "url": "https://en.wikipedia.org/wiki/Vector_database"},
    {"wikipedia": "Vector database", "url": "https://en.wikipedia.org/wiki/Vector_database"},
    {"wikipedia": "Moon", "url": "https://en.wikipedia.org/wiki/Moon"},
]

# Generate one random UUID string per document, so the number of ids always
# matches the number of documents instead of relying on a hard-coded count.
ids = [str(uuid.uuid4()) for _ in documents]

# Add the documents together with their metadata and ids to the collection.
collection.add(
    documents=documents,
    metadatas=metadatas,
    ids=ids,
)

In [15]:
# Embed the question with the same embedding function that was passed to the
# collection, then query the collection with that embedding.
question = "Tell me something about the moon."
question_embeddings = embed_fn([question])


# Ask for the two best matches; no metadata filter is applied here.
result = collection.query(
    query_embeddings=question_embeddings,
    n_results=2,
)

In [16]:
# Note: the reported distance is 1 - cosine_similarity.
result

{'ids': [['e90cc383-7d13-460e-ab68-87e0cb25c15d',
   'cfdc6314-19cd-4ebc-ada4-8c2a7b5d4d27']],
 'embeddings': None,
 'documents': [["The Moon is Earth's only natural satellite. It orbits at an average distance of 384,400 km (238,900 mi), \nabout 30 times the diameter of Earth. Tidal forces between Earth and the Moon have synchronized the Moon's orbital period \n(lunar month) with its rotation period (lunar day) at 29.5 Earth days, causing the same side of the Moon to always face Earth.  \n",
   "Vectors are mathematical representations of data in a high-dimensional space. In this space, \neach dimension corresponds to a feature of the data, with the number of dimensions ranging from a few hundred to \ntens of thousands, depending on the complexity of the data being represented. A vector's position in this space \nrepresents its characteristics. Words, phrases, or entire documents, as well as images, audio, and other types of data, \ncan all be vectorized."]],
 'uris': None,
 'data': No

In [18]:
# Same question embedding as before, but this time with a metadata filter:
# only entries whose "wikipedia" metadata equals "Vector database" are requested.
result = collection.query(
    query_embeddings=question_embeddings,
    n_results=2,
    where={"wikipedia": "Vector database"}
)

In [19]:
result

{'ids': [['cfdc6314-19cd-4ebc-ada4-8c2a7b5d4d27',
   'f729bfe1-f519-47be-be0c-2ab44e76592b']],
 'embeddings': None,
 'documents': [["Vectors are mathematical representations of data in a high-dimensional space. In this space, \neach dimension corresponds to a feature of the data, with the number of dimensions ranging from a few hundred to \ntens of thousands, depending on the complexity of the data being represented. A vector's position in this space \nrepresents its characteristics. Words, phrases, or entire documents, as well as images, audio, and other types of data, \ncan all be vectorized.",
   'A vector database, vector store or vector search engine is a database that can store vectors \n(fixed-length lists of numbers) along with other data items. Vector databases typically implement one or more \nApproximate Nearest Neighbor algorithms,so that one can search the database with a query vector to retrieve the \nclosest matching database records.']],
 'uris': None,
 'data': None,
 '