In [None]:
# Use ChromaDB to store and query a few embedding vectors.
# See https://docs.trychroma.com/guides
# Topics covered:
# - embeddings
# - similarity / distance functions
# - metadata
# - collections

In [None]:
import chromadb
# Set up an in-memory Chroma client, which is convenient for prototyping.
client = chromadb.Client()

# Persistence is a one-line change: create the client like this instead.
# client = chromadb.PersistentClient(path="/path/to/save/to")

In [None]:
from chromadb.utils import embedding_functions
# Chroma's default embedding function. It is called with a list of strings
# and indexed per input string in the cells below.
embed_fn = embedding_functions.DefaultEmbeddingFunction()
embed_fn  # bare last expression: show the object's repr as the cell output

In [None]:
# Three sample text chunks used throughout the notebook: two about vector
# databases / vectors and one unrelated chunk about the Moon.
inf_chunk_1 = """A vector database, vector store or vector search engine is a database that can store vectors 
(fixed-length lists of numbers) along with other data items. Vector databases typically implement one or more 
Approximate Nearest Neighbor algorithms,so that one can search the database with a query vector to retrieve the 
closest matching database records."""

inf_chunk_2 = """Vectors are mathematical representations of data in a high-dimensional space. In this space, 
each dimension corresponds to a feature of the data, with the number of dimensions ranging from a few hundred to 
tens of thousands, depending on the complexity of the data being represented. A vector's position in this space 
represents its characteristics. Words, phrases, or entire documents, as well as images, audio, and other types of data, 
can all be vectorized."""

inf_chunk_3 = """The Moon is Earth's only natural satellite. It orbits at an average distance of 384,400 km (238,900 mi), 
about 30 times the diameter of Earth. Tidal forces between Earth and the Moon have synchronized the Moon's orbital period 
(lunar month) with its rotation period (lunar day) at 29.5 Earth days, causing the same side of the Moon to always face Earth.  
"""

# Embed a single chunk and inspect what comes back, one item per output line:
# the container type, the type of one embedding, and that embedding's shape.
vector = embed_fn([inf_chunk_1])
print(type(vector), type(vector[0]), vector[0].shape, sep="\n")

In [None]:
import numpy as np

# Embed each chunk with its own call and keep the single resulting vector.
vector_1, vector_2, vector_3 = (
    embed_fn([chunk])[0]
    for chunk in (inf_chunk_1, inf_chunk_2, inf_chunk_3)
)

def cosine_similarity(v1, v2):
    """Return the cosine of the angle between two vectors.

    Computed as dot(v1, v2) / (||v1|| * ||v2||) using NumPy. No guard is
    applied for zero-length vectors, so the division is left to NumPy.
    """
    magnitude_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    return np.dot(v1, v2) / magnitude_product

# Similarity between the first two chunks, shown as the cell output.
cosine_similarity(vector_1, vector_2)

In [None]:
vectors = [vector_1, vector_2, vector_3]

# Print every ordered pair's cosine similarity, one value per line
# (row-major: all pairs for the first vector, then the second, then the third).
print(
    *(cosine_similarity(left, right) for left in vectors for right in vectors),
    sep="\n",
)

In [None]:
# (Re)create the collection so this cell works on a fresh kernel and on re-runs.
# delete_collection() raises if the collection does not exist, which is always
# the case the first time this cell runs against a new in-memory client, so the
# old collection is only deleted when it is actually present.
existing_names = {
    # Depending on the Chroma version, list_collections() yields either
    # collection names (strings) or Collection objects with a .name attribute.
    entry if isinstance(entry, str) else entry.name
    for entry in client.list_collections()
}
if "my_collection" in existing_names:
    client.delete_collection(name="my_collection")

collection = client.create_collection(
    name="my_collection",
    embedding_function=embed_fn,
    metadata={"hnsw:space": "cosine"},  # l2 is the default
)
# Fetching by name returns a handle to the collection created just above;
# kept to show how an existing collection is looked up.
collection = client.get_collection(name="my_collection", embedding_function=embed_fn)

In [None]:
import uuid

# documents, metadatas and ids are parallel lists: position i in each one
# describes the same record.
documents = [inf_chunk_1, inf_chunk_2, inf_chunk_3]
metadatas = [
    {"wikipedia": "Vector database", "url": "https://en.wikipedia.org/wiki/Vector_database"},
    {"wikipedia": "Vector database", "url": "https://en.wikipedia.org/wiki/Vector_database"},
    {"wikipedia": "Moon", "url": "https://en.wikipedia.org/wiki/Moon"},
]
# One random UUID per document, derived from the document list itself so the
# number of ids can never drift from the number of documents.
ids = [str(uuid.uuid4()) for _ in documents]

collection.add(documents=documents, metadatas=metadatas, ids=ids)

In [None]:
# Embed the question explicitly and search the collection with that vector.
# NOTE(review): "th" looks like a typo for "the"; left untouched here because
# changing the text changes the embedding and therefore the reported distances.
query = "Tell me something about th moon."
query_embeddings = embed_fn([query])

# Retrieve the two closest stored chunks.
result = collection.query(n_results=2, query_embeddings=query_embeddings)

In [None]:
# The collection uses the cosine space, so each reported distance is
# 1 - cosine_similarity (smaller means more similar).
result

In [None]:
# Same query vector, but restricted by a metadata filter: only records whose
# "wikipedia" field equals "Vector database" are eligible.
result = collection.query(
    where={"wikipedia": "Vector database"},
    n_results=2,
    query_embeddings=query_embeddings,
)

In [None]:
# Show the most recent query result as the cell output.
result