# Experimenting with ChromaDB to store vectors locally

While Pinecone is great and all, for smaller experiments in non-production settings, it can be a better idea to store data locally in ChromaDB

In [1]:
import chromadb
from openai import OpenAI
from dotenv import load_dotenv
import os
import numpy as np
import uuid

In [2]:
load_dotenv()

True

In [3]:
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

In [4]:
texts = [
    "Hello, this is my first text that I am embedding and inserting into Chroma",
    "And what do you know, this is already my second text that I am embedding and inserting into Chroma",
    "But I will have to stop at my third text though",
    "Fourth for sure, no more"
]

query = "Chroma"

### Create client and database collection

In [5]:
PERSISTENCE_PATH = "db"
COLLECTION_NAME = "collection"

In [6]:
client = chromadb.PersistentClient(
    path = PERSISTENCE_PATH
)

In [7]:
if COLLECTION_NAME in [c.name for c in client.list_collections()]:
    client.delete_collection(COLLECTION_NAME)
    

collection = client.create_collection(
        name = COLLECTION_NAME
    )

### Embed and store in collection

In [8]:
llm = OpenAI(
    api_key = OPENAI_API_KEY
)

In [9]:
response = llm.embeddings.create(
    model="text-embedding-ada-002",
    input=texts
)

embeddings = np.array([item.embedding for item in response.data])

In [10]:
data = [
    {
        "file_type": "LMAO",
        "file_name": "ROFL",
        "marker": np.random.randint(10),
        "sub_marker": np.random.randint(10),
        "first_10_tokens": text,
        "text": text,
        "embedding": embedding
    }
    for embedding, text in zip(embeddings, texts)
]

In [11]:
data

[{'file_type': 'LMAO',
  'file_name': 'ROFL',
  'marker': 8,
  'sub_marker': 2,
  'first_10_tokens': 'Hello, this is my first text that I am embedding and inserting into Chroma',
  'text': 'Hello, this is my first text that I am embedding and inserting into Chroma',
  'embedding': array([-0.01671784,  0.01247183, -0.03452713, ..., -0.02206861,
         -0.02406517, -0.01104762], shape=(1536,))},
 {'file_type': 'LMAO',
  'file_name': 'ROFL',
  'marker': 9,
  'sub_marker': 0,
  'first_10_tokens': 'And what do you know, this is already my second text that I am embedding and inserting into Chroma',
  'text': 'And what do you know, this is already my second text that I am embedding and inserting into Chroma',
  'embedding': array([-0.01114542,  0.00661219, -0.01784064, ..., -0.02272921,
         -0.0260901 , -0.01760152], shape=(1536,))},
 {'file_type': 'LMAO',
  'file_name': 'ROFL',
  'marker': 0,
  'sub_marker': 3,
  'first_10_tokens': 'But I will have to stop at my third text though',
  

In [12]:
def _prepare_data_for_upsert(data):
    """
    Prepares data in the format required by Chroma:
    - id: unique ID
    - embedding: list of floats
    - metadata: dictionary with file info
    - document/text: the actual text content
    """
    prepared_data = []
    for item in data:
        embedding = list(item['embedding'])  # ensure list format
        metadata = {
            "file_type": item['file_type'],
            "file_name": item['file_name'],
            "marker": str(item['marker']),
            "sub_marker": str(item['sub_marker']),
            "first_10_tokens": item['first_10_tokens'],
            "text": item["text"]
        }
        prepared_data.append({
            "id": str(uuid.uuid4()),
            "embedding": embedding,
            "metadata": metadata,
        })
    return prepared_data

In [13]:
prepared_data = _prepare_data_for_upsert(data)
prepared_data

[{'id': '800a3905-5a7c-448f-b12d-a8902ec1d2ef',
  'embedding': [np.float64(-0.016717838123440742),
   np.float64(0.012471827678382397),
   np.float64(-0.03452713042497635),
   np.float64(-0.018075497820973396),
   np.float64(0.007992884144186974),
   np.float64(0.029974978417158127),
   np.float64(-0.013669761829078197),
   np.float64(-0.015453352592885494),
   np.float64(-0.010501890443265438),
   np.float64(0.00413952860981226),
   np.float64(0.011094203218817711),
   np.float64(0.014095693826675415),
   np.float64(-0.02471737749874592),
   np.float64(0.00781984906643629),
   np.float64(0.025010205805301666),
   np.float64(-0.00851864367723465),
   np.float64(0.01113413367420435),
   np.float64(-0.00033026383607648313),
   np.float64(0.0036603547632694244),
   np.float64(-0.012505102902650833),
   np.float64(-0.02543613873422146),
   np.float64(0.0029549046885222197),
   np.float64(0.02116350643336773),
   np.float64(0.005740101914852858),
   np.float64(-0.020644402131438255),
   np.

In [14]:
def store(client, collection, data):
    """
    Stores a list of documents into Chroma.
    """
    prepared_data = _prepare_data_for_upsert(data)
    collection.add(
        ids=[d["id"] for d in prepared_data],
        embeddings=[d["embedding"] for d in prepared_data],
        metadatas=[d["metadata"] for d in prepared_data],
    )
    return

In [15]:
store(client, collection, data)

In [16]:
def query_top_k(collection, query_embedding, k):
    """
    Queries the top K nearest neighbors for a given embedding.
    """
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=k,
        include=["metadatas", "distances"]
    )
    docs = [
        {
            "metadata": metadata,
            "score": 1 - distance
        } for metadata, distance in zip(results["metadatas"][0], results["distances"][0])
    ]
    return docs

In [17]:
response = llm.embeddings.create(
    model="text-embedding-ada-002",
    input=query
)
query_embedding = response.data[0].embedding

In [18]:
print(query_top_k(collection, query_embedding, 2))

[{'metadata': {'text': 'Hello, this is my first text that I am embedding and inserting into Chroma', 'marker': '8', 'sub_marker': '2', 'file_name': 'ROFL', 'file_type': 'LMAO', 'first_10_tokens': 'Hello, this is my first text that I am embedding and inserting into Chroma'}, 'score': 0.6748298406600952}, {'metadata': {'sub_marker': '0', 'marker': '9', 'text': 'And what do you know, this is already my second text that I am embedding and inserting into Chroma', 'first_10_tokens': 'And what do you know, this is already my second text that I am embedding and inserting into Chroma', 'file_type': 'LMAO', 'file_name': 'ROFL'}, 'score': 0.6341855823993683}]
