In [1]:
import chromadb
from chromadb.api.types import Embeddings, Metadata
from chromadb.api.types import Documents, EmbeddingFunction, Embeddings
from chromadb.config import Settings

In [3]:
client = chromadb.Client(
    Settings(chroma_db_impl="postgres", postgres_username="postgres", 
                                postgres_password="postgres", 
                                postgres_hostname="localhost", 
                                postgres_port="5445", 
                                postgres_databasename="postgres")
    )

# Everything that works with the client below should work with the above client
# client = chromadb.Client()

In [4]:
collection = client.create_collection("all-my-documents", get_or_create=True)
collection2 = client.create_collection("empty-collection", get_or_create=True)
collection3 = client.get_or_create_collection("extra-documents")

In [4]:
# list all collections
print(client.list_collections())
# get an existing collection
print(client.get_collection("all-my-documents"))

[Collection(name=all-my-documents), Collection(name=empty-collection), Collection(name=extra-documents)]
name='all-my-documents' id=UUID('00724d6b-1ec3-4bd2-abf2-ced99ee43503') metadata=None


In [4]:
print(collection.count())
print(collection2.count())

2
0


In [5]:
# delete a collection that doesn't exist
try:
    client.delete_collection("dne-connection")
except ValueError:
    print("Collection doesn't exist and throws exception as expected")

Collection doesn't exist and throws exception as expected


In [9]:
# delete a collection that exists
client.delete_collection("empty-collection")

ValueError: Collection empty-collection does not exist

In [10]:
# collection.modify(name="updated-documents", metadata={"test": "metadata-test-success"})

InFailedSqlTransaction: current transaction is aborted, commands ignored until end of transaction block


In [5]:
# Add docs to the collection. Can also update and delete. Row-based API coming soon!
collection.add(
    documents=["This is document1", "This is document2"], # we handle tokenization, embedding, and indexing automatically. You can skip that and add your own embeddings as well
    metadatas=[{"source": "notion"}, {"source": "google-docs"}], # filter on these!
    ids=["doc1", "doc2"], # unique for each doc
)

IDAlreadyExistsError: IDs ['doc1', 'doc2'] already exist in collection 2a48e5df-f275-4915-b97f-82805e8c2536

In [5]:
collection.count()

2

In [6]:
# upsert items. new items will be added, existing items will be updated.
collection.upsert(
    ids=["doc1", "doc2", "doc3"],
    metadatas=[{"chapter": "3", "verse": "16"}, {"chapter": "3", "verse": "5"}, {"chapter": "29", "verse": "11"}],
    documents=["doc1", "doc2", "doc3"],
)

IDAlreadyExistsError: IDs ['doc1', 'doc2'] already exist in collection 2a48e5df-f275-4915-b97f-82805e8c2536

In [5]:
from pypika import Query, Table

pg_embeddings_table = Table("embeddings711")
query = Query.from_(pg_embeddings_table).select("*").limit(10).orderby("%s", "%s").where(pg_embeddings_table.uuid.isin([1,2,3]))

In [18]:
import re
pg_embeddings_table = Table("embeddings711")
query = (
    Query.from_(pg_embeddings_table)
    .select("*")
    .limit(10)
    .where(pg_embeddings_table.uuid.isin(['12341231-12341234-123412343-32342341234123']))
)
embeddings = [[1,2,3], [4,5,6]]
if embeddings is not None:
    for embedding in embeddings:
        query = query.orderby(
            f"embedding <=> '{embedding}'"
        )

print(query)
split_query = re.split('ORDER BY|LIMIT', str(query))
split_query[1] = split_query[1].replace('"', '').replace('embedding', '"embedding"')
query = split_query[0] + 'ORDER BY' + split_query[1] + 'LIMIT' + split_query[2]
print(query)


SELECT * FROM "embeddings711" WHERE "uuid" IN ('12341231-12341234-123412343-32342341234123') ORDER BY "embedding <=> '[1, 2, 3]'","embedding <=> '[4, 5, 6]'" LIMIT 10
SELECT * FROM "embeddings711" WHERE "uuid" IN ('12341231-12341234-123412343-32342341234123') ORDER BY "embedding" <=> '[1, 2, 3]',"embedding" <=> '[4, 5, 6]' LIMIT 10


In [8]:
import regex as re

text = 'SELECT * FROM table_name ORDER BY column_name, "field1", "field2" LIMIT 10'
pattern = r'(?<=ORDER BY.*?)"(.*?)(?=".*?LIMIT)'

substitution = '-'

result = re.sub(pattern, substitution, text)
print(result)

SELECT * FROM table_name ORDER BY column_name, ---" LIMIT 10
