From cdb588b7cc5d05dd093d1e39ccb5d1f1f4efaa2a Mon Sep 17 00:00:00 2001 From: Hammad Bashir Date: Thu, 10 Aug 2023 14:59:12 -0700 Subject: [PATCH] [PERF] Make the index correctly use FTS (#958) ## Description of changes Previously we were not using the FTS (full-text search) index correctly. The FTS5 query syntax (https://sqlite.org/fts5.html#full_text_query_syntax) expects that you query using the name of the FTS table, not the name of a column. If you want to query by column name, you have to use column filters as discussed in the link above. We opt to take the path suggested here https://sqlite.org/forum/forumpost/1d45a7f6e17a3460 and match on id in addition to filtering that specific column. The query planner leverages this appropriately, as confirmed with EXPLAIN. Because we issue speculative delete queries on the assumption that the index is used, and it was not, the write path was incredibly slow. It is now much faster. EXPLAIN before: ```-- SCAN VIRTUAL TABLE INDEX 0:``` -> Full table scan. EXPLAIN after: ``` -- SCAN VIRTUAL TABLE INDEX 0:M2 ``` -> Scans the index itself. The net effect of this is a large increase in write speed, and the write path time no longer grows with table size. ### Quick Benchmark Results N = 100k uniformly random vectors D = 128 Metadata = one small key: value pair Document = randomly generated string of length 100 Added with batch size = 1000 **Without Fix, Overall Time = 469s. Time to add a batch grows linearly to >8000 ms** Screenshot 2023-08-09 at 5 53 24 PM **With Fix, Overall Time = 102s. Time to add a batch grows sublinearly to ~1200 ms** Screenshot 2023-08-09 at 5 43 12 PM We will also want to make sure that the read path leverages this way of querying. We will address that in a follow-up PR. ## Test plan Existing tests cover the scope of this change. ## Documentation Changes None required. 
--- chromadb/segment/impl/metadata/sqlite.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/chromadb/segment/impl/metadata/sqlite.py b/chromadb/segment/impl/metadata/sqlite.py index 93027d091f9..3f770869fcc 100644 --- a/chromadb/segment/impl/metadata/sqlite.py +++ b/chromadb/segment/impl/metadata/sqlite.py @@ -22,7 +22,7 @@ WhereOperator, ) from uuid import UUID -from pypika import Table, Tables +from pypika import Table, Tables, Field from pypika.queries import QueryBuilder import pypika.functions as fn from pypika.terms import Criterion, Function @@ -140,8 +140,6 @@ def get_metadata( q = q.where( self._where_doc_criterion(q, where_document, embeddings_t, fulltext_t) ) - pass - # q = self._where_document_query(q, where_document, embeddings_t, fulltext_t) if ids: q = q.where(embeddings_t.embedding_id.isin(ParameterValue(ids))) @@ -247,6 +245,7 @@ def _update_metadata(self, cur: Cursor, id: int, metadata: UpdateMetadata) -> No self._db.querybuilder() .from_(t) .where(t.id == ParameterValue(id)) + .where(Field(t.get_table_name()) == ParameterValue(id)) .delete() ) sql, params = get_sql(q)