Skip to content

Commit

Permalink
Add autocut feature for trimming search results
Browse files Browse the repository at this point in the history
  • Loading branch information
Carlo Nery de Lima Moro committed Mar 9, 2024
1 parent 1ecc67c commit 0825ea6
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 4 deletions.
33 changes: 32 additions & 1 deletion minivectordb/vector_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,30 @@ def hybrid_rerank_results(self, sentences, search_scores, query, k=5, weights=(0
# Trim results to requested k
return sentences[:k], combined_scores[:k]

def find_most_similar(self, embedding, metadata_filter={}, exclude_filter=None, or_filters=None, k=5):
def autocut_scores(self, score_list):
    """Determine which trailing results to drop based on a sharp score drop.

    Scans consecutive pairs of scores and computes the relative decrease
    between each pair. If the largest relative decrease exceeds 20%, every
    index after that drop is considered noise and its index is returned
    for removal.

    Inspired by Weaviate's autocut ("golden ragtriever") feature. This is
    a basic implementation and can be improved.

    Args:
        score_list: Sequence of numeric scores, assumed to be ordered from
            best to worst (descending) — TODO confirm against callers.

    Returns:
        list[int]: Indices of the scores that would be removed if cut at
        the largest drop, or an empty list when no drop exceeds 20%.
    """
    # Fewer than two scores: no pairwise decrease exists, nothing to cut.
    # (Previously max() raised ValueError on the empty decrease list.)
    if len(score_list) < 2:
        return []

    # Relative (percentage) decrease between each consecutive pair.
    # Guard the denominator: a zero score followed by another entry
    # previously raised ZeroDivisionError; treat that as "no decrease".
    score_decreases = []
    for i in range(1, len(score_list)):
        previous = score_list[i - 1]
        drop = (previous - score_list[i]) / previous if previous else 0.0
        score_decreases.append(drop)

    # Find the single sharpest decrease.
    max_score_decrease = max(score_decreases)

    if max_score_decrease > 0.2:
        # Cut at the sharpest drop: every index after it is removed.
        cut_index = score_decreases.index(max_score_decrease)
        return list(range(cut_index + 1, len(score_list)))

    return []

def find_most_similar(self, embedding, metadata_filter={}, exclude_filter=None, or_filters=None, k=5, autocut=False):
""" or_filters could be a list of dictionaries, where each dictionary contains key-value pairs for OR filters.
or it could be a single dictionary, which will be equivalent to a list with a single dictionary."""
embedding = self._convert_ndarray_float32(embedding)
Expand Down Expand Up @@ -236,6 +259,14 @@ def find_most_similar(self, embedding, metadata_filter={}, exclude_filter=None,
# Unzip the results into separate lists
ids, distances, metadatas = zip(*found_results) if found_results else ([], [], [])

if autocut:
# Remove results that are not within 20% of the best result
remove_indexes = self.autocut_scores(distances)
if remove_indexes:
ids = [ids[i] for i in range(len(ids)) if i not in remove_indexes]
distances = [distances[i] for i in range(len(distances)) if i not in remove_indexes]
metadatas = [metadatas[i] for i in range(len(metadatas)) if i not in remove_indexes]

return ids, distances, metadatas

def persist_to_disk(self):
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "minivectordb"
version = "1.2.2"
version = "1.3.0"
authors = [
{ name="Carlo Moro", email="cnmoro@gmail.com" },
]
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@

setup(
name='minivectordb',
version='1.2.2',
version='1.3.0',
author='Carlo Moro',
author_email='cnmoro@gmail.com',
description="This is a Python project aimed at extracting embeddings from textual data and performing semantic search.",
Expand Down
24 changes: 23 additions & 1 deletion tests/test_vector_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,8 @@ def test_similarity_search_with_hybrid_reranking():
sentences = [
(1, 'i like animals'),
(2, 'i like cars'),
(3, 'i like programming')
(3, 'i like programming'),
(4, 'technology is the future')
]

for id, sentence in sentences:
Expand All @@ -272,6 +273,27 @@ def test_similarity_search_with_hybrid_reranking():
assert 1 in ids
assert 2 in ids

# Now, try to find the 4 best matches, but using the autocut parameter
query = "technology rocks"
query_embedding = model.extract_embeddings(query)
ids, distances, _ = db.find_most_similar(query_embedding, k=4, autocut=True)

# Assert that only the 4th sentence is returned
assert len(ids) == 1
assert ids[0] == 4

# Now test the autocut again, but in a case where no sentence is ignored
query = "animals, cars, programming, technology"
query_embedding = model.extract_embeddings(query)
ids, distances, _ = db.find_most_similar(query_embedding, k=4, autocut=True)

# Assert that all sentences are returned
assert len(ids) == 4
assert 1 in ids
assert 2 in ids
assert 3 in ids
assert 4 in ids

def test_unique_id_validation():
db = VectorDatabase()

Expand Down

0 comments on commit 0825ea6

Please sign in to comment.