Skip to content

Commit

Permalink
Add autocut feature for trimming search results
Browse files Browse the repository at this point in the history
  • Loading branch information
Carlo Nery de Lima Moro committed Mar 9, 2024
1 parent 1ecc67c commit 0825ea6
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 4 deletions.
33 changes: 32 additions & 1 deletion minivectordb/vector_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,30 @@ def hybrid_rerank_results(self, sentences, search_scores, query, k=5, weights=(0
# Trim results to requested k
return sentences[:k], combined_scores[:k]

def find_most_similar(self, embedding, metadata_filter={}, exclude_filter=None, or_filters=None, k=5):
def autocut_scores(self, score_list):
    """Determine which trailing results to drop based on a sharp score drop.

    Scans consecutive pairs of scores and computes the relative decrease
    between each pair. If the largest relative decrease exceeds 20%, every
    index after that drop is considered noise and its index is returned
    for removal.

    Inspired by Weaviate's autocut ("golden ragtriever") feature. This is
    a basic implementation and can be improved.

    Args:
        score_list: Sequence of numeric scores, assumed to be ordered from
            best to worst (descending) — TODO confirm against callers.

    Returns:
        list[int]: Indices of the scores that would be removed if cut at
        the largest drop, or an empty list when no drop exceeds 20%.
    """
    # Fewer than two scores: no pairwise decrease exists, nothing to cut.
    # (Previously max() raised ValueError on the empty decrease list.)
    if len(score_list) < 2:
        return []

    # Relative (percentage) decrease between each consecutive pair.
    # Guard the denominator: a zero score followed by another entry
    # previously raised ZeroDivisionError; treat that as "no decrease".
    score_decreases = []
    for i in range(1, len(score_list)):
        previous = score_list[i - 1]
        drop = (previous - score_list[i]) / previous if previous else 0.0
        score_decreases.append(drop)

    # Find the single sharpest decrease.
    max_score_decrease = max(score_decreases)

    if max_score_decrease > 0.2:
        # Cut at the sharpest drop: every index after it is removed.
        cut_index = score_decreases.index(max_score_decrease)
        return list(range(cut_index + 1, len(score_list)))

    return []

def find_most_similar(self, embedding, metadata_filter={}, exclude_filter=None, or_filters=None, k=5, autocut=False):
""" or_filters could be a list of dictionaries, where each dictionary contains key-value pairs for OR filters.
or it could be a single dictionary, which will be equivalent to a list with a single dictionary."""
embedding = self._convert_ndarray_float32(embedding)
Expand Down Expand Up @@ -236,6 +259,14 @@ def find_most_similar(self, embedding, metadata_filter={}, exclude_filter=None,
# Unzip the results into separate lists
ids, distances, metadatas = zip(*found_results) if found_results else ([], [], [])

if autocut:
# Remove results that are not within 20% of the best result
remove_indexes = self.autocut_scores(distances)
if remove_indexes:
ids = [ids[i] for i in range(len(ids)) if i not in remove_indexes]
distances = [distances[i] for i in range(len(distances)) if i not in remove_indexes]
metadatas = [metadatas[i] for i in range(len(metadatas)) if i not in remove_indexes]

return ids, distances, metadatas

def persist_to_disk(self):
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "minivectordb"
version = "1.2.2"
version = "1.3.0"
authors = [
{ name="Carlo Moro", email="cnmoro@gmail.com" },
]
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@

setup(
name='minivectordb',
version='1.2.2',
version='1.3.0',
author='Carlo Moro',
author_email='cnmoro@gmail.com',
description="This is a Python project aimed at extracting embeddings from textual data and performing semantic search.",
Expand Down
24 changes: 23 additions & 1 deletion tests/test_vector_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,8 @@ def test_similarity_search_with_hybrid_reranking():
sentences = [
(1, 'i like animals'),
(2, 'i like cars'),
(3, 'i like programming')
(3, 'i like programming'),
(4, 'technology is the future')
]

for id, sentence in sentences:
Expand All @@ -272,6 +273,27 @@ def test_similarity_search_with_hybrid_reranking():
assert 1 in ids
assert 2 in ids

# Now, try to find the 4 best matches, but using the autocut parameter
query = "technology rocks"
query_embedding = model.extract_embeddings(query)
ids, distances, _ = db.find_most_similar(query_embedding, k=4, autocut=True)

# Assert that only the 4th sentence is returned
assert len(ids) == 1
assert ids[0] == 4

# Now test the autocut again, but in a case where no sentence is ignored
query = "animals, cars, programming, technology"
query_embedding = model.extract_embeddings(query)
ids, distances, _ = db.find_most_similar(query_embedding, k=4, autocut=True)

# Assert that all sentences are returned
assert len(ids) == 4
assert 1 in ids
assert 2 in ids
assert 3 in ids
assert 4 in ids

def test_unique_id_validation():
db = VectorDatabase()

Expand Down

0 comments on commit 0825ea6

Please sign in to comment.