diff --git a/docs/changelog/137862.yaml b/docs/changelog/137862.yaml new file mode 100644 index 0000000000000..10de80390ecc6 --- /dev/null +++ b/docs/changelog/137862.yaml @@ -0,0 +1,5 @@ +pr: 137862 +summary: "[Vector Search] Fix wrong vector docvalue_fields" +area: Vector Search +type: bug +issues: [] diff --git a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/200_dense_vector_docvalue_fields.yml b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/200_dense_vector_docvalue_fields.yml index 161fc23a84651..42e58061b8c9b 100644 --- a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/200_dense_vector_docvalue_fields.yml +++ b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/200_dense_vector_docvalue_fields.yml @@ -161,3 +161,77 @@ setup: - close_to: { hits.hits.2.fields.vector2.0.4: { value: -100.0, error: 0.001 } } - match: {hits.hits.2.fields.vector3.0: [-1, 100, -13, 15, -128]} + +--- +"Dense vector cosine docvalue_fields with sparse documents after force merge": + - requires: + cluster_features: [ "mapper.fix_dense_vector_wrong_fields" ] + reason: Support for dense vector doc value fields capability required + - do: + indices.create: + index: test_cosine + body: + mappings: + properties: + foo: + type: keyword + vec: + type: dense_vector + similarity: cosine + + # Create sparse vector scenario - some docs have vectors, some don't + - do: + index: + index: test_cosine + id: "1" + body: + foo: "bar" + + - do: + index: + index: test_cosine + id: "2" + body: + foo: "bar" + + - do: + index: + index: test_cosine + id: "3" + body: + vec: [1, 2] + + - do: + indices.refresh: + index: test_cosine + + - do: + index: + index: test_cosine + id: "4" + body: + foo: "bar" + + # Force merge to create the ord/docId mapping issue scenario + - do: + indices.forcemerge: + index: test_cosine + max_num_segments: 1 + + # Search with docvalue_fields to get the vector values + - do: + search: + index: test_cosine + body: + docvalue_fields: ["vec"] + query: + term: + _id: "3" + + # Verify that _id=3's vector value is correctly returned as [1, 2] + # This test verifies that the ordToDoc fix works correctly + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "3" } + - length: { hits.hits.0.fields.vec.0: 2 } + - close_to: { hits.hits.0.fields.vec.0.0: { value: 1.0, error: 0.001 } } + - close_to: { hits.hits.0.fields.vec.0.1: { value: 2.0, error: 0.001 } } diff --git a/server/src/main/java/org/elasticsearch/index/mapper/MapperFeatures.java b/server/src/main/java/org/elasticsearch/index/mapper/MapperFeatures.java index ca344c134783b..cc318a122f76b 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/MapperFeatures.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/MapperFeatures.java @@ -49,6 +49,7 @@ public class MapperFeatures implements FeatureSpecification { static final NodeFeature SEARCH_LOAD_PER_SHARD = new NodeFeature("mapper.search_load_per_shard"); static final NodeFeature PATTERNED_TEXT = new NodeFeature("mapper.patterned_text"); public static final NodeFeature MULTI_FIELD_UNICODE_OPTIMISATION_FIX = new NodeFeature("mapper.multi_field.unicode_optimisation_fix"); + public static final NodeFeature FIX_DENSE_VECTOR_WRONG_FIELDS = new NodeFeature("mapper.fix_dense_vector_wrong_fields"); @Override public Set getTestFeatures() { @@ -83,7 +84,8 @@ public Set getTestFeatures() { SPARSE_VECTOR_INDEX_OPTIONS_FEATURE, PATTERNED_TEXT, MULTI_FIELD_UNICODE_OPTIMISATION_FIX, - MATCH_ONLY_TEXT_BLOCK_LOADER_FIX + MATCH_ONLY_TEXT_BLOCK_LOADER_FIX, + FIX_DENSE_VECTOR_WRONG_FIELDS ); } } diff --git a/server/src/main/java/org/elasticsearch/index/mapper/vectors/DenormalizedCosineFloatVectorValues.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/DenormalizedCosineFloatVectorValues.java index 04069333deb13..775168e69e477 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/vectors/DenormalizedCosineFloatVectorValues.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/vectors/DenormalizedCosineFloatVectorValues.java @@ -59,6 +59,11 @@ public VectorScorer scorer(float[] floats) throws IOException { return in.scorer(floats); } + @Override + public int ordToDoc(int ord) { + return in.ordToDoc(ord); + } + public float magnitude() { return magnitude; } diff --git a/server/src/test/java/org/elasticsearch/index/mapper/vectors/DenormalizedCosineFloatVectorValuesTests.java b/server/src/test/java/org/elasticsearch/index/mapper/vectors/DenormalizedCosineFloatVectorValuesTests.java index de4ab0bc5df30..12df0bfaecaf9 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/vectors/DenormalizedCosineFloatVectorValuesTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/vectors/DenormalizedCosineFloatVectorValuesTests.java @@ -9,8 +9,10 @@ package org.elasticsearch.index.mapper.vectors; +import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.NumericDocValues; +import org.apache.lucene.search.VectorScorer; import org.elasticsearch.test.ESTestCase; import java.io.IOException; @@ -114,4 +116,197 @@ public long cost() { }; } + public void testOrdToDocWithSparseVectors() throws IOException { + // This test simulates a real-world scenario where some documents don't have vector fields + // After force merge, the ord (ordinal) and docId mapping becomes crucial + + // Simulate a scenario where we have 6 documents, but only documents 1, 3, 4 have vectors + // Doc 0: no vector + // Doc 1: vector [0.6, 0.8, 0.0, 0.0], magnitude 5.0 -> original [3.0, 4.0, 0.0, 0.0] + // Doc 2: no vector + // Doc 3: vector [1.0, 0.0, 0.0, 0.0], magnitude 2.0 -> original [2.0, 0.0, 0.0, 0.0] + // Doc 4: vector [0.0, 0.0, 0.6, 0.8], magnitude 10.0 -> original [0.0, 0.0, 6.0, 8.0] + // Doc 5: no vector + + // After merge, the vector ordinals will be 0, 1, 2 but they correspond to docIds 1, 3, 4 + int totalDocs = 6; + int[] docIdsWithVectors = { 1, 3, 4 }; // Document IDs that have vectors + int numVectors = docIdsWithVectors.length; + + float[][] normalizedVectors = new float[numVectors][]; + float[] magnitudes = new float[numVectors]; + + normalizedVectors[0] = new float[] { 0.6f, 0.8f, 0.0f, 0.0f }; // Doc 1 + magnitudes[0] = 5.0f; + + normalizedVectors[1] = new float[] { 1.0f, 0.0f, 0.0f, 0.0f }; // Doc 3 + magnitudes[1] = 2.0f; + + normalizedVectors[2] = new float[] { 0.0f, 0.0f, 0.6f, 0.8f }; // Doc 4 + magnitudes[2] = 10.0f; + + // Expected original vectors after denormalization + float[][] expectedVectors = new float[numVectors][]; + expectedVectors[0] = new float[] { 3.0f, 4.0f, 0.0f, 0.0f }; // Doc 1 + expectedVectors[1] = new float[] { 2.0f, 0.0f, 0.0f, 0.0f }; // Doc 3 + expectedVectors[2] = new float[] { 0.0f, 0.0f, 6.0f, 8.0f }; // Doc 4 + + // Create a custom FloatVectorValues that simulates post-merge sparse vector scenario + FloatVectorValues sparseVectorValues = new FloatVectorValues() { + @Override + public int dimension() { + return 4; + } + + @Override + public int size() { + return numVectors; + } + + @Override + public DocIndexIterator iterator() { + return new DocIndexIterator() { + private int index = -1; + + @Override + public int docID() { + return index; + } + + @Override + public int index() { + return index; + } + + @Override + public int nextDoc() { + return advance(index + 1); + } + + @Override + public int advance(int target) { + if (target >= numVectors) return NO_MORE_DOCS; + return index = target; + } + + @Override + public long cost() { + return numVectors; + } + }; + } + + @Override + public FloatVectorValues copy() { + throw new UnsupportedOperationException(); + } + + @Override + public VectorScorer scorer(float[] floats) { + throw new UnsupportedOperationException(); + } + + // This is the key method - it maps ordinals to actual document IDs + @Override + public int ordToDoc(int ord) { + // ord 0 -> docId 1, ord 1 -> docId 3, ord 2 -> docId 4 + return docIdsWithVectors[ord]; + } + + @Override + public float[] vectorValue(int ord) { + return normalizedVectors[ord]; + } + }; + + // Create magnitudes that correspond to the actual document IDs + NumericDocValues sparseMagnitudes = new NumericDocValues() { + private int docId = -1; + + @Override + public long longValue() { + // Find which vector index corresponds to this docId + for (int i = 0; i < docIdsWithVectors.length; i++) { + if (docIdsWithVectors[i] == docId) { + return Float.floatToRawIntBits(magnitudes[i]); + } + } + return Float.floatToRawIntBits(1.0f); // Default magnitude + } + + @Override + public boolean advanceExact(int target) { + docId = target; + // Check if this docId has a vector + for (int vectorDocId : docIdsWithVectors) { + if (vectorDocId == target) { + return true; + } + } + return false; + } + + @Override + public int docID() { + return docId; + } + + @Override + public int nextDoc() { + return advance(docId + 1); + } + + @Override + public int advance(int target) { + for (int vectorDocId : docIdsWithVectors) { + if (vectorDocId >= target) { + docId = vectorDocId; + return docId; + } + } + return NO_MORE_DOCS; + } + + @Override + public long cost() { + return totalDocs; + } + }; + + // Test the fixed version (with ordToDoc) + DenormalizedCosineFloatVectorValues vectorValues = new DenormalizedCosineFloatVectorValues(sparseVectorValues, sparseMagnitudes); + + // Test that ordToDoc method properly maps ordinals to document IDs + KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); + + for (int ord = 0; ord < numVectors; ord++) { + iterator.advance(ord); + + // Verify that ordToDoc works correctly + int expectedDocId = docIdsWithVectors[ord]; + int actualDocId = vectorValues.ordToDoc(ord); + assertEquals("ordToDoc should correctly map ord " + ord + " to docId " + expectedDocId, expectedDocId, actualDocId); + + // Get the denormalized vector - this relies on ordToDoc working correctly + float[] actualVector = vectorValues.vectorValue(iterator.index()); + float actualMagnitude = vectorValues.magnitude(); + + // Verify the denormalized vector is correct + assertArrayEquals( + "Vector at ord " + ord + " (docId " + expectedDocId + ") should be correctly denormalized", + expectedVectors[ord], + actualVector, + 1e-6f + ); + + // Verify the magnitude is correct + assertEquals( + "Magnitude at ord " + ord + " (docId " + expectedDocId + ") should be correct", + magnitudes[ord], + actualMagnitude, + 1e-6f + ); + } + } + }