From 51874eb0ba2d26e71d752944cc15689f6cb657f8 Mon Sep 17 00:00:00 2001 From: weizijun Date: Wed, 19 Nov 2025 23:57:29 +0800 Subject: [PATCH 1/2] [9.1][Vector Search] BugFix wrong vector docvalue_fields --- docs/changelog/137862.yaml | 5 + .../200_dense_vector_docvalue_fields.yml | 174 ++++++++++++++++ .../index/mapper/MapperFeatures.java | 4 +- .../DenormalizedCosineFloatVectorValues.java | 5 + ...ormalizedCosineFloatVectorValuesTests.java | 195 ++++++++++++++++++ 5 files changed, 382 insertions(+), 1 deletion(-) create mode 100644 docs/changelog/137862.yaml diff --git a/docs/changelog/137862.yaml b/docs/changelog/137862.yaml new file mode 100644 index 0000000000000..10de80390ecc6 --- /dev/null +++ b/docs/changelog/137862.yaml @@ -0,0 +1,5 @@ +pr: 137862 +summary: "[Vector Search] Fix wrong vector docvalue_fields" +area: Vector Search +type: bug +issues: [] diff --git a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/200_dense_vector_docvalue_fields.yml b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/200_dense_vector_docvalue_fields.yml index 161fc23a84651..7a98f35005137 100644 --- a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/200_dense_vector_docvalue_fields.yml +++ b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/200_dense_vector_docvalue_fields.yml @@ -161,3 +161,177 @@ setup: - close_to: { hits.hits.2.fields.vector2.0.4: { value: -100.0, error: 0.001 } } - match: {hits.hits.2.fields.vector3.0: [-1, 100, -13, 15, -128]} + +--- +"Dense vector cosine docvalue_fields with sparse documents after force merge": + - requires: + cluster_features: [ "mapper.fix_dense_vector_wrong_fields" ] + reason: Support for dense vector doc value fields capability required + - do: + indices.create: + index: test_cosine + body: + mappings: + properties: + foo: + type: keyword + vec: + type: dense_vector + similarity: cosine + + # Create sparse vector scenario - some docs have vectors, some don't + - do: + index: + index: test_cosine + id: "1" + body: + foo: "bar" + + - do: + index: + index: test_cosine + id: "2" + body: + foo: "bar" + + - do: + index: + index: test_cosine + id: "3" + body: + vec: [1, 2] + + - do: + indices.refresh: + index: test_cosine + + - do: + index: + index: test_cosine + id: "4" + body: + foo: "bar" + + # Force merge to create the ord/docId mapping issue scenario + - do: + indices.forcemerge: + index: test_cosine + max_num_segments: 1 + + # Search with docvalue_fields to get the vector values + - do: + search: + index: test_cosine + body: + docvalue_fields: ["vec"] + query: + term: + _id: "3" + + # Verify that _id=3's vector value is correctly returned as [1, 2] + # This test verifies that the ordToDoc fix works correctly + - match: { hits.total.value: 1 } + - match: { hits.hits.0._id: "3" } + - length: { hits.hits.0.fields.vec.0: 2 } + - close_to: { hits.hits.0.fields.vec.0.0: { value: 1.0, error: 0.001 } } + - close_to: { hits.hits.0.fields.vec.0.1: { value: 2.0, error: 0.001 } } + +--- +"dense_vector docvalues with bfloat16": + - requires: + cluster_features: [ "mapper.vectors.generic_vector_format" ] + reason: Needs generic vector support + - do: + indices.create: + index: test-bfloat16 + body: + mappings: + properties: + name: + type: keyword + vector7: + type: dense_vector + element_type: bfloat16 + dims: 5 + index: true + vector8: + type: dense_vector + element_type: bfloat16 + dims: 5 + index: false + + - do: + index: + index: test-bfloat16 + id: "1" + body: + name: cow.jpg + vector7: [230.0, 300.33, -34.8988, 15.555, -200.0] + vector8: [130.0, 115.0, -1.02, 15.555, -100.0] + - do: + index: + index: test-bfloat16 + id: "2" + body: + name: moose.jpg + vector7: [-0.5, 100.0, -13, 14.8, -156.0] + - do: + index: + index: test-bfloat16 + id: "3" + body: + name: rabbit.jpg + vector8: [130.0, 115.0, -1.02, 15.555, -100.0] + + - do: + indices.refresh: {} + + - do: + search: + _source: false + index: test-bfloat16 + body: + docvalue_fields: [name, vector7, vector8] + sort: name + + - match: {hits.hits.0._id: "1"} + - match: {hits.hits.0.fields.name.0: "cow.jpg"} + + - length: {hits.hits.0.fields.vector7.0: 5} + - length: {hits.hits.0.fields.vector8.0: 5} + + - close_to: { hits.hits.0.fields.vector7.0.0: { value: 230.0, error: 0.1 } } + - close_to: { hits.hits.0.fields.vector7.0.1: { value: 300.33, error: 0.1 } } + - close_to: { hits.hits.0.fields.vector7.0.2: { value: -34.8988, error: 0.1 } } + - close_to: { hits.hits.0.fields.vector7.0.3: { value: 15.555, error: 0.1 } } + - close_to: { hits.hits.0.fields.vector7.0.4: { value: -200.0, error: 0.1 } } + + - close_to: { hits.hits.0.fields.vector8.0.0: { value: 130.0, error: 0.1 } } + - close_to: { hits.hits.0.fields.vector8.0.1: { value: 115.0, error: 0.1 } } + - close_to: { hits.hits.0.fields.vector8.0.2: { value: -1.02, error: 0.1 } } + - close_to: { hits.hits.0.fields.vector8.0.3: { value: 15.555, error: 0.1 } } + - close_to: { hits.hits.0.fields.vector8.0.4: { value: -100.0, error: 0.1 } } + + - match: {hits.hits.1._id: "2"} + - match: {hits.hits.1.fields.name.0: "moose.jpg"} + + - length: {hits.hits.1.fields.vector7.0: 5} + - match: {hits.hits.1.fields.vector8: null} + + - close_to: { hits.hits.1.fields.vector7.0.0: { value: -0.5, error: 0.1 } } + - close_to: { hits.hits.1.fields.vector7.0.1: { value: 100.0, error: 0.1 } } + - close_to: { hits.hits.1.fields.vector7.0.2: { value: -13, error: 0.1 } } + - close_to: { hits.hits.1.fields.vector7.0.3: { value: 14.8, error: 0.1 } } + - close_to: { hits.hits.1.fields.vector7.0.4: { value: -156.0, error: 0.1 } } + + - match: {hits.hits.2._id: "3"} + - match: {hits.hits.2.fields.name.0: "rabbit.jpg"} + + - length: {hits.hits.2.fields.vector8.0: 5} + - match: {hits.hits.2.fields.vector7: null} + + - close_to: { hits.hits.2.fields.vector8.0.0: { value: 130.0, error: 0.1 } } + - close_to: { hits.hits.2.fields.vector8.0.1: { value: 115.0, error: 0.1 } } + - close_to: { hits.hits.2.fields.vector8.0.2: { value: -1.02, error: 0.1 } } + - close_to: { hits.hits.2.fields.vector8.0.3: { value: 15.555, error: 0.1 } } + - close_to: { hits.hits.2.fields.vector8.0.4: { value: -100.0, error: 0.1 } } diff --git a/server/src/main/java/org/elasticsearch/index/mapper/MapperFeatures.java b/server/src/main/java/org/elasticsearch/index/mapper/MapperFeatures.java index ca344c134783b..cc318a122f76b 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/MapperFeatures.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/MapperFeatures.java @@ -49,6 +49,7 @@ public class MapperFeatures implements FeatureSpecification { static final NodeFeature SEARCH_LOAD_PER_SHARD = new NodeFeature("mapper.search_load_per_shard"); static final NodeFeature PATTERNED_TEXT = new NodeFeature("mapper.patterned_text"); public static final NodeFeature MULTI_FIELD_UNICODE_OPTIMISATION_FIX = new NodeFeature("mapper.multi_field.unicode_optimisation_fix"); + public static final NodeFeature FIX_DENSE_VECTOR_WRONG_FIELDS = new NodeFeature("mapper.fix_dense_vector_wrong_fields"); @Override public Set getTestFeatures() { @@ -83,7 +84,8 @@ public Set getTestFeatures() { SPARSE_VECTOR_INDEX_OPTIONS_FEATURE, PATTERNED_TEXT, MULTI_FIELD_UNICODE_OPTIMISATION_FIX, - MATCH_ONLY_TEXT_BLOCK_LOADER_FIX + MATCH_ONLY_TEXT_BLOCK_LOADER_FIX, + FIX_DENSE_VECTOR_WRONG_FIELDS ); } } diff --git a/server/src/main/java/org/elasticsearch/index/mapper/vectors/DenormalizedCosineFloatVectorValues.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/DenormalizedCosineFloatVectorValues.java index 04069333deb13..775168e69e477 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/vectors/DenormalizedCosineFloatVectorValues.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/vectors/DenormalizedCosineFloatVectorValues.java @@ -59,6 +59,11 @@ public VectorScorer scorer(float[] floats) throws IOException { return in.scorer(floats); } + @Override + public int ordToDoc(int ord) { + return in.ordToDoc(ord); + } + public float magnitude() { return magnitude; } diff --git a/server/src/test/java/org/elasticsearch/index/mapper/vectors/DenormalizedCosineFloatVectorValuesTests.java b/server/src/test/java/org/elasticsearch/index/mapper/vectors/DenormalizedCosineFloatVectorValuesTests.java index de4ab0bc5df30..12df0bfaecaf9 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/vectors/DenormalizedCosineFloatVectorValuesTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/vectors/DenormalizedCosineFloatVectorValuesTests.java @@ -9,8 +9,10 @@ package org.elasticsearch.index.mapper.vectors; +import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.NumericDocValues; +import org.apache.lucene.search.VectorScorer; import org.elasticsearch.test.ESTestCase; import java.io.IOException; @@ -114,4 +116,197 @@ public long cost() { }; } + public void testOrdToDocWithSparseVectors() throws IOException { + // This test simulates a real-world scenario where some documents don't have vector fields + // After force merge, the ord (ordinal) and docId mapping becomes crucial + + // Simulate a scenario where we have 6 documents, but only documents 1, 3, 4 have vectors + // Doc 0: no vector + // Doc 1: vector [0.6, 0.8, 0.0, 0.0], magnitude 5.0 -> original [3.0, 4.0, 0.0, 0.0] + // Doc 2: no vector + // Doc 3: vector [1.0, 0.0, 0.0, 0.0], magnitude 2.0 -> original [2.0, 0.0, 0.0, 0.0] + // Doc 4: vector [0.0, 0.0, 0.6, 0.8], magnitude 10.0 -> original [0.0, 0.0, 6.0, 8.0] + // Doc 5: no vector + + // After merge, the vector ordinals will be 0, 1, 2 but they correspond to docIds 1, 3, 4 + int totalDocs = 6; + int[] docIdsWithVectors = { 1, 3, 4 }; // Document IDs that have vectors + int numVectors = docIdsWithVectors.length; + + float[][] normalizedVectors = new float[numVectors][]; + float[] magnitudes = new float[numVectors]; + + normalizedVectors[0] = new float[] { 0.6f, 0.8f, 0.0f, 0.0f }; // Doc 1 + magnitudes[0] = 5.0f; + + normalizedVectors[1] = new float[] { 1.0f, 0.0f, 0.0f, 0.0f }; // Doc 3 + magnitudes[1] = 2.0f; + + normalizedVectors[2] = new float[] { 0.0f, 0.0f, 0.6f, 0.8f }; // Doc 4 + magnitudes[2] = 10.0f; + + // Expected original vectors after denormalization + float[][] expectedVectors = new float[numVectors][]; + expectedVectors[0] = new float[] { 3.0f, 4.0f, 0.0f, 0.0f }; // Doc 1 + expectedVectors[1] = new float[] { 2.0f, 0.0f, 0.0f, 0.0f }; // Doc 3 + expectedVectors[2] = new float[] { 0.0f, 0.0f, 6.0f, 8.0f }; // Doc 4 + + // Create a custom FloatVectorValues that simulates post-merge sparse vector scenario + FloatVectorValues sparseVectorValues = new FloatVectorValues() { + @Override + public int dimension() { + return 4; + } + + @Override + public int size() { + return numVectors; + } + + @Override + public DocIndexIterator iterator() { + return new DocIndexIterator() { + private int index = -1; + + @Override + public int docID() { + return index; + } + + @Override + public int index() { + return index; + } + + @Override + public int nextDoc() { + return advance(index + 1); + } + + @Override + public int advance(int target) { + if (target >= numVectors) return NO_MORE_DOCS; + return index = target; + } + + @Override + public long cost() { + return numVectors; + } + }; + } + + @Override + public FloatVectorValues copy() { + throw new UnsupportedOperationException(); + } + + @Override + public VectorScorer scorer(float[] floats) { + throw new UnsupportedOperationException(); + } + + // This is the key method - it maps ordinals to actual document IDs + @Override + public int ordToDoc(int ord) { + // ord 0 -> docId 1, ord 1 -> docId 3, ord 2 -> docId 4 + return docIdsWithVectors[ord]; + } + + @Override + public float[] vectorValue(int ord) { + return normalizedVectors[ord]; + } + }; + + // Create magnitudes that correspond to the actual document IDs + NumericDocValues sparseMagnitudes = new NumericDocValues() { + private int docId = -1; + + @Override + public long longValue() { + // Find which vector index corresponds to this docId + for (int i = 0; i < docIdsWithVectors.length; i++) { + if (docIdsWithVectors[i] == docId) { + return Float.floatToRawIntBits(magnitudes[i]); + } + } + return Float.floatToRawIntBits(1.0f); // Default magnitude + } + + @Override + public boolean advanceExact(int target) { + docId = target; + // Check if this docId has a vector + for (int vectorDocId : docIdsWithVectors) { + if (vectorDocId == target) { + return true; + } + } + return false; + } + + @Override + public int docID() { + return docId; + } + + @Override + public int nextDoc() { + return advance(docId + 1); + } + + @Override + public int advance(int target) { + for (int vectorDocId : docIdsWithVectors) { + if (vectorDocId >= target) { + docId = vectorDocId; + return docId; + } + } + return NO_MORE_DOCS; + } + + @Override + public long cost() { + return totalDocs; + } + }; + + // Test the fixed version (with ordToDoc) + DenormalizedCosineFloatVectorValues vectorValues = new DenormalizedCosineFloatVectorValues(sparseVectorValues, sparseMagnitudes); + + // Test that ordToDoc method properly maps ordinals to document IDs + KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator(); + + for (int ord = 0; ord < numVectors; ord++) { + iterator.advance(ord); + + // Verify that ordToDoc works correctly + int expectedDocId = docIdsWithVectors[ord]; + int actualDocId = vectorValues.ordToDoc(ord); + assertEquals("ordToDoc should correctly map ord " + ord + " to docId " + expectedDocId, expectedDocId, actualDocId); + + // Get the denormalized vector - this relies on ordToDoc working correctly + float[] actualVector = vectorValues.vectorValue(iterator.index()); + float actualMagnitude = vectorValues.magnitude(); + + // Verify the denormalized vector is correct + assertArrayEquals( + "Vector at ord " + ord + " (docId " + expectedDocId + ") should be correctly denormalized", + expectedVectors[ord], + actualVector, + 1e-6f + ); + + // Verify the magnitude is correct + assertEquals( + "Magnitude at ord " + ord + " (docId " + expectedDocId + ") should be correct", + magnitudes[ord], + actualMagnitude, + 1e-6f + ); + } + } + } From 6f7101c89826a82e779bf63089eff72da2596d94 Mon Sep 17 00:00:00 2001 From: Ignacio Vera Date: Thu, 20 Nov 2025 08:58:12 +0100 Subject: [PATCH 2/2] fix backport --- .../200_dense_vector_docvalue_fields.yml | 100 ------------------ 1 file changed, 100 deletions(-) diff --git a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/200_dense_vector_docvalue_fields.yml b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/200_dense_vector_docvalue_fields.yml index 7a98f35005137..42e58061b8c9b 100644 --- a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/200_dense_vector_docvalue_fields.yml +++ b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/200_dense_vector_docvalue_fields.yml @@ -235,103 +235,3 @@ setup: - length: { hits.hits.0.fields.vec.0: 2 } - close_to: { hits.hits.0.fields.vec.0.0: { value: 1.0, error: 0.001 } } - close_to: { hits.hits.0.fields.vec.0.1: { value: 2.0, error: 0.001 } } - ---- -"dense_vector docvalues with bfloat16": - - requires: - cluster_features: [ "mapper.vectors.generic_vector_format" ] - reason: Needs generic vector support - - do: - indices.create: - index: test-bfloat16 - body: - mappings: - properties: - name: - type: keyword - vector7: - type: dense_vector - element_type: bfloat16 - dims: 5 - index: true - vector8: - type: dense_vector - element_type: bfloat16 - dims: 5 - index: false - - - do: - index: - index: test-bfloat16 - id: "1" - body: - name: cow.jpg - vector7: [230.0, 300.33, -34.8988, 15.555, -200.0] - vector8: [130.0, 115.0, -1.02, 15.555, -100.0] - - do: - index: - index: test-bfloat16 - id: "2" - body: - name: moose.jpg - vector7: [-0.5, 100.0, -13, 14.8, -156.0] - - do: - index: - index: test-bfloat16 - id: "3" - body: - name: rabbit.jpg - vector8: [130.0, 115.0, -1.02, 15.555, -100.0] - - - do: - indices.refresh: {} - - - do: - search: - _source: false - index: test-bfloat16 - body: - docvalue_fields: [name, vector7, vector8] - sort: name - - - match: {hits.hits.0._id: "1"} - - match: {hits.hits.0.fields.name.0: "cow.jpg"} - - - length: {hits.hits.0.fields.vector7.0: 5} - - length: {hits.hits.0.fields.vector8.0: 5} - - - close_to: { hits.hits.0.fields.vector7.0.0: { value: 230.0, error: 0.1 } } - - close_to: { hits.hits.0.fields.vector7.0.1: { value: 300.33, error: 0.1 } } - - close_to: { hits.hits.0.fields.vector7.0.2: { value: -34.8988, error: 0.1 } } - - close_to: { hits.hits.0.fields.vector7.0.3: { value: 15.555, error: 0.1 } } - - close_to: { hits.hits.0.fields.vector7.0.4: { value: -200.0, error: 0.1 } } - - - close_to: { hits.hits.0.fields.vector8.0.0: { value: 130.0, error: 0.1 } } - - close_to: { hits.hits.0.fields.vector8.0.1: { value: 115.0, error: 0.1 } } - - close_to: { hits.hits.0.fields.vector8.0.2: { value: -1.02, error: 0.1 } } - - close_to: { hits.hits.0.fields.vector8.0.3: { value: 15.555, error: 0.1 } } - - close_to: { hits.hits.0.fields.vector8.0.4: { value: -100.0, error: 0.1 } } - - - match: {hits.hits.1._id: "2"} - - match: {hits.hits.1.fields.name.0: "moose.jpg"} - - - length: {hits.hits.1.fields.vector7.0: 5} - - match: {hits.hits.1.fields.vector8: null} - - - close_to: { hits.hits.1.fields.vector7.0.0: { value: -0.5, error: 0.1 } } - - close_to: { hits.hits.1.fields.vector7.0.1: { value: 100.0, error: 0.1 } } - - close_to: { hits.hits.1.fields.vector7.0.2: { value: -13, error: 0.1 } } - - close_to: { hits.hits.1.fields.vector7.0.3: { value: 14.8, error: 0.1 } } - - close_to: { hits.hits.1.fields.vector7.0.4: { value: -156.0, error: 0.1 } } - - - match: {hits.hits.2._id: "3"} - - match: {hits.hits.2.fields.name.0: "rabbit.jpg"} - - - length: {hits.hits.2.fields.vector8.0: 5} - - match: {hits.hits.2.fields.vector7: null} - - - close_to: { hits.hits.2.fields.vector8.0.0: { value: 130.0, error: 0.1 } } - - close_to: { hits.hits.2.fields.vector8.0.1: { value: 115.0, error: 0.1 } } - - close_to: { hits.hits.2.fields.vector8.0.2: { value: -1.02, error: 0.1 } } - - close_to: { hits.hits.2.fields.vector8.0.3: { value: 15.555, error: 0.1 } } - - close_to: { hits.hits.2.fields.vector8.0.4: { value: -100.0, error: 0.1 } }