Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/changelog/137862.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 137862
summary: "[Vector Search] Fix wrong vector docvalue_fields"
area: Vector Search
type: bug
issues: []
Original file line number Diff line number Diff line change
Expand Up @@ -161,3 +161,77 @@ setup:
- close_to: { hits.hits.2.fields.vector2.0.4: { value: -100.0, error: 0.001 } }

- match: {hits.hits.2.fields.vector3.0: [-1, 100, -13, 15, -128]}

---
"Dense vector cosine docvalue_fields with sparse documents after force merge":
- requires:
cluster_features: [ "mapper.fix_dense_vector_wrong_fields" ]
reason: Support for dense vector doc value fields capability required
- do:
indices.create:
index: test_cosine
body:
mappings:
properties:
foo:
type: keyword
vec:
type: dense_vector
similarity: cosine

# Create sparse vector scenario - some docs have vectors, some don't
- do:
index:
index: test_cosine
id: "1"
body:
foo: "bar"

- do:
index:
index: test_cosine
id: "2"
body:
foo: "bar"

- do:
index:
index: test_cosine
id: "3"
body:
vec: [1, 2]

- do:
indices.refresh:
index: test_cosine

- do:
index:
index: test_cosine
id: "4"
body:
foo: "bar"

# Force merge to create the ord/docId mapping issue scenario
- do:
indices.forcemerge:
index: test_cosine
max_num_segments: 1

# Search with docvalue_fields to get the vector values
- do:
search:
index: test_cosine
body:
docvalue_fields: ["vec"]
query:
term:
_id: "3"

# Verify that _id=3's vector value is correctly returned as [1, 2]
# This test verifies that the ordToDoc fix works correctly
- match: { hits.total.value: 1 }
- match: { hits.hits.0._id: "3" }
- length: { hits.hits.0.fields.vec.0: 2 }
- close_to: { hits.hits.0.fields.vec.0.0: { value: 1.0, error: 0.001 } }
- close_to: { hits.hits.0.fields.vec.0.1: { value: 2.0, error: 0.001 } }
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ public class MapperFeatures implements FeatureSpecification {
static final NodeFeature SEARCH_LOAD_PER_SHARD = new NodeFeature("mapper.search_load_per_shard");
static final NodeFeature PATTERNED_TEXT = new NodeFeature("mapper.patterned_text");
public static final NodeFeature MULTI_FIELD_UNICODE_OPTIMISATION_FIX = new NodeFeature("mapper.multi_field.unicode_optimisation_fix");
public static final NodeFeature FIX_DENSE_VECTOR_WRONG_FIELDS = new NodeFeature("mapper.fix_dense_vector_wrong_fields");

@Override
public Set<NodeFeature> getTestFeatures() {
Expand Down Expand Up @@ -83,7 +84,8 @@ public Set<NodeFeature> getTestFeatures() {
SPARSE_VECTOR_INDEX_OPTIONS_FEATURE,
PATTERNED_TEXT,
MULTI_FIELD_UNICODE_OPTIMISATION_FIX,
MATCH_ONLY_TEXT_BLOCK_LOADER_FIX
MATCH_ONLY_TEXT_BLOCK_LOADER_FIX,
FIX_DENSE_VECTOR_WRONG_FIELDS
);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,11 @@ public VectorScorer scorer(float[] floats) throws IOException {
return in.scorer(floats);
}

@Override
public int ordToDoc(int ord) {
return in.ordToDoc(ord);
}

public float magnitude() {
return magnitude;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,10 @@

package org.elasticsearch.index.mapper.vectors;

import org.apache.lucene.index.FloatVectorValues;
import org.apache.lucene.index.KnnVectorValues;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.VectorScorer;
import org.elasticsearch.test.ESTestCase;

import java.io.IOException;
Expand Down Expand Up @@ -114,4 +116,197 @@ public long cost() {
};
}

public void testOrdToDocWithSparseVectors() throws IOException {
// This test simulates a real-world scenario where some documents don't have vector fields
// After force merge, the ord (ordinal) and docId mapping becomes crucial

// Simulate a scenario where we have 6 documents, but only documents 1, 3, 4 have vectors
// Doc 0: no vector
// Doc 1: vector [0.6, 0.8, 0.0, 0.0], magnitude 5.0 -> original [3.0, 4.0, 0.0, 0.0]
// Doc 2: no vector
// Doc 3: vector [1.0, 0.0, 0.0, 0.0], magnitude 2.0 -> original [2.0, 0.0, 0.0, 0.0]
// Doc 4: vector [0.0, 0.0, 0.6, 0.8], magnitude 10.0 -> original [0.0, 0.0, 6.0, 8.0]
// Doc 5: no vector

// After merge, the vector ordinals will be 0, 1, 2 but they correspond to docIds 1, 3, 4
int totalDocs = 6;
int[] docIdsWithVectors = { 1, 3, 4 }; // Document IDs that have vectors
int numVectors = docIdsWithVectors.length;

float[][] normalizedVectors = new float[numVectors][];
float[] magnitudes = new float[numVectors];

normalizedVectors[0] = new float[] { 0.6f, 0.8f, 0.0f, 0.0f }; // Doc 1
magnitudes[0] = 5.0f;

normalizedVectors[1] = new float[] { 1.0f, 0.0f, 0.0f, 0.0f }; // Doc 3
magnitudes[1] = 2.0f;

normalizedVectors[2] = new float[] { 0.0f, 0.0f, 0.6f, 0.8f }; // Doc 4
magnitudes[2] = 10.0f;

// Expected original vectors after denormalization
float[][] expectedVectors = new float[numVectors][];
expectedVectors[0] = new float[] { 3.0f, 4.0f, 0.0f, 0.0f }; // Doc 1
expectedVectors[1] = new float[] { 2.0f, 0.0f, 0.0f, 0.0f }; // Doc 3
expectedVectors[2] = new float[] { 0.0f, 0.0f, 6.0f, 8.0f }; // Doc 4

// Create a custom FloatVectorValues that simulates post-merge sparse vector scenario
FloatVectorValues sparseVectorValues = new FloatVectorValues() {
@Override
public int dimension() {
return 4;
}

@Override
public int size() {
return numVectors;
}

@Override
public DocIndexIterator iterator() {
return new DocIndexIterator() {
private int index = -1;

@Override
public int docID() {
return index;
}

@Override
public int index() {
return index;
}

@Override
public int nextDoc() {
return advance(index + 1);
}

@Override
public int advance(int target) {
if (target >= numVectors) return NO_MORE_DOCS;
return index = target;
}

@Override
public long cost() {
return numVectors;
}
};
}

@Override
public FloatVectorValues copy() {
throw new UnsupportedOperationException();
}

@Override
public VectorScorer scorer(float[] floats) {
throw new UnsupportedOperationException();
}

// This is the key method - it maps ordinals to actual document IDs
@Override
public int ordToDoc(int ord) {
// ord 0 -> docId 1, ord 1 -> docId 3, ord 2 -> docId 4
return docIdsWithVectors[ord];
}

@Override
public float[] vectorValue(int ord) {
return normalizedVectors[ord];
}
};

// Create magnitudes that correspond to the actual document IDs
NumericDocValues sparseMagnitudes = new NumericDocValues() {
private int docId = -1;

@Override
public long longValue() {
// Find which vector index corresponds to this docId
for (int i = 0; i < docIdsWithVectors.length; i++) {
if (docIdsWithVectors[i] == docId) {
return Float.floatToRawIntBits(magnitudes[i]);
}
}
return Float.floatToRawIntBits(1.0f); // Default magnitude
}

@Override
public boolean advanceExact(int target) {
docId = target;
// Check if this docId has a vector
for (int vectorDocId : docIdsWithVectors) {
if (vectorDocId == target) {
return true;
}
}
return false;
}

@Override
public int docID() {
return docId;
}

@Override
public int nextDoc() {
return advance(docId + 1);
}

@Override
public int advance(int target) {
for (int vectorDocId : docIdsWithVectors) {
if (vectorDocId >= target) {
docId = vectorDocId;
return docId;
}
}
return NO_MORE_DOCS;
}

@Override
public long cost() {
return totalDocs;
}
};

// Test the fixed version (with ordToDoc)
DenormalizedCosineFloatVectorValues vectorValues = new DenormalizedCosineFloatVectorValues(sparseVectorValues, sparseMagnitudes);

// Test that ordToDoc method properly maps ordinals to document IDs
KnnVectorValues.DocIndexIterator iterator = vectorValues.iterator();

for (int ord = 0; ord < numVectors; ord++) {
iterator.advance(ord);

// Verify that ordToDoc works correctly
int expectedDocId = docIdsWithVectors[ord];
int actualDocId = vectorValues.ordToDoc(ord);
assertEquals("ordToDoc should correctly map ord " + ord + " to docId " + expectedDocId, expectedDocId, actualDocId);

// Get the denormalized vector - this relies on ordToDoc working correctly
float[] actualVector = vectorValues.vectorValue(iterator.index());
float actualMagnitude = vectorValues.magnitude();

// Verify the denormalized vector is correct
assertArrayEquals(
"Vector at ord " + ord + " (docId " + expectedDocId + ") should be correctly denormalized",
expectedVectors[ord],
actualVector,
1e-6f
);

// Verify the magnitude is correct
assertEquals(
"Magnitude at ord " + ord + " (docId " + expectedDocId + ") should be correct",
magnitudes[ord],
actualMagnitude,
1e-6f
);
}
}

}