From 021ae419bd77e8e16a36b8db42549d11b0073bbb Mon Sep 17 00:00:00 2001 From: Benjamin Trent Date: Tue, 17 Jun 2025 17:39:19 -0400 Subject: [PATCH 1/2] Fix NPE in flat_bbq scorer when all vectors are missing (#129548) It is possible to get all the way down to the knn format reader and there be no vectors in the index. This execution path is possible if utilizing nested queries (which bypasses the higher level checks in `KnnFloatVectorQuery#approximateSearch`). bbq_flat should check for the existence of vectors before attempting to create the scorer. (cherry picked from commit 80667d0c8a80c88e900cac6f2006f0471ee6f234) --- docs/changelog/129548.yaml | 5 ++ .../es816/ES816BinaryFlatVectorsScorer.java | 3 + .../ES816BinaryQuantizedVectorsReader.java | 2 +- .../es818/ES818BinaryFlatVectorsScorer.java | 3 + .../ES818BinaryQuantizedVectorsReader.java | 2 +- ...S816BinaryQuantizedVectorsFormatTests.java | 65 +++++++++++++++++++ ...S818BinaryQuantizedVectorsFormatTests.java | 65 +++++++++++++++++++ 7 files changed, 143 insertions(+), 2 deletions(-) create mode 100644 docs/changelog/129548.yaml diff --git a/docs/changelog/129548.yaml b/docs/changelog/129548.yaml new file mode 100644 index 0000000000000..cb5b95faef1a7 --- /dev/null +++ b/docs/changelog/129548.yaml @@ -0,0 +1,5 @@ +pr: 129548 +summary: Fix NPE in `flat_bbq` scorer when all vectors are missing +area: Vector Search +type: bug +issues: [] diff --git a/server/src/main/java/org/elasticsearch/index/codec/vectors/es816/ES816BinaryFlatVectorsScorer.java b/server/src/main/java/org/elasticsearch/index/codec/vectors/es816/ES816BinaryFlatVectorsScorer.java index e85079e998c61..daea6358ff3d5 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/vectors/es816/ES816BinaryFlatVectorsScorer.java +++ b/server/src/main/java/org/elasticsearch/index/codec/vectors/es816/ES816BinaryFlatVectorsScorer.java @@ -59,6 +59,9 @@ public RandomVectorScorer getRandomVectorScorer( float[] target ) throws IOException { if (vectorValues 
instanceof BinarizedByteVectorValues binarizedVectors) { + assert binarizedVectors.getQuantizer() != null + : "BinarizedByteVectorValues must have a quantizer for ES816BinaryFlatVectorsScorer"; + assert binarizedVectors.size() > 0 : "BinarizedByteVectorValues must have at least one vector for ES816BinaryFlatVectorsScorer"; BinaryQuantizer quantizer = binarizedVectors.getQuantizer(); float[] centroid = binarizedVectors.getCentroid(); // FIXME: precompute this once? diff --git a/server/src/main/java/org/elasticsearch/index/codec/vectors/es816/ES816BinaryQuantizedVectorsReader.java b/server/src/main/java/org/elasticsearch/index/codec/vectors/es816/ES816BinaryQuantizedVectorsReader.java index fc20809ea7eed..8e0d9141d5771 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/vectors/es816/ES816BinaryQuantizedVectorsReader.java +++ b/server/src/main/java/org/elasticsearch/index/codec/vectors/es816/ES816BinaryQuantizedVectorsReader.java @@ -155,7 +155,7 @@ static void validateFieldEntry(FieldInfo info, FieldEntry fieldEntry) { @Override public RandomVectorScorer getRandomVectorScorer(String field, float[] target) throws IOException { FieldEntry fi = fields.get(field); - if (fi == null) { + if (fi == null || fi.size() == 0) { return null; } return vectorScorer.getRandomVectorScorer( diff --git a/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryFlatVectorsScorer.java b/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryFlatVectorsScorer.java index 7c7e470909eb3..b1af8f3f43d88 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryFlatVectorsScorer.java +++ b/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryFlatVectorsScorer.java @@ -64,6 +64,9 @@ public RandomVectorScorer getRandomVectorScorer( float[] target ) throws IOException { if (vectorValues instanceof BinarizedByteVectorValues binarizedVectors) { + assert binarizedVectors.getQuantizer() != null 
+ : "BinarizedByteVectorValues must have a quantizer for ES818BinaryFlatVectorsScorer"; + assert binarizedVectors.size() > 0 : "BinarizedByteVectorValues must have at least one vector for ES818BinaryFlatVectorsScorer"; OptimizedScalarQuantizer quantizer = binarizedVectors.getQuantizer(); float[] centroid = binarizedVectors.getCentroid(); // We make a copy as the quantization process mutates the input diff --git a/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryQuantizedVectorsReader.java b/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryQuantizedVectorsReader.java index 8036b8314cdc1..633d0ed0b1571 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryQuantizedVectorsReader.java +++ b/server/src/main/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryQuantizedVectorsReader.java @@ -154,7 +154,7 @@ static void validateFieldEntry(FieldInfo info, FieldEntry fieldEntry) { @Override public RandomVectorScorer getRandomVectorScorer(String field, float[] target) throws IOException { FieldEntry fi = fields.get(field); - if (fi == null) { + if (fi == null || fi.size() == 0) { return null; } return vectorScorer.getRandomVectorScorer( diff --git a/server/src/test/java/org/elasticsearch/index/codec/vectors/es816/ES816BinaryQuantizedVectorsFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/vectors/es816/ES816BinaryQuantizedVectorsFormatTests.java index e11775e2cdedb..9f30a728b0410 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/vectors/es816/ES816BinaryQuantizedVectorsFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/vectors/es816/ES816BinaryQuantizedVectorsFormatTests.java @@ -24,6 +24,7 @@ import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.lucene101.Lucene101Codec; import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; import 
org.apache.lucene.document.KnnFloatVectorField; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.FloatVectorValues; @@ -32,18 +33,30 @@ import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.SoftDeletesRetentionMergePolicy; +import org.apache.lucene.index.Term; import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.search.FieldExistsQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.KnnFloatVectorQuery; +import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.TotalHits; +import org.apache.lucene.search.join.BitSetProducer; +import org.apache.lucene.search.join.CheckJoinIndex; +import org.apache.lucene.search.join.DiversifyingChildrenFloatKnnVectorQuery; +import org.apache.lucene.search.join.QueryBitSetProducer; import org.apache.lucene.store.Directory; import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase; import org.elasticsearch.common.logging.LogConfigurator; import org.elasticsearch.index.codec.vectors.BQVectorUtils; import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; import java.util.Locale; import static java.lang.String.format; @@ -68,6 +81,58 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) { }; } + static String encodeInts(int[] i) { + return Arrays.toString(i); + } + + static BitSetProducer parentFilter(IndexReader r) throws IOException { + // Create a filter that defines "parent" documents in the index + BitSetProducer parentsFilter = new QueryBitSetProducer(new TermQuery(new Term("docType", "_parent"))); + CheckJoinIndex.check(r, parentsFilter); + return parentsFilter; + } + + Document makeParent(int[] children) 
{ + Document parent = new Document(); + parent.add(newStringField("docType", "_parent", Field.Store.NO)); + parent.add(newStringField("id", encodeInts(children), Field.Store.YES)); + return parent; + } + + public void testEmptyDiversifiedChildSearch() throws Exception { + String fieldName = "field"; + int dims = random().nextInt(4, 65); + float[] vector = randomVector(dims); + VectorSimilarityFunction similarityFunction = VectorSimilarityFunction.EUCLIDEAN; + try (Directory d = newDirectory()) { + IndexWriterConfig iwc = newIndexWriterConfig().setCodec(codec); + iwc.setMergePolicy(new SoftDeletesRetentionMergePolicy("soft_delete", MatchAllDocsQuery::new, iwc.getMergePolicy())); + try (IndexWriter w = new IndexWriter(d, iwc)) { + List toAdd = new ArrayList<>(); + for (int j = 1; j <= 5; j++) { + Document doc = new Document(); + doc.add(new KnnFloatVectorField(fieldName, vector, similarityFunction)); + doc.add(newStringField("id", Integer.toString(j), Field.Store.YES)); + toAdd.add(doc); + } + toAdd.add(makeParent(new int[] { 1, 2, 3, 4, 5 })); + w.addDocuments(toAdd); + w.addDocuments(List.of(makeParent(new int[] { 6, 7, 8, 9, 10 }))); + w.deleteDocuments(new FieldExistsQuery(fieldName), new TermQuery(new Term("id", encodeInts(new int[] { 1, 2, 3, 4, 5 })))); + w.flush(); + w.commit(); + w.forceMerge(1); + try (IndexReader reader = DirectoryReader.open(w)) { + IndexSearcher searcher = new IndexSearcher(reader); + BitSetProducer parentFilter = parentFilter(searcher.getIndexReader()); + Query query = new DiversifyingChildrenFloatKnnVectorQuery(fieldName, vector, null, 1, parentFilter); + assertTrue(searcher.search(query, 1).scoreDocs.length == 0); + } + } + + } + } + public void testSearch() throws Exception { String fieldName = "field"; int numVectors = random().nextInt(99, 500); diff --git a/server/src/test/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryQuantizedVectorsFormatTests.java 
b/server/src/test/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryQuantizedVectorsFormatTests.java index 6b8b64b235252..8fc905510df88 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryQuantizedVectorsFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryQuantizedVectorsFormatTests.java @@ -24,6 +24,7 @@ import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.lucene101.Lucene101Codec; import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; import org.apache.lucene.document.KnnFloatVectorField; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.FloatVectorValues; @@ -32,18 +33,30 @@ import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.KnnVectorValues; import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.SoftDeletesRetentionMergePolicy; +import org.apache.lucene.index.Term; import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.search.FieldExistsQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.KnnFloatVectorQuery; +import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.TotalHits; +import org.apache.lucene.search.join.BitSetProducer; +import org.apache.lucene.search.join.CheckJoinIndex; +import org.apache.lucene.search.join.DiversifyingChildrenFloatKnnVectorQuery; +import org.apache.lucene.search.join.QueryBitSetProducer; import org.apache.lucene.store.Directory; import org.apache.lucene.tests.index.BaseKnnVectorsFormatTestCase; import org.elasticsearch.common.logging.LogConfigurator; import org.elasticsearch.index.codec.vectors.BQVectorUtils; import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; 
+import java.util.List; import java.util.Locale; import static java.lang.String.format; @@ -68,6 +81,58 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) { }; } + static String encodeInts(int[] i) { + return Arrays.toString(i); + } + + static BitSetProducer parentFilter(IndexReader r) throws IOException { + // Create a filter that defines "parent" documents in the index + BitSetProducer parentsFilter = new QueryBitSetProducer(new TermQuery(new Term("docType", "_parent"))); + CheckJoinIndex.check(r, parentsFilter); + return parentsFilter; + } + + Document makeParent(int[] children) { + Document parent = new Document(); + parent.add(newStringField("docType", "_parent", Field.Store.NO)); + parent.add(newStringField("id", encodeInts(children), Field.Store.YES)); + return parent; + } + + public void testEmptyDiversifiedChildSearch() throws Exception { + String fieldName = "field"; + int dims = random().nextInt(4, 65); + float[] vector = randomVector(dims); + VectorSimilarityFunction similarityFunction = VectorSimilarityFunction.EUCLIDEAN; + try (Directory d = newDirectory()) { + IndexWriterConfig iwc = newIndexWriterConfig().setCodec(codec); + iwc.setMergePolicy(new SoftDeletesRetentionMergePolicy("soft_delete", MatchAllDocsQuery::new, iwc.getMergePolicy())); + try (IndexWriter w = new IndexWriter(d, iwc)) { + List toAdd = new ArrayList<>(); + for (int j = 1; j <= 5; j++) { + Document doc = new Document(); + doc.add(new KnnFloatVectorField(fieldName, vector, similarityFunction)); + doc.add(newStringField("id", Integer.toString(j), Field.Store.YES)); + toAdd.add(doc); + } + toAdd.add(makeParent(new int[] { 1, 2, 3, 4, 5 })); + w.addDocuments(toAdd); + w.addDocuments(List.of(makeParent(new int[] { 6, 7, 8, 9, 10 }))); + w.deleteDocuments(new FieldExistsQuery(fieldName), new TermQuery(new Term("id", encodeInts(new int[] { 1, 2, 3, 4, 5 })))); + w.flush(); + w.commit(); + w.forceMerge(1); + try (IndexReader reader = DirectoryReader.open(w)) { + 
IndexSearcher searcher = new IndexSearcher(reader); + BitSetProducer parentFilter = parentFilter(searcher.getIndexReader()); + Query query = new DiversifyingChildrenFloatKnnVectorQuery(fieldName, vector, null, 1, parentFilter); + assertTrue(searcher.search(query, 1).scoreDocs.length == 0); + } + } + + } + } + public void testSearch() throws Exception { String fieldName = "field"; int numVectors = random().nextInt(99, 500); From bb4b88454839ea07b7e8f95233abbd574d6f53b6 Mon Sep 17 00:00:00 2001 From: Benjamin Trent <4357155+benwtrent@users.noreply.github.com> Date: Wed, 18 Jun 2025 09:05:40 -0400 Subject: [PATCH 2/2] fixing comp --- .../vectors/es816/ES816BinaryQuantizedVectorsFormatTests.java | 2 +- .../vectors/es818/ES818BinaryQuantizedVectorsFormatTests.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/server/src/test/java/org/elasticsearch/index/codec/vectors/es816/ES816BinaryQuantizedVectorsFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/vectors/es816/ES816BinaryQuantizedVectorsFormatTests.java index 9f30a728b0410..7fada82441847 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/vectors/es816/ES816BinaryQuantizedVectorsFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/vectors/es816/ES816BinaryQuantizedVectorsFormatTests.java @@ -105,7 +105,7 @@ public void testEmptyDiversifiedChildSearch() throws Exception { float[] vector = randomVector(dims); VectorSimilarityFunction similarityFunction = VectorSimilarityFunction.EUCLIDEAN; try (Directory d = newDirectory()) { - IndexWriterConfig iwc = newIndexWriterConfig().setCodec(codec); + IndexWriterConfig iwc = newIndexWriterConfig().setCodec(getCodec()); iwc.setMergePolicy(new SoftDeletesRetentionMergePolicy("soft_delete", MatchAllDocsQuery::new, iwc.getMergePolicy())); try (IndexWriter w = new IndexWriter(d, iwc)) { List toAdd = new ArrayList<>(); diff --git 
a/server/src/test/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryQuantizedVectorsFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryQuantizedVectorsFormatTests.java index 8fc905510df88..81ff3e9f60f8c 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryQuantizedVectorsFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/vectors/es818/ES818BinaryQuantizedVectorsFormatTests.java @@ -105,7 +105,7 @@ public void testEmptyDiversifiedChildSearch() throws Exception { float[] vector = randomVector(dims); VectorSimilarityFunction similarityFunction = VectorSimilarityFunction.EUCLIDEAN; try (Directory d = newDirectory()) { - IndexWriterConfig iwc = newIndexWriterConfig().setCodec(codec); + IndexWriterConfig iwc = newIndexWriterConfig().setCodec(getCodec()); iwc.setMergePolicy(new SoftDeletesRetentionMergePolicy("soft_delete", MatchAllDocsQuery::new, iwc.getMergePolicy())); try (IndexWriter w = new IndexWriter(d, iwc)) { List toAdd = new ArrayList<>();