From 847fe512b879c57b0c4ce3f1ebf3dacc6d46a6e8 Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Tue, 2 Dec 2025 08:56:43 +0100 Subject: [PATCH] Revert "Store keyword fields that trip ignore_above in binary doc values (#137483)" This reverts commit 67831350281b25dd18c596e099a08147267f3851. --- .../extras/MatchOnlyTextFieldMapper.java | 110 +++++------------- .../test/match_only_text/10_basic.yml | 4 +- ...aryDocValuesSyntheticFieldLoaderLayer.java | 84 ------------- .../index/mapper/KeywordFieldMapper.java | 67 ++--------- ...eticSourceNativeArrayIntegrationTests.java | 12 +- .../NativeArrayIntegrationTestCase.java | 14 +-- .../wildcard/mapper/WildcardFieldMapper.java | 56 ++++++++- 7 files changed, 97 insertions(+), 250 deletions(-) delete mode 100644 server/src/main/java/org/elasticsearch/index/mapper/BinaryDocValuesSyntheticFieldLoaderLayer.java diff --git a/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/MatchOnlyTextFieldMapper.java b/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/MatchOnlyTextFieldMapper.java index 54842bc84693b..a9edfc456db24 100644 --- a/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/MatchOnlyTextFieldMapper.java +++ b/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/MatchOnlyTextFieldMapper.java @@ -14,8 +14,6 @@ import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; import org.apache.lucene.document.StoredField; -import org.apache.lucene.index.BinaryDocValues; -import org.apache.lucene.index.DocValues; import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.Term; @@ -32,7 +30,6 @@ import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IOFunction; import org.elasticsearch.common.CheckedIntFunction; -import org.elasticsearch.common.io.stream.ByteArrayStreamInput; import org.elasticsearch.common.lucene.Lucene; import org.elasticsearch.common.text.UTF8DecodingReader; import org.elasticsearch.common.unit.Fuzziness; @@ -42,7 +39,6 @@ import org.elasticsearch.index.analysis.NamedAnalyzer; import org.elasticsearch.index.fielddata.FieldDataContext; import org.elasticsearch.index.fielddata.IndexFieldData; -import org.elasticsearch.index.fielddata.SortedBinaryDocValues; import org.elasticsearch.index.fielddata.SourceValueFetcherSortedBinaryIndexFieldData; import org.elasticsearch.index.fielddata.StoredFieldSortedBinaryIndexFieldData; import org.elasticsearch.index.fieldvisitor.StoredFieldLoader; @@ -301,17 +297,12 @@ private IOFunction, IOExcepti if (parent instanceof KeywordFieldMapper.KeywordFieldType keywordParent && keywordParent.ignoreAbove().valuesPotentiallyIgnored()) { + final String parentFallbackFieldName = keywordParent.syntheticSourceFallbackFieldName(); if (parent.isStored()) { - return combineFieldFetchers( - storedFieldFetcher(parentFieldName), - ignoredValuesDocValuesFieldFetcher(keywordParent.syntheticSourceFallbackFieldName()) - ); + return storedFieldFetcher(parentFieldName, parentFallbackFieldName); } else if (parent.hasDocValues()) { var ifd = searchExecutionContext.getForField(parent, MappedFieldType.FielddataOperation.SEARCH); - return combineFieldFetchers( - docValuesFieldFetcher(ifd), - ignoredValuesDocValuesFieldFetcher(keywordParent.syntheticSourceFallbackFieldName()) - ); + return combineFieldFetchers(docValuesFieldFetcher(ifd), storedFieldFetcher(parentFallbackFieldName)); } } @@ -334,16 +325,22 @@ private IOFunction, IOExcepti final KeywordFieldMapper.KeywordFieldType keywordDelegate ) { if (keywordDelegate.ignoreAbove().valuesPotentiallyIgnored()) { - String delegateFieldName = keywordDelegate.name(); - // bc we don't know whether the delegate will ignore a value, we must also check the fallback field created by this - // match_only_text field + // because we don't know whether the delegate field will be ignored during parsing, we must also check the current field + String fieldName = name(); String fallbackName = syntheticSourceFallbackFieldName(); + // delegate field names + String delegateFieldName = keywordDelegate.name(); + String delegateFieldFallbackName = keywordDelegate.syntheticSourceFallbackFieldName(); + if (keywordDelegate.isStored()) { - return storedFieldFetcher(delegateFieldName, fallbackName); + return storedFieldFetcher(delegateFieldName, delegateFieldFallbackName, fieldName, fallbackName); } else if (keywordDelegate.hasDocValues()) { var ifd = searchExecutionContext.getForField(keywordDelegate, MappedFieldType.FielddataOperation.SEARCH); - return combineFieldFetchers(docValuesFieldFetcher(ifd), storedFieldFetcher(fallbackName)); + return combineFieldFetchers( + docValuesFieldFetcher(ifd), + storedFieldFetcher(delegateFieldFallbackName, fieldName, fallbackName) + ); } } @@ -358,34 +355,25 @@ private IOFunction, IOExcepti } } - private IOFunction, IOException>> docValuesFieldFetcher(IndexFieldData ifd) { - return context -> { - SortedBinaryDocValues indexedValuesDocValues = ifd.load(context).getBytesValues(); - return docId -> getValuesFromDocValues(indexedValuesDocValues, docId); - }; - } - - private IOFunction, IOException>> ignoredValuesDocValuesFieldFetcher( - String fieldName + private static IOFunction, IOException>> docValuesFieldFetcher( + IndexFieldData ifd ) { return context -> { - CustomBinaryDocValues ignoredValuesDocValues = new CustomBinaryDocValues(DocValues.getBinary(context.reader(), fieldName)); - return docId -> getValuesFromDocValues(ignoredValuesDocValues, docId); + var sortedBinaryDocValues = ifd.load(context).getBytesValues(); + return docId -> { + if (sortedBinaryDocValues.advanceExact(docId)) { + var values = new ArrayList<>(sortedBinaryDocValues.docValueCount()); + for (int i = 0; i < sortedBinaryDocValues.docValueCount(); i++) { + values.add(sortedBinaryDocValues.nextValue().utf8ToString()); + } + return values; + } else { + return List.of(); + } + }; }; } - private List getValuesFromDocValues(SortedBinaryDocValues docValues, int docId) throws IOException { - if (docValues.advanceExact(docId)) { - var values = new ArrayList<>(docValues.docValueCount()); - for (int i = 0; i < docValues.docValueCount(); i++) { - values.add(docValues.nextValue().utf8ToString()); - } - return values; - } else { - return List.of(); - } - } - private static IOFunction, IOException>> storedFieldFetcher(String... names) { var loader = StoredFieldLoader.create(false, Set.of(names)); return context -> { @@ -791,46 +779,4 @@ protected void writeValue(Object value, XContentBuilder b) throws IOException { return fieldLoader; } - - /** - * A wrapper around {@link BinaryDocValues} that exposes some quality of life functions. Note, these values are not sorted. - */ - private static class CustomBinaryDocValues extends SortedBinaryDocValues { - - private final BinaryDocValues binaryDocValues; - private final ByteArrayStreamInput stream; - - private int docValueCount = 0; - - CustomBinaryDocValues(BinaryDocValues binaryDocValues) { - this.binaryDocValues = binaryDocValues; - this.stream = new ByteArrayStreamInput(); - } - - @Override - public BytesRef nextValue() throws IOException { - // this function already knows how to decode the underlying bytes array, so no need to explicitly call VInt() - return stream.readBytesRef(); - } - - @Override - public boolean advanceExact(int docId) throws IOException { - // if document has a value, read underlying bytes - if (binaryDocValues.advanceExact(docId)) { - BytesRef docValuesBytes = binaryDocValues.binaryValue(); - stream.reset(docValuesBytes.bytes, docValuesBytes.offset, docValuesBytes.length); - docValueCount = stream.readVInt(); - return true; - } - - // otherwise there is nothing to do - docValueCount = 0; - return false; - } - - @Override - public int docValueCount() { - return docValueCount; - } - } } diff --git a/modules/mapper-extras/src/yamlRestTest/resources/rest-api-spec/test/match_only_text/10_basic.yml b/modules/mapper-extras/src/yamlRestTest/resources/rest-api-spec/test/match_only_text/10_basic.yml index 581841df3fe52..0050618beeb67 100644 --- a/modules/mapper-extras/src/yamlRestTest/resources/rest-api-spec/test/match_only_text/10_basic.yml +++ b/modules/mapper-extras/src/yamlRestTest/resources/rest-api-spec/test/match_only_text/10_basic.yml @@ -465,7 +465,7 @@ synthetic_source match_only_text as multi-field with ignored keyword as parent: id: "1" refresh: true body: - foo: [ "Apache Lucene powers Elasticsearch", "Apache", "Apache Lucene" ] + foo: [ "Apache Lucene powers Elasticsearch", "Apache" ] - do: search: @@ -477,7 +477,7 @@ synthetic_source match_only_text as multi-field with ignored keyword as parent: - match: { "hits.total.value": 1 } - match: - hits.hits.0._source.foo: [ "Apache", "Apache Lucene powers Elasticsearch", "Apache Lucene" ] + hits.hits.0._source.foo: [ "Apache", "Apache Lucene powers Elasticsearch" ] --- synthetic_source match_only_text as multi-field with stored keyword as parent: diff --git a/server/src/main/java/org/elasticsearch/index/mapper/BinaryDocValuesSyntheticFieldLoaderLayer.java b/server/src/main/java/org/elasticsearch/index/mapper/BinaryDocValuesSyntheticFieldLoaderLayer.java deleted file mode 100644 index 1f0c0be1f9555..0000000000000 --- a/server/src/main/java/org/elasticsearch/index/mapper/BinaryDocValuesSyntheticFieldLoaderLayer.java +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one - * or more contributor license agreements. Licensed under the "Elastic License - * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side - * Public License v 1"; you may not use this file except in compliance with, at - * your election, the "Elastic License 2.0", the "GNU Affero General Public - * License v3.0 only", or the "Server Side Public License, v 1". - */ - -package org.elasticsearch.index.mapper; - -import org.apache.lucene.index.BinaryDocValues; -import org.apache.lucene.index.LeafReader; -import org.apache.lucene.util.BytesRef; -import org.elasticsearch.common.io.stream.ByteArrayStreamInput; -import org.elasticsearch.xcontent.XContentBuilder; - -import java.io.IOException; - -public final class BinaryDocValuesSyntheticFieldLoaderLayer implements CompositeSyntheticFieldLoader.DocValuesLayer { - - private final String fieldName; - - // the binary doc values for a document are all encoded in a single binary array, which this stream knows how to read - // the doc values in the array take the form of [doc value count][length of value 1][value 1][length of value 2][value 2]... - private final ByteArrayStreamInput stream; - private int valueCount; - - public BinaryDocValuesSyntheticFieldLoaderLayer(String fieldName) { - this.fieldName = fieldName; - this.stream = new ByteArrayStreamInput(); - } - - @Override - public DocValuesLoader docValuesLoader(LeafReader leafReader, int[] docIdsInLeaf) throws IOException { - BinaryDocValues docValues = leafReader.getBinaryDocValues(fieldName); - - // there are no values associated with this field - if (docValues == null) { - valueCount = 0; - return null; - } - - return docId -> { - // there are no more documents to process - if (docValues.advanceExact(docId) == false) { - valueCount = 0; - return false; - } - - // otherwise, extract the doc values into a stream to later read from - BytesRef docValuesBytes = docValues.binaryValue(); - stream.reset(docValuesBytes.bytes, docValuesBytes.offset, docValuesBytes.length); - valueCount = stream.readVInt(); - - return hasValue(); - }; - } - - @Override - public void write(XContentBuilder b) throws IOException { - for (int i = 0; i < valueCount; i++) { - // this function already knows how to decode the underlying bytes array, so no need to explicitly call VInt() - BytesRef valueBytes = stream.readBytesRef(); - b.value(valueBytes.utf8ToString()); - } - } - - @Override - public boolean hasValue() { - return valueCount > 0; - } - - @Override - public long valueCount() { - return valueCount; - } - - @Override - public String fieldName() { - return fieldName; - } - -} diff --git a/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java index 710e79aa0d016..5c48f8182b1f6 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java @@ -40,8 +40,6 @@ import org.apache.lucene.util.automaton.CompiledAutomaton; import org.apache.lucene.util.automaton.CompiledAutomaton.AUTOMATON_TYPE; import org.apache.lucene.util.automaton.Operations; -import org.elasticsearch.ElasticsearchException; -import org.elasticsearch.common.io.stream.BytesStreamOutput; import org.elasticsearch.common.lucene.BytesRefs; import org.elasticsearch.common.lucene.Lucene; import org.elasticsearch.common.lucene.search.AutomatonQueries; @@ -91,7 +89,6 @@ import java.util.Arrays; import java.util.Collection; import java.util.Collections; -import java.util.LinkedHashSet; import java.util.List; import java.util.Locale; import java.util.Map; @@ -1174,14 +1171,7 @@ private boolean indexValue(DocumentParserContext context, XContentString value) var utfBytes = value.bytes(); var bytesRef = new BytesRef(utfBytes.bytes(), utfBytes.offset(), utfBytes.length()); final String fieldName = fieldType().syntheticSourceFallbackFieldName(); - - // store the value in a binary doc values field, create one if it doesn't exist - MultiValuedBinaryDocValuesField field = (MultiValuedBinaryDocValuesField) context.doc().getByKey(fieldName); - if (field == null) { - field = new MultiValuedBinaryDocValuesField(fieldName); - context.doc().addWithKey(fieldName, field); - } - field.add(bytesRef); + context.doc().add(new StoredField(fieldName, bytesRef)); } return false; @@ -1344,56 +1334,15 @@ protected BytesRef preserve(BytesRef value) { // extra copy of the field for supporting synthetic source. This layer will check that copy. if (fieldType().ignoreAbove.valuesPotentiallyIgnored()) { final String fieldName = fieldType().syntheticSourceFallbackFieldName(); - layers.add(new BinaryDocValuesSyntheticFieldLoaderLayer(fieldName)); - } - - return new CompositeSyntheticFieldLoader(leafFieldName, fullFieldName, layers); - } - - /** - * A custom implementation of {@link org.apache.lucene.index.BinaryDocValues} that uses a {@link Set} to maintain a collection of unique - * binary doc values for fields with multiple values per document. - */ - private static final class MultiValuedBinaryDocValuesField extends CustomDocValuesField { - - private final Set uniqueValues; - private int docValuesByteCount = 0; - - MultiValuedBinaryDocValuesField(String name) { - super(name); - // linked hash set to maintain insertion order of elements - uniqueValues = new LinkedHashSet<>(); - } - - public void add(final BytesRef value) { - if (uniqueValues.add(value)) { - // might as well track these on the go as opposed to having to loop through all entries later - docValuesByteCount += value.length; - } - } - - /** - * Encodes the collection of binary doc values as a single contiguous binary array, wrapped in {@link BytesRef}. This array takes - * the form of [doc value count][length of value 1][value 1][length of value 2][value 2]... - */ - @Override - public BytesRef binaryValue() { - int docValuesCount = uniqueValues.size(); - // the + 1 is for the total doc values count, which is prefixed at the start of the array - int streamSize = docValuesByteCount + (docValuesCount + 1) * Integer.BYTES; - - try (BytesStreamOutput out = new BytesStreamOutput(streamSize)) { - out.writeVInt(docValuesCount); - for (BytesRef value : uniqueValues) { - int valueLength = value.length; - out.writeVInt(valueLength); - out.writeBytes(value.bytes, value.offset, valueLength); + layers.add(new CompositeSyntheticFieldLoader.StoredFieldLayer(fieldName) { + @Override + protected void writeValue(Object value, XContentBuilder b) throws IOException { + BytesRef ref = (BytesRef) value; + b.utf8Value(ref.bytes, ref.offset, ref.length); } - return out.bytes().toBytesRef(); - } catch (IOException e) { - throw new ElasticsearchException("Failed to get binary value", e); - } + }); } + return new CompositeSyntheticFieldLoader(leafFieldName, fullFieldName, layers); } } diff --git a/server/src/test/java/org/elasticsearch/index/mapper/KeywordSyntheticSourceNativeArrayIntegrationTests.java b/server/src/test/java/org/elasticsearch/index/mapper/KeywordSyntheticSourceNativeArrayIntegrationTests.java index ec71e07fc9231..41e0c644ee20e 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/KeywordSyntheticSourceNativeArrayIntegrationTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/KeywordSyntheticSourceNativeArrayIntegrationTests.java @@ -52,7 +52,6 @@ public void testSynthesizeArrayIgnoreAbove() throws Exception { .endObject() .endObject() .endObject(); - // Note values that would be ignored are added at the end of arrays, // this makes testing easier as ignored values are always synthesized after regular values: var arrayValues = new Object[][] { @@ -61,16 +60,7 @@ public void testSynthesizeArrayIgnoreAbove() throws Exception { new Object[] { "123", "1234", "12345" }, new Object[] { null, null, null, "blabla" }, new Object[] { "1", "2", "3", "blabla" } }; - - // values in the original array should be deduplicated - var expectedArrayValues = new Object[][] { - new Object[] { null, "a", "ab", "abc", "abcd", null, "abcde" }, - new Object[] { "12345" }, - new Object[] { "123", "1234", "12345" }, - new Object[] { null, null, null, "blabla" }, - new Object[] { "1", "2", "3", "blabla" } }; - - verifySyntheticArray(arrayValues, expectedArrayValues, mapping, "_id"); + verifySyntheticArray(arrayValues, mapping, "_id", "field._original"); } public void testSynthesizeObjectArray() throws Exception { diff --git a/test/framework/src/main/java/org/elasticsearch/index/mapper/NativeArrayIntegrationTestCase.java b/test/framework/src/main/java/org/elasticsearch/index/mapper/NativeArrayIntegrationTestCase.java index 950626292d120..d1ab3c0907562 100644 --- a/test/framework/src/main/java/org/elasticsearch/index/mapper/NativeArrayIntegrationTestCase.java +++ b/test/framework/src/main/java/org/elasticsearch/index/mapper/NativeArrayIntegrationTestCase.java @@ -259,17 +259,11 @@ protected void verifySyntheticArray(Object[][] arrays, XContentBuilder mapping, private XContentBuilder arrayToSource(Object[] array) throws IOException { var source = jsonBuilder().startObject(); if (array != null) { - // collapse array if it only consists of one element - // if the only element is null, then we'll skip synthesizing source for that field - if (array.length == 1 && array[0] != null) { - source.field("field", array[0]); - } else { - source.startArray("field"); - for (Object arrayValue : array) { - source.value(arrayValue); - } - source.endArray(); + source.startArray("field"); + for (Object arrayValue : array) { + source.value(arrayValue); } + source.endArray(); } else { source.field("field").nullValue(); } diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java index 6dbf619e0c140..ad28b0336d855 100644 --- a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java @@ -17,8 +17,10 @@ import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; import org.apache.lucene.document.StoredField; +import org.apache.lucene.index.BinaryDocValues; import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.IndexableField; +import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanClause.Occur; @@ -41,6 +43,7 @@ import org.apache.lucene.util.automaton.RegExp; import org.elasticsearch.ElasticsearchParseException; import org.elasticsearch.common.geo.ShapeRelation; +import org.elasticsearch.common.io.stream.ByteArrayStreamInput; import org.elasticsearch.common.lucene.BytesRefs; import org.elasticsearch.common.lucene.Lucene; import org.elasticsearch.common.time.DateMathParser; @@ -55,7 +58,6 @@ import org.elasticsearch.index.fielddata.FieldDataContext; import org.elasticsearch.index.fielddata.IndexFieldData; import org.elasticsearch.index.fielddata.plain.StringBinaryIndexFieldData; -import org.elasticsearch.index.mapper.BinaryDocValuesSyntheticFieldLoaderLayer; import org.elasticsearch.index.mapper.BinaryFieldMapper.CustomBinaryDocValuesField; import org.elasticsearch.index.mapper.BlockLoader; import org.elasticsearch.index.mapper.CompositeSyntheticFieldLoader; @@ -1104,7 +1106,7 @@ public FieldMapper.Builder getMergeBuilder() { protected SyntheticSourceSupport syntheticSourceSupport() { return new SyntheticSourceSupport.Native(() -> { var layers = new ArrayList(); - layers.add(new BinaryDocValuesSyntheticFieldLoaderLayer(fullPath())); + layers.add(new WildcardSyntheticFieldLoader()); if (ignoreAbove.valuesPotentiallyIgnored()) { layers.add(new CompositeSyntheticFieldLoader.StoredFieldLayer(originalName()) { @Override @@ -1118,4 +1120,54 @@ protected void writeValue(Object value, XContentBuilder b) throws IOException { }); } + private class WildcardSyntheticFieldLoader implements CompositeSyntheticFieldLoader.DocValuesLayer { + private final ByteArrayStreamInput docValuesStream = new ByteArrayStreamInput(); + private int docValueCount; + private BytesRef docValueBytes; + + @Override + public DocValuesLoader docValuesLoader(LeafReader leafReader, int[] docIdsInLeaf) throws IOException { + BinaryDocValues values = leafReader.getBinaryDocValues(fullPath()); + if (values == null) { + docValueCount = 0; + return null; + } + + return docId -> { + if (values.advanceExact(docId) == false) { + docValueCount = 0; + return hasValue(); + } + docValueBytes = values.binaryValue(); + docValuesStream.reset(docValueBytes.bytes); + docValuesStream.setPosition(docValueBytes.offset); + docValueCount = docValuesStream.readVInt(); + return hasValue(); + }; + } + + @Override + public boolean hasValue() { + return docValueCount > 0; + } + + @Override + public long valueCount() { + return docValueCount; + } + + @Override + public void write(XContentBuilder b) throws IOException { + for (int i = 0; i < docValueCount; i++) { + int length = docValuesStream.readVInt(); + b.utf8Value(docValueBytes.bytes, docValuesStream.getPosition(), length); + docValuesStream.skipBytes(length); + } + } + + @Override + public String fieldName() { + return fullPath(); + } + } }