Skip to content
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
Expand All @@ -30,6 +32,7 @@
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOFunction;
import org.elasticsearch.common.CheckedIntFunction;
import org.elasticsearch.common.io.stream.ByteArrayStreamInput;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.common.text.UTF8DecodingReader;
import org.elasticsearch.common.unit.Fuzziness;
Expand All @@ -39,6 +42,7 @@
import org.elasticsearch.index.analysis.NamedAnalyzer;
import org.elasticsearch.index.fielddata.FieldDataContext;
import org.elasticsearch.index.fielddata.IndexFieldData;
import org.elasticsearch.index.fielddata.SortedBinaryDocValues;
import org.elasticsearch.index.fielddata.SourceValueFetcherSortedBinaryIndexFieldData;
import org.elasticsearch.index.fielddata.StoredFieldSortedBinaryIndexFieldData;
import org.elasticsearch.index.fieldvisitor.StoredFieldLoader;
Expand Down Expand Up @@ -297,12 +301,17 @@ private IOFunction<LeafReaderContext, CheckedIntFunction<List<Object>, IOExcepti

if (parent instanceof KeywordFieldMapper.KeywordFieldType keywordParent
&& keywordParent.ignoreAbove().valuesPotentiallyIgnored()) {
final String parentFallbackFieldName = keywordParent.syntheticSourceFallbackFieldName();
if (parent.isStored()) {
return storedFieldFetcher(parentFieldName, parentFallbackFieldName);
return combineFieldFetchers(
storedFieldFetcher(parentFieldName),
ignoredValuesDocValuesFieldFetcher(keywordParent.syntheticSourceFallbackFieldName())
);
} else if (parent.hasDocValues()) {
var ifd = searchExecutionContext.getForField(parent, MappedFieldType.FielddataOperation.SEARCH);
return combineFieldFetchers(docValuesFieldFetcher(ifd), storedFieldFetcher(parentFallbackFieldName));
return combineFieldFetchers(
docValuesFieldFetcher(ifd),
ignoredValuesDocValuesFieldFetcher(keywordParent.syntheticSourceFallbackFieldName())
);
}
}

Expand All @@ -325,22 +334,16 @@ private IOFunction<LeafReaderContext, CheckedIntFunction<List<Object>, IOExcepti
final KeywordFieldMapper.KeywordFieldType keywordDelegate
) {
if (keywordDelegate.ignoreAbove().valuesPotentiallyIgnored()) {
// because we don't know whether the delegate field will be ignored during parsing, we must also check the current field
String fieldName = name();
String fallbackName = syntheticSourceFallbackFieldName();

// delegate field names
String delegateFieldName = keywordDelegate.name();
String delegateFieldFallbackName = keywordDelegate.syntheticSourceFallbackFieldName();
// because we don't know whether the delegate will ignore a value, we must also check the fallback field created by
// this match_only_text field
String fallbackName = syntheticSourceFallbackFieldName();

if (keywordDelegate.isStored()) {
return storedFieldFetcher(delegateFieldName, delegateFieldFallbackName, fieldName, fallbackName);
return storedFieldFetcher(delegateFieldName, fallbackName);
} else if (keywordDelegate.hasDocValues()) {
var ifd = searchExecutionContext.getForField(keywordDelegate, MappedFieldType.FielddataOperation.SEARCH);
return combineFieldFetchers(
docValuesFieldFetcher(ifd),
storedFieldFetcher(delegateFieldFallbackName, fieldName, fallbackName)
);
return combineFieldFetchers(docValuesFieldFetcher(ifd), storedFieldFetcher(fallbackName));
}
}

Expand All @@ -355,25 +358,34 @@ private IOFunction<LeafReaderContext, CheckedIntFunction<List<Object>, IOExcepti
}
}

private static IOFunction<LeafReaderContext, CheckedIntFunction<List<Object>, IOException>> docValuesFieldFetcher(
IndexFieldData<?> ifd
/**
 * Builds a per-leaf fetcher that reads this field's values out of the given field data's doc values.
 * The returned function binds doc values to a leaf once, then serves per-document lookups.
 */
private IOFunction<LeafReaderContext, CheckedIntFunction<List<Object>, IOException>> docValuesFieldFetcher(IndexFieldData<?> ifd) {
    return leafContext -> {
        var docValues = ifd.load(leafContext).getBytesValues();
        return doc -> getValuesFromDocValues(docValues, doc);
    };
}

private IOFunction<LeafReaderContext, CheckedIntFunction<List<Object>, IOException>> ignoredValuesDocValuesFieldFetcher(
String fieldName
) {
return context -> {
var sortedBinaryDocValues = ifd.load(context).getBytesValues();
return docId -> {
if (sortedBinaryDocValues.advanceExact(docId)) {
var values = new ArrayList<>(sortedBinaryDocValues.docValueCount());
for (int i = 0; i < sortedBinaryDocValues.docValueCount(); i++) {
values.add(sortedBinaryDocValues.nextValue().utf8ToString());
}
return values;
} else {
return List.of();
}
};
CustomBinaryDocValues ignoredValuesDocValues = new CustomBinaryDocValues(DocValues.getBinary(context.reader(), fieldName));
return docId -> getValuesFromDocValues(ignoredValuesDocValues, docId);
};
}

/**
 * Reads all values for {@code docId} from {@code docValues} and returns them decoded as UTF-8 strings.
 *
 * @param docValues the per-leaf doc values to read from
 * @param docId     the document to advance to
 * @return the document's values as strings, or an immutable empty list if the document has no values
 * @throws IOException if reading the underlying doc values fails
 */
private List<Object> getValuesFromDocValues(SortedBinaryDocValues docValues, int docId) throws IOException {
    if (docValues.advanceExact(docId) == false) {
        return List.of();
    }
    // docValueCount() is fixed for the current document; hoist it so it is not re-evaluated every iteration
    final int count = docValues.docValueCount();
    var values = new ArrayList<Object>(count);
    for (int i = 0; i < count; i++) {
        values.add(docValues.nextValue().utf8ToString());
    }
    return values;
}

private static IOFunction<LeafReaderContext, CheckedIntFunction<List<Object>, IOException>> storedFieldFetcher(String... names) {
var loader = StoredFieldLoader.create(false, Set.of(names));
return context -> {
Expand Down Expand Up @@ -779,4 +791,46 @@ protected void writeValue(Object value, XContentBuilder b) throws IOException {

return fieldLoader;
}

/**
* A wrapper around {@link BinaryDocValues} that exposes some quality of life functions. Note, these values are not sorted.
*/
/**
 * Wraps a {@link BinaryDocValues} whose per-document payload encodes multiple values as
 * [value count][length of value 1][value 1][length of value 2][value 2]... and exposes them
 * through the {@link SortedBinaryDocValues} API. NOTE: despite the parent type, values are
 * returned in encoded (insertion) order, not sorted order.
 */
private static class CustomBinaryDocValues extends SortedBinaryDocValues {

    private final BinaryDocValues delegate;
    private final ByteArrayStreamInput in;

    private int count = 0;

    CustomBinaryDocValues(BinaryDocValues binaryDocValues) {
        this.delegate = binaryDocValues;
        this.in = new ByteArrayStreamInput();
    }

    @Override
    public boolean advanceExact(int docId) throws IOException {
        if (delegate.advanceExact(docId) == false) {
            // no payload for this document
            count = 0;
            return false;
        }
        // position the stream over this document's encoded payload and read the leading value count
        BytesRef encoded = delegate.binaryValue();
        in.reset(encoded.bytes, encoded.offset, encoded.length);
        count = in.readVInt();
        return true;
    }

    @Override
    public BytesRef nextValue() throws IOException {
        // readBytesRef consumes the length prefix itself, so no explicit readVInt is needed here
        return in.readBytesRef();
    }

    @Override
    public int docValueCount() {
        return count;
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -465,7 +465,7 @@ synthetic_source match_only_text as multi-field with ignored keyword as parent:
id: "1"
refresh: true
body:
foo: [ "Apache Lucene powers Elasticsearch", "Apache" ]
foo: [ "Apache Lucene powers Elasticsearch", "Apache", "Apache Lucene" ]

- do:
search:
Expand All @@ -477,7 +477,7 @@ synthetic_source match_only_text as multi-field with ignored keyword as parent:

- match: { "hits.total.value": 1 }
- match:
hits.hits.0._source.foo: [ "Apache", "Apache Lucene powers Elasticsearch" ]
hits.hits.0._source.foo: [ "Apache", "Apache Lucene powers Elasticsearch", "Apache Lucene" ]

---
synthetic_source match_only_text as multi-field with stored keyword as parent:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the "Elastic License
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side
* Public License v 1"; you may not use this file except in compliance with, at
* your election, the "Elastic License 2.0", the "GNU Affero General Public
* License v3.0 only", or the "Server Side Public License, v 1".
*/

package org.elasticsearch.index.mapper;

import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.io.stream.ByteArrayStreamInput;
import org.elasticsearch.xcontent.XContentBuilder;

import java.io.IOException;

/**
 * A synthetic-source layer that reloads a field's ignored values from a binary doc values field.
 * Each document stores all of its values in a single byte array encoded as
 * [doc value count][length of value 1][value 1][length of value 2][value 2]..., which this layer
 * decodes and writes back as UTF-8 strings.
 */
public final class BinaryDocValuesSyntheticFieldLoaderLayer implements CompositeSyntheticFieldLoader.DocValuesLayer {

    private final String fieldName;

    // decodes the single per-document byte array holding all of this field's values
    private final ByteArrayStreamInput input;
    private int count;

    public BinaryDocValuesSyntheticFieldLoaderLayer(String fieldName) {
        this.fieldName = fieldName;
        this.input = new ByteArrayStreamInput();
    }

    @Override
    public DocValuesLoader docValuesLoader(LeafReader leafReader, int[] docIdsInLeaf) throws IOException {
        BinaryDocValues docValues = leafReader.getBinaryDocValues(fieldName);
        if (docValues == null) {
            // this segment holds no values for the field
            count = 0;
            return null;
        }

        return docId -> {
            if (docValues.advanceExact(docId)) {
                // position the stream over the document's encoded payload and read the leading count
                BytesRef payload = docValues.binaryValue();
                input.reset(payload.bytes, payload.offset, payload.length);
                count = input.readVInt();
            } else {
                // no payload for this document
                count = 0;
            }
            return hasValue();
        };
    }

    @Override
    public void write(XContentBuilder b) throws IOException {
        // readBytesRef consumes each length prefix itself, so values can be read back to back
        for (int i = 0; i < count; i++) {
            b.value(input.readBytesRef().utf8ToString());
        }
    }

    @Override
    public boolean hasValue() {
        return count > 0;
    }

    @Override
    public long valueCount() {
        return count;
    }

    @Override
    public String fieldName() {
        return fieldName;
    }

}
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.CompiledAutomaton.AUTOMATON_TYPE;
import org.apache.lucene.util.automaton.Operations;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.common.io.stream.BytesStreamOutput;
import org.elasticsearch.common.lucene.BytesRefs;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.common.lucene.search.AutomatonQueries;
Expand Down Expand Up @@ -87,6 +89,7 @@
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
Expand Down Expand Up @@ -1153,7 +1156,14 @@ private boolean indexValue(DocumentParserContext context, XContentString value)
var utfBytes = value.bytes();
var bytesRef = new BytesRef(utfBytes.bytes(), utfBytes.offset(), utfBytes.length());
final String fieldName = fieldType().syntheticSourceFallbackFieldName();
context.doc().add(new StoredField(fieldName, bytesRef));

// store the value in a binary doc values field, create one if it doesn't exist
MultiValuedBinaryDocValuesField field = (MultiValuedBinaryDocValuesField) context.doc().getByKey(fieldName);
if (field == null) {
field = new MultiValuedBinaryDocValuesField(fieldName);
context.doc().addWithKey(fieldName, field);
}
field.add(bytesRef);
Comment on lines -1156 to +1166
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Another BWC issue - old indices will have conflicting fieldinfos for <name>._original since we're switching from using a stored field to binary doc values.

}

return false;
Expand Down Expand Up @@ -1316,15 +1326,56 @@ protected BytesRef preserve(BytesRef value) {
// extra copy of the field for supporting synthetic source. This layer will check that copy.
if (fieldType().ignoreAbove.valuesPotentiallyIgnored()) {
final String fieldName = fieldType().syntheticSourceFallbackFieldName();
layers.add(new CompositeSyntheticFieldLoader.StoredFieldLayer(fieldName) {
@Override
protected void writeValue(Object value, XContentBuilder b) throws IOException {
BytesRef ref = (BytesRef) value;
b.utf8Value(ref.bytes, ref.offset, ref.length);
}
});
layers.add(new BinaryDocValuesSyntheticFieldLoaderLayer(fieldName));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will this break BWC? Because old indices will have their ignored values stored in a stored field, and now the mapper can no longer load those values?

}

return new CompositeSyntheticFieldLoader(leafFieldName, fullFieldName, layers);
}

/**
* A custom implementation of {@link org.apache.lucene.index.BinaryDocValues} that uses a {@link Set} to maintain a collection of unique
* binary doc values for fields with multiple values per document.
*/
/**
 * A custom implementation of {@link org.apache.lucene.index.BinaryDocValues} that collects the
 * distinct values of a multi-valued field and encodes them into one binary payload per document.
 */
private static final class MultiValuedBinaryDocValuesField extends CustomDocValuesField {

    // insertion-ordered so values are written back in the order they were parsed
    private final Set<BytesRef> values = new LinkedHashSet<>();
    // running total of the raw bytes of every distinct value, tracked incrementally to presize the output
    private int totalValueBytes = 0;

    MultiValuedBinaryDocValuesField(String name) {
        super(name);
    }

    /**
     * Adds {@code value} to this document's set of values; duplicates are ignored.
     * NOTE(review): the {@link BytesRef} is retained as-is — if callers reuse the backing array
     * before the document is flushed, a {@code BytesRef.deepCopyOf} may be needed; verify with callers.
     */
    public void add(final BytesRef value) {
        if (values.add(value)) {
            totalValueBytes += value.length;
        }
    }

    /**
     * Encodes the collection of binary doc values as a single contiguous binary array, wrapped in
     * {@link BytesRef}. The array takes the form
     * [doc value count][length of value 1][value 1][length of value 2][value 2]...
     * The count and lengths are VInt-encoded, so the sizing below is only an initial capacity
     * estimate; the output grows if it is exceeded.
     */
    @Override
    public BytesRef binaryValue() {
        final int count = values.size();
        // reserve one Integer.BYTES slot per length prefix, plus one for the leading count
        final int estimatedSize = totalValueBytes + (count + 1) * Integer.BYTES;

        try (BytesStreamOutput out = new BytesStreamOutput(estimatedSize)) {
            out.writeVInt(count);
            for (BytesRef value : values) {
                out.writeVInt(value.length);
                out.writeBytes(value.bytes, value.offset, value.length);
            }
            return out.bytes().toBytesRef();
        } catch (IOException e) {
            throw new ElasticsearchException("Failed to get binary value", e);
        }
    }

}
}
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ public void testSynthesizeArrayIgnoreAbove() throws Exception {
.endObject()
.endObject()
.endObject();

// Note values that would be ignored are added at the end of arrays,
// this makes testing easier as ignored values are always synthesized after regular values:
var arrayValues = new Object[][] {
Expand All @@ -60,7 +61,16 @@ public void testSynthesizeArrayIgnoreAbove() throws Exception {
new Object[] { "123", "1234", "12345" },
new Object[] { null, null, null, "blabla" },
new Object[] { "1", "2", "3", "blabla" } };
verifySyntheticArray(arrayValues, mapping, "_id", "field._original");

// values in the original array should be deduplicated
var expectedArrayValues = new Object[][] {
new Object[] { null, "a", "ab", "abc", "abcd", null, "abcde" },
new Object[] { "12345" },
new Object[] { "123", "1234", "12345" },
new Object[] { null, null, null, "blabla" },
new Object[] { "1", "2", "3", "blabla" } };

verifySyntheticArray(arrayValues, expectedArrayValues, mapping, "_id");
}

public void testSynthesizeObjectArray() throws Exception {
Expand Down
Loading