From 78c6dabb725d374dac80a4bcfda42c97d6bf14b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Fern=C3=A1ndez=20Casta=C3=B1o?= Date: Wed, 19 Nov 2025 15:41:32 +0100 Subject: [PATCH 1/7] Add new PerFieldStoredFieldsFormat --- server/src/main/java/module-info.java | 8 + .../ES93BloomFilterStoredFieldsFormat.java | 164 ++++----- .../ESLucene90StoredFieldsFormat.java | 47 +++ .../storedfields/ESStoredFieldsFormat.java | 61 ++++ .../ESZstd814StoredFieldsFormat.java | 39 ++ .../FilterESStoredFieldsFormat.java | 39 ++ .../PerFieldStoredFieldsFormat.java | 342 ++++++++++++++++++ ...ex.codec.storedfields.ESStoredFieldsFormat | 3 + ...S93BloomFilterStoredFieldsFormatTests.java | 143 +++++--- .../PerFieldStoredFieldsFormatTests.java | 119 ++++++ 10 files changed, 823 insertions(+), 142 deletions(-) create mode 100644 server/src/main/java/org/elasticsearch/index/codec/storedfields/ESLucene90StoredFieldsFormat.java create mode 100644 server/src/main/java/org/elasticsearch/index/codec/storedfields/ESStoredFieldsFormat.java create mode 100644 server/src/main/java/org/elasticsearch/index/codec/storedfields/ESZstd814StoredFieldsFormat.java create mode 100644 server/src/main/java/org/elasticsearch/index/codec/storedfields/FilterESStoredFieldsFormat.java create mode 100644 server/src/main/java/org/elasticsearch/index/codec/storedfields/PerFieldStoredFieldsFormat.java create mode 100644 server/src/main/resources/META-INF/services/org.elasticsearch.index.codec.storedfields.ESStoredFieldsFormat create mode 100644 server/src/test/java/org/elasticsearch/index/codec/storedfields/PerFieldStoredFieldsFormatTests.java diff --git a/server/src/main/java/module-info.java b/server/src/main/java/module-info.java index 9c5d11e1cf9e1..33d512b046a99 100644 --- a/server/src/main/java/module-info.java +++ b/server/src/main/java/module-info.java @@ -245,6 +245,7 @@ exports org.elasticsearch.index.codec; exports org.elasticsearch.index.codec.tsdb; exports org.elasticsearch.index.codec.bloomfilter; + 
exports org.elasticsearch.index.codec.storedfields; exports org.elasticsearch.index.codec.zstd; exports org.elasticsearch.index.engine; exports org.elasticsearch.index.fielddata; @@ -468,6 +469,13 @@ org.elasticsearch.index.codec.vectors.es93.ES93BinaryQuantizedVectorsFormat, org.elasticsearch.index.codec.vectors.es93.ES93HnswVectorsFormat, org.elasticsearch.index.codec.vectors.es93.ES93HnswBinaryQuantizedVectorsFormat; + provides org.elasticsearch.index.codec.storedfields.ESStoredFieldsFormat + with + org.elasticsearch.index.codec.storedfields.ESZstd814StoredFieldsFormat, + org.elasticsearch.index.codec.storedfields.ESLucene90StoredFieldsFormat, + org.elasticsearch.index.codec.bloomfilter.ES93BloomFilterStoredFieldsFormat; + + uses org.elasticsearch.index.codec.storedfields.ESStoredFieldsFormat; provides org.apache.lucene.codecs.Codec with diff --git a/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormat.java b/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormat.java index 9cbbb9879523a..f547d6cd4b7c1 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormat.java +++ b/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormat.java @@ -11,7 +11,6 @@ import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.FieldsProducer; -import org.apache.lucene.codecs.StoredFieldsFormat; import org.apache.lucene.codecs.StoredFieldsReader; import org.apache.lucene.codecs.StoredFieldsWriter; import org.apache.lucene.index.CorruptIndexException; @@ -40,12 +39,14 @@ import org.elasticsearch.common.util.ByteArray; import org.elasticsearch.core.IOUtils; import org.elasticsearch.core.Nullable; +import org.elasticsearch.index.codec.storedfields.ESStoredFieldsFormat; import java.io.Closeable; import java.io.IOException; import java.util.ArrayList; import java.util.List; import 
java.util.Objects; +import java.util.Set; import java.util.function.IntSupplier; import static org.elasticsearch.index.codec.bloomfilter.BloomFilterHashFunctions.MurmurHash3.hash64; @@ -72,10 +73,14 @@ * be found in {@link BloomFilterMetadata}. * */ -public class ES93BloomFilterStoredFieldsFormat extends StoredFieldsFormat { +public class ES93BloomFilterStoredFieldsFormat extends ESStoredFieldsFormat { public static final String STORED_FIELDS_BLOOM_FILTER_FORMAT_NAME = "ES93BloomFilterStoredFieldsFormat"; public static final String STORED_FIELDS_BLOOM_FILTER_EXTENSION = "sfbf"; public static final String STORED_FIELDS_METADATA_BLOOM_FILTER_EXTENSION = "sfbfm"; + private static final Set FILE_EXTENSIONS = Set.of( + STORED_FIELDS_METADATA_BLOOM_FILTER_EXTENSION, + STORED_FIELDS_BLOOM_FILTER_FORMAT_NAME + ); private static final int VERSION_START = 0; private static final int VERSION_CURRENT = VERSION_START; @@ -85,24 +90,26 @@ public class ES93BloomFilterStoredFieldsFormat extends StoredFieldsFormat { private static final byte BLOOM_FILTER_STORED = 1; private static final byte BLOOM_FILTER_NOT_STORED = 0; private static final ByteSizeValue MAX_BLOOM_FILTER_SIZE = ByteSizeValue.ofMb(8); + private static final String DEFAULT_SEGMENT_SUFFIX = ""; + public static final ByteSizeValue DEFAULT_BLOOM_FILTER_SIZE = ByteSizeValue.ofKb(2); private final BigArrays bigArrays; - private final String segmentSuffix; - private final StoredFieldsFormat delegate; private final String bloomFilterFieldName; private final int numHashFunctions; private final int bloomFilterSizeInBits; - public ES93BloomFilterStoredFieldsFormat( - BigArrays bigArrays, - String segmentSuffix, - StoredFieldsFormat delegate, - ByteSizeValue bloomFilterSize, - String bloomFilterFieldName - ) { + // Public constructor SPI use for reads only + public ES93BloomFilterStoredFieldsFormat() { + super(STORED_FIELDS_BLOOM_FILTER_FORMAT_NAME); + bigArrays = null; + bloomFilterFieldName = null; + numHashFunctions = 0; + 
bloomFilterSizeInBits = 0; + } + + public ES93BloomFilterStoredFieldsFormat(BigArrays bigArrays, ByteSizeValue bloomFilterSize, String bloomFilterFieldName) { + super(STORED_FIELDS_BLOOM_FILTER_FORMAT_NAME); this.bigArrays = bigArrays; - this.segmentSuffix = segmentSuffix; - this.delegate = delegate; this.bloomFilterFieldName = bloomFilterFieldName; this.numHashFunctions = DEFAULT_NUM_HASH_FUNCTIONS; @@ -115,29 +122,28 @@ public ES93BloomFilterStoredFieldsFormat( @Override public StoredFieldsReader fieldsReader(Directory directory, SegmentInfo si, FieldInfos fn, IOContext context) throws IOException { - return new Reader(directory, si, fn, context, segmentSuffix, delegate.fieldsReader(directory, si, fn, context)); + return new Reader(directory, si, fn, context); } @Override public StoredFieldsWriter fieldsWriter(Directory directory, SegmentInfo si, IOContext context) throws IOException { + assert bigArrays != null; + assert bloomFilterFieldName != null; + assert numHashFunctions > 0; + assert bloomFilterSizeInBits > 0; // TODO: compute the bloom filter size based on heuristics and oversize factor - return new Writer( - directory, - si, - context, - segmentSuffix, - bigArrays, - numHashFunctions, - this::getBloomFilterSizeInBits, - bloomFilterFieldName, - delegate.fieldsWriter(directory, si, context) - ); + return new Writer(directory, si, context, bigArrays, numHashFunctions, this::getBloomFilterSizeInBits, bloomFilterFieldName); } int getBloomFilterSizeInBits() { return bloomFilterSizeInBits; } + @Override + protected Set getFileExtensions() { + return FILE_EXTENSIONS; + } + static int closestPowerOfTwoBloomFilterSizeInBits(ByteSizeValue bloomFilterSize) { var closestPowerOfTwoBloomFilterSizeInBytes = Long.highestOneBit(bloomFilterSize.getBytes()); if (closestPowerOfTwoBloomFilterSizeInBytes > MAX_BLOOM_FILTER_SIZE.getBytes()) { @@ -157,12 +163,10 @@ static class Writer extends StoredFieldsWriter { private final Directory directory; private final SegmentInfo 
segmentInfo; private final IOContext context; - private final String segmentSuffix; private final BigArrays bigArrays; private final IntSupplier defaultBloomFilterSizeInBitsSupplier; private final int numHashFunctions; private final String bloomFilterFieldName; - private final StoredFieldsWriter delegateWriter; private final List toClose = new ArrayList<>(); private final IndexOutput metadataOut; @@ -172,17 +176,14 @@ static class Writer extends StoredFieldsWriter { Directory directory, SegmentInfo segmentInfo, IOContext context, - String segmentSuffix, BigArrays bigArrays, int numHashFunctions, IntSupplier defaultBloomFilterSizeInBitsSupplier, - String bloomFilterFieldName, - StoredFieldsWriter delegateWriter + String bloomFilterFieldName ) throws IOException { this.directory = directory; this.segmentInfo = segmentInfo; this.context = context; - this.segmentSuffix = segmentSuffix; this.bigArrays = bigArrays; this.defaultBloomFilterSizeInBitsSupplier = defaultBloomFilterSizeInBitsSupplier; assert numHashFunctions <= PRIMES.length @@ -191,19 +192,16 @@ static class Writer extends StoredFieldsWriter { this.numHashFunctions = numHashFunctions; this.bloomFilterFieldName = bloomFilterFieldName; - this.delegateWriter = delegateWriter; - toClose.add(delegateWriter); - boolean success = false; try { - metadataOut = directory.createOutput(bloomFilterMetadataFileName(segmentInfo, segmentSuffix), context); + metadataOut = directory.createOutput(bloomFilterMetadataFileName(segmentInfo), context); toClose.add(metadataOut); CodecUtil.writeIndexHeader( metadataOut, STORED_FIELDS_BLOOM_FILTER_FORMAT_NAME, VERSION_CURRENT, segmentInfo.getId(), - segmentSuffix + DEFAULT_SEGMENT_SUFFIX ); success = true; @@ -215,55 +213,43 @@ static class Writer extends StoredFieldsWriter { } @Override - public void startDocument() throws IOException { - delegateWriter.startDocument(); + public void startDocument() { + } @Override - public void finishDocument() throws IOException { - 
delegateWriter.finishDocument(); + public void finishDocument() { + } @Override public void writeField(FieldInfo info, int value) throws IOException { - if (isBloomFilterField(info) == false) { - delegateWriter.writeField(info, value); - } + throwUnsupported(info, "int"); } @Override public void writeField(FieldInfo info, long value) throws IOException { - if (isBloomFilterField(info) == false) { - delegateWriter.writeField(info, value); - } + throwUnsupported(info, "long"); } @Override public void writeField(FieldInfo info, float value) throws IOException { - if (isBloomFilterField(info) == false) { - delegateWriter.writeField(info, value); - } + throwUnsupported(info, "float"); } @Override public void writeField(FieldInfo info, double value) throws IOException { - if (isBloomFilterField(info) == false) { - delegateWriter.writeField(info, value); - } + throwUnsupported(info, "double"); } @Override public void writeField(FieldInfo info, StoredFieldDataInput value) throws IOException { - if (isBloomFilterField(info) == false) { - delegateWriter.writeField(info, value); - } + throwUnsupported(info, "StoredFieldDataInput"); } @Override public void writeField(FieldInfo info, String value) throws IOException { - if (isBloomFilterField(info) == false) { - delegateWriter.writeField(info, value); - } + throwUnsupported(info, "String"); } @Override @@ -271,10 +257,16 @@ public void writeField(FieldInfo info, BytesRef value) throws IOException { if (isBloomFilterField(info)) { addToBloomFilter(info, value); } else { - delegateWriter.writeField(info, value); + throw new IllegalArgumentException("Bloom filter field [" + info.name + "] is not supported"); } } + private void throwUnsupported(FieldInfo info, String dataType) { + throw new UnsupportedOperationException( + "writeField operation not supported for field '" + info.name + "' with type " + dataType + ); + } + private boolean isBloomFilterField(FieldInfo info) { return (bloomFilterWriter != null && 
bloomFilterWriter.fieldInfo.getFieldNumber() == info.getFieldNumber()) || info.getName().equals(bloomFilterFieldName); @@ -295,7 +287,6 @@ private void addToBloomFilter(FieldInfo info, BytesRef value) throws IOException @Override public void finish(int numDocs) throws IOException { finishBloomFilterStoredFormat(); - delegateWriter.finish(numDocs); } private void finishBloomFilterStoredFormat() throws IOException { @@ -318,7 +309,7 @@ public int merge(MergeState mergeState) throws IOException { rebuildBloomFilterFromSegments(mergeState); } finishBloomFilterStoredFormat(); - return delegateWriter.merge(mergeState); + return 0; } private void mergeOptimized(MergeState mergeState) throws IOException { @@ -432,7 +423,7 @@ public void close() throws IOException { @Override public long ramBytesUsed() { - return bloomFilterWriter == null ? 0 : bloomFilterWriter.buffer.ramBytesUsed() + delegateWriter.ramBytesUsed(); + return bloomFilterWriter == null ? 0 : bloomFilterWriter.buffer.ramBytesUsed(); } private void maybeInitializeBloomFilterWriter(FieldInfo fieldInfo, int bitSetSizeInBits) throws IOException { @@ -466,7 +457,7 @@ class BloomFilterWriter implements Closeable { this.bitSetSizeInBytes = bitsetSizeInBits / Byte.SIZE; this.buffer = bigArrays.newByteArray(bitSetSizeInBytes, false); this.hashes = new int[numHashFunctions]; - this.bloomFilterDataOut = directory.createOutput(bloomFilterFileName(segmentInfo, segmentSuffix), context); + this.bloomFilterDataOut = directory.createOutput(bloomFilterFileName(segmentInfo), context); boolean success = false; try { @@ -475,7 +466,7 @@ class BloomFilterWriter implements Closeable { STORED_FIELDS_BLOOM_FILTER_FORMAT_NAME, VERSION_CURRENT, segmentInfo.getId(), - segmentSuffix + DEFAULT_SEGMENT_SUFFIX ); success = true; } finally { @@ -565,26 +556,9 @@ public void close() throws IOException { private static class Reader extends StoredFieldsReader implements BloomFilterProvider { @Nullable private final BloomFilterFieldReader 
bloomFilterFieldReader; - private final StoredFieldsReader delegateReader; - Reader( - Directory directory, - SegmentInfo si, - FieldInfos fn, - IOContext context, - String segmentSuffix, - StoredFieldsReader delegateReader - ) throws IOException { - this.delegateReader = delegateReader; - var success = false; - try { - bloomFilterFieldReader = BloomFilterFieldReader.open(directory, si, fn, context, segmentSuffix); - success = true; - } finally { - if (success == false) { - delegateReader.close(); - } - } + Reader(Directory directory, SegmentInfo si, FieldInfos fn, IOContext context) throws IOException { + bloomFilterFieldReader = BloomFilterFieldReader.open(directory, si, fn, context); } @Override @@ -597,17 +571,16 @@ public void checkIntegrity() throws IOException { if (bloomFilterFieldReader != null) { bloomFilterFieldReader.checkIntegrity(); } - delegateReader.checkIntegrity(); } @Override public void close() throws IOException { - IOUtils.close(bloomFilterFieldReader, delegateReader); + IOUtils.close(bloomFilterFieldReader); } @Override public void document(int docID, StoredFieldVisitor visitor) throws IOException { - delegateReader.document(docID, visitor); + // TODO: read synthetic _id from doc values } @Override @@ -650,18 +623,17 @@ static class BloomFilterFieldReader implements BloomFilter { private final int[] hashes; @Nullable - static BloomFilterFieldReader open(Directory directory, SegmentInfo si, FieldInfos fn, IOContext context, String segmentSuffix) - throws IOException { + static BloomFilterFieldReader open(Directory directory, SegmentInfo si, FieldInfos fn, IOContext context) throws IOException { List toClose = new ArrayList<>(); boolean success = false; - try (var metaInput = directory.openChecksumInput(bloomFilterMetadataFileName(si, segmentSuffix))) { + try (var metaInput = directory.openChecksumInput(bloomFilterMetadataFileName(si))) { var metadataVersion = CodecUtil.checkIndexHeader( metaInput, STORED_FIELDS_BLOOM_FILTER_FORMAT_NAME, 
VERSION_START, VERSION_CURRENT, si.getId(), - segmentSuffix + DEFAULT_SEGMENT_SUFFIX ); var hasBloomFilter = metaInput.readByte() == BLOOM_FILTER_STORED; if (hasBloomFilter == false) { @@ -670,7 +642,7 @@ static BloomFilterFieldReader open(Directory directory, SegmentInfo si, FieldInf BloomFilterMetadata bloomFilterMetadata = BloomFilterMetadata.readFrom(metaInput, fn); CodecUtil.checkFooter(metaInput); - IndexInput bloomFilterData = directory.openInput(bloomFilterFileName(si, segmentSuffix), context); + IndexInput bloomFilterData = directory.openInput(bloomFilterFileName(si), context); toClose.add(bloomFilterData); var bloomFilterDataVersion = CodecUtil.checkIndexHeader( bloomFilterData, @@ -678,7 +650,7 @@ static BloomFilterFieldReader open(Directory directory, SegmentInfo si, FieldInf VERSION_START, VERSION_CURRENT, si.getId(), - segmentSuffix + DEFAULT_SEGMENT_SUFFIX ); if (metadataVersion != bloomFilterDataVersion) { @@ -767,12 +739,12 @@ private static boolean isPowerOfTwo(int value) { return (value & (value - 1)) == 0; } - private static String bloomFilterMetadataFileName(SegmentInfo segmentInfo, String segmentSuffix) { - return IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, STORED_FIELDS_METADATA_BLOOM_FILTER_EXTENSION); + private static String bloomFilterMetadataFileName(SegmentInfo segmentInfo) { + return IndexFileNames.segmentFileName(segmentInfo.name, DEFAULT_SEGMENT_SUFFIX, STORED_FIELDS_METADATA_BLOOM_FILTER_EXTENSION); } - private static String bloomFilterFileName(SegmentInfo segmentInfo, String segmentSuffix) { - return IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, STORED_FIELDS_BLOOM_FILTER_EXTENSION); + private static String bloomFilterFileName(SegmentInfo segmentInfo) { + return IndexFileNames.segmentFileName(segmentInfo.name, DEFAULT_SEGMENT_SUFFIX, STORED_FIELDS_BLOOM_FILTER_EXTENSION); } public interface BloomFilter extends Closeable { diff --git 
a/server/src/main/java/org/elasticsearch/index/codec/storedfields/ESLucene90StoredFieldsFormat.java b/server/src/main/java/org/elasticsearch/index/codec/storedfields/ESLucene90StoredFieldsFormat.java new file mode 100644 index 0000000000000..2b639da2627ff --- /dev/null +++ b/server/src/main/java/org/elasticsearch/index/codec/storedfields/ESLucene90StoredFieldsFormat.java @@ -0,0 +1,47 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.index.codec.storedfields; + +import org.apache.lucene.codecs.lucene103.Lucene103Codec; +import org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat; +import org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingStoredFieldsWriter; + +import java.util.Set; + +/** + * Simple wrapper for Lucene90StoredFieldsFormat that allows it to be loaded through SPI + */ +public class ESLucene90StoredFieldsFormat extends FilterESStoredFieldsFormat { + public static final Set FILE_EXTENSIONS = Set.of( + Lucene90CompressingStoredFieldsWriter.FIELDS_EXTENSION, + Lucene90CompressingStoredFieldsWriter.INDEX_EXTENSION, + Lucene90CompressingStoredFieldsWriter.META_EXTENSION + ); + + public ESLucene90StoredFieldsFormat() { + this(Lucene103Codec.Mode.BEST_SPEED); + } + + public ESLucene90StoredFieldsFormat(Lucene103Codec.Mode mode) { + super( + "ESLucene90StoredFieldsFormat", + new Lucene90StoredFieldsFormat( + mode == Lucene103Codec.Mode.BEST_COMPRESSION + ? 
Lucene90StoredFieldsFormat.Mode.BEST_COMPRESSION + : Lucene90StoredFieldsFormat.Mode.BEST_SPEED + ) + ); + } + + @Override + protected Set getFileExtensions() { + return FILE_EXTENSIONS; + } +} diff --git a/server/src/main/java/org/elasticsearch/index/codec/storedfields/ESStoredFieldsFormat.java b/server/src/main/java/org/elasticsearch/index/codec/storedfields/ESStoredFieldsFormat.java new file mode 100644 index 0000000000000..369f6a9bb029e --- /dev/null +++ b/server/src/main/java/org/elasticsearch/index/codec/storedfields/ESStoredFieldsFormat.java @@ -0,0 +1,61 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.index.codec.storedfields; + +import org.apache.lucene.codecs.StoredFieldsFormat; +import org.apache.lucene.util.NamedSPILoader; + +import java.util.Set; + +/** + * A {@link StoredFieldsFormat} that can be loaded via SPI and provides a name for identification. + * This is required because {@link PerFieldStoredFieldsFormat} uses SPI to load stored field formats + * when reading fields. + */ +public abstract class ESStoredFieldsFormat extends StoredFieldsFormat implements NamedSPILoader.NamedSPI { + private static final class Holder { + public static final NamedSPILoader LOADER = new NamedSPILoader<>(ESStoredFieldsFormat.class); + + private Holder() {} + + static NamedSPILoader getLoader() { + if (LOADER == null) { + throw new IllegalStateException( + "You tried to lookup a ESStoredFieldsFormat by name before all formats could be initialized." 
+ ); + } + return LOADER; + } + } + + public static ESStoredFieldsFormat forName(String name) { + return Holder.getLoader().lookup(name); + } + + /** + * Unique name that's used to retrieve this format when reading the index. + */ + private final String name; + + protected ESStoredFieldsFormat(String name) { + NamedSPILoader.checkServiceName(name); + this.name = name; + } + + @Override + public String getName() { + return name; + } + + /** + * Returns the set of file fileExtensions that this stored fields format would write to disk. + */ + protected abstract Set getFileExtensions(); +} diff --git a/server/src/main/java/org/elasticsearch/index/codec/storedfields/ESZstd814StoredFieldsFormat.java b/server/src/main/java/org/elasticsearch/index/codec/storedfields/ESZstd814StoredFieldsFormat.java new file mode 100644 index 0000000000000..084fc2d2c8f8b --- /dev/null +++ b/server/src/main/java/org/elasticsearch/index/codec/storedfields/ESZstd814StoredFieldsFormat.java @@ -0,0 +1,39 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.index.codec.storedfields; + +import org.apache.lucene.codecs.lucene90.compressing.Lucene90CompressingStoredFieldsWriter; +import org.elasticsearch.index.codec.zstd.Zstd814StoredFieldsFormat; + +import java.util.Set; + +/** + * Simple wrapper for Lucene90StoredFieldsFormat that uses zstd for compression. Allowing to be loaded through SPI. 
+ */ +public class ESZstd814StoredFieldsFormat extends FilterESStoredFieldsFormat { + public static final Set FILE_EXTENSIONS = Set.of( + Lucene90CompressingStoredFieldsWriter.FIELDS_EXTENSION, + Lucene90CompressingStoredFieldsWriter.INDEX_EXTENSION, + Lucene90CompressingStoredFieldsWriter.META_EXTENSION + ); + + public ESZstd814StoredFieldsFormat() { + this(Zstd814StoredFieldsFormat.Mode.BEST_SPEED); + } + + public ESZstd814StoredFieldsFormat(Zstd814StoredFieldsFormat.Mode mode) { + super("ESZstd814StoredFieldsFormat", mode.getFormat()); + } + + @Override + protected Set getFileExtensions() { + return FILE_EXTENSIONS; + } +} diff --git a/server/src/main/java/org/elasticsearch/index/codec/storedfields/FilterESStoredFieldsFormat.java b/server/src/main/java/org/elasticsearch/index/codec/storedfields/FilterESStoredFieldsFormat.java new file mode 100644 index 0000000000000..a3a154095e1ad --- /dev/null +++ b/server/src/main/java/org/elasticsearch/index/codec/storedfields/FilterESStoredFieldsFormat.java @@ -0,0 +1,39 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". 
+ */ + +package org.elasticsearch.index.codec.storedfields; + +import org.apache.lucene.codecs.StoredFieldsFormat; +import org.apache.lucene.codecs.StoredFieldsReader; +import org.apache.lucene.codecs.StoredFieldsWriter; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; + +import java.io.IOException; + +abstract class FilterESStoredFieldsFormat extends ESStoredFieldsFormat { + private final StoredFieldsFormat delegate; + + FilterESStoredFieldsFormat(String name, StoredFieldsFormat delegate) { + super(name); + this.delegate = delegate; + } + + @Override + public StoredFieldsReader fieldsReader(Directory directory, SegmentInfo si, FieldInfos fn, IOContext context) throws IOException { + return delegate.fieldsReader(directory, si, fn, context); + } + + @Override + public StoredFieldsWriter fieldsWriter(Directory directory, SegmentInfo si, IOContext context) throws IOException { + return delegate.fieldsWriter(directory, si, context); + } +} diff --git a/server/src/main/java/org/elasticsearch/index/codec/storedfields/PerFieldStoredFieldsFormat.java b/server/src/main/java/org/elasticsearch/index/codec/storedfields/PerFieldStoredFieldsFormat.java new file mode 100644 index 0000000000000..cf3abf9ef52a8 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/index/codec/storedfields/PerFieldStoredFieldsFormat.java @@ -0,0 +1,342 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". 
+ */ + +package org.elasticsearch.index.codec.storedfields; + +import org.apache.lucene.codecs.StoredFieldsFormat; +import org.apache.lucene.codecs.StoredFieldsReader; +import org.apache.lucene.codecs.StoredFieldsWriter; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.MergeState; +import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.index.StoredFieldVisitor; +import org.apache.lucene.internal.hppc.IntObjectHashMap; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.common.util.Maps; +import org.elasticsearch.common.util.set.Sets; +import org.elasticsearch.core.IOUtils; +import org.elasticsearch.core.Nullable; + +import java.io.Closeable; +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import java.util.Objects; +import java.util.Set; + +/** + * Enables per field stored fields format support. + * + *

This class uses SPI to resolve format names.

+ * + *

Files written by each stored fields format should use different file extensions; this is enforced during writer creation.

+ */ +public abstract class PerFieldStoredFieldsFormat extends StoredFieldsFormat { + public static final String STORED_FIELD_FORMAT_ATTRIBUTE_KEY = "stored_field_format"; + + @Override + public StoredFieldsReader fieldsReader(Directory directory, SegmentInfo si, FieldInfos fn, IOContext context) throws IOException { + return new PerFieldStoredFieldsReader(directory, si, fn, context); + } + + @Override + public StoredFieldsWriter fieldsWriter(Directory directory, SegmentInfo si, IOContext context) throws IOException { + return new PerFieldStoredFieldsWriter(directory, si, context); + } + + protected abstract ESStoredFieldsFormat getStoredFieldsFormatForField(String field); + + class PerFieldStoredFieldsWriter extends StoredFieldsWriter { + private final IntObjectHashMap fields = new IntObjectHashMap<>(); + private final Map formatWriters = new HashMap<>(); + + private final Directory directory; + private final SegmentInfo si; + private final IOContext context; + + private int numStartedDocs = 0; + private int numFinishedDocs = 0; + + PerFieldStoredFieldsWriter(Directory directory, SegmentInfo si, IOContext context) { + this.directory = directory; + this.si = si; + this.context = context; + } + + @Override + public void startDocument() throws IOException { + for (var writerAndExtensions : formatWriters.values()) { + writerAndExtensions.writer().startDocument(); + } + numStartedDocs++; + } + + @Override + public void finishDocument() throws IOException { + for (var writerAndExtensions : formatWriters.values()) { + writerAndExtensions.writer().finishDocument(); + } + numFinishedDocs++; + } + + @Override + public void writeField(FieldInfo info, int value) throws IOException { + getWriterForField(info).writeField(info, value); + } + + @Override + public void writeField(FieldInfo info, long value) throws IOException { + getWriterForField(info).writeField(info, value); + } + + @Override + public void writeField(FieldInfo info, float value) throws IOException { + 
getWriterForField(info).writeField(info, value); + } + + @Override + public void writeField(FieldInfo info, double value) throws IOException { + getWriterForField(info).writeField(info, value); + } + + @Override + public void writeField(FieldInfo info, BytesRef value) throws IOException { + getWriterForField(info).writeField(info, value); + } + + @Override + public void writeField(FieldInfo info, String value) throws IOException { + getWriterForField(info).writeField(info, value); + } + + @Override + public void finish(int numDocs) throws IOException { + for (var writerAndExtensions : formatWriters.values()) { + writerAndExtensions.writer().finish(numDocs); + } + } + + @Override + public int merge(MergeState mergeState) throws IOException { + Map formatWriters = new HashMap<>(); + for (FieldInfo mergeFieldInfo : mergeState.mergeFieldInfos) { + var writerAndMetadata = getWriterAndMetadataForField(mergeFieldInfo); + formatWriters.put(writerAndMetadata.formatName(), writerAndMetadata.writer()); + } + + var totalDocs = 0; + for (Map.Entry formatNameAndWriter : formatWriters.entrySet()) { + final String writerFormatName = formatNameAndWriter.getKey(); + final StoredFieldsWriter formatWriter = formatNameAndWriter.getValue(); + StoredFieldsReader[] updatedReaders = new StoredFieldsReader[mergeState.storedFieldsReaders.length]; + for (int i = 0; i < mergeState.storedFieldsReaders.length; i++) { + final StoredFieldsReader storedFieldsReader = mergeState.storedFieldsReaders[i]; + + // We need to unwrap the stored field readers belonging to a PerFieldStoredFieldsFormat, + // otherwise, downstream formats won't be able to perform certain optimizations when + // they try to merge segments as they expect an instance of the actual Reader in their checks + // (i.e. 
Lucene90CompressingStoredFieldsReader would do chunk merging for instances of the same class) + if (storedFieldsReader instanceof PerFieldStoredFieldsReader reader) { + final var formatStoredFieldsReader = reader.getFormatToStoredFieldReaders().get(writerFormatName); + // In case that we're dealing with a previous format, we just fall back to the slow path + updatedReaders[i] = Objects.requireNonNullElse(formatStoredFieldsReader, storedFieldsReader); + } else { + updatedReaders[i] = storedFieldsReader; + } + } + + var updatedMergeState = new MergeState( + mergeState.docMaps, + mergeState.segmentInfo, + mergeState.mergeFieldInfos, + updatedReaders, + mergeState.termVectorsReaders, + mergeState.normsProducers, + mergeState.docValuesProducers, + mergeState.fieldInfos, + mergeState.liveDocs, + mergeState.fieldsProducers, + mergeState.pointsReaders, + mergeState.knnVectorsReaders, + mergeState.maxDocs, + mergeState.infoStream, + mergeState.intraMergeTaskExecutor, + mergeState.needsIndexSort + ); + + totalDocs += formatWriter.merge(updatedMergeState); + } + return totalDocs; + } + + @Override + public void close() throws IOException { + IOUtils.close(formatWriters.values()); + } + + @Override + public long ramBytesUsed() { + long ramBytesUsed = 0; + for (var writer : formatWriters.values()) { + ramBytesUsed += writer.writer().ramBytesUsed(); + } + return ramBytesUsed; + } + + private StoredFieldsWriter getWriterForField(FieldInfo field) throws IOException { + return getWriterAndMetadataForField(field).writer; + } + + private StoredFieldsWriterAndMetadata getWriterAndMetadataForField(FieldInfo field) throws IOException { + var writer = fields.get(field.number); + if (writer != null) { + return writer; + } + + var format = getStoredFieldsFormatForField(field.name); + + if (format == null) { + throw new IllegalStateException("invalid null StoredFieldsFormat for field=\"" + field.name + "\""); + } + + var formatWriter = formatWriters.get(format); + if (formatWriter == null) 
{ + for (StoredFieldsWriterAndMetadata value : formatWriters.values()) { + if (Sets.intersection(value.fileExtensions(), format.getFileExtensions()).isEmpty() == false) { + throw new IllegalStateException( + "File extension conflict for field '" + + field.name + + "': format " + + format.getName() + + " has overlapping fileExtensions with existing format" + ); + } + } + formatWriter = new StoredFieldsWriterAndMetadata( + format.getName(), + format.getFileExtensions(), + format.fieldsWriter(directory, si, context) + ); + + // Ensure that the doc count is consistent so when #finish is called + // all formats have a consistent doc count + for (int i = 0; i < numStartedDocs; i++) { + formatWriter.writer().startDocument(); + } + for (int i = 0; i < numFinishedDocs; i++) { + formatWriter.writer().finishDocument(); + } + + var previous = formatWriters.put(format, formatWriter); + assert previous == null; + } + fields.put(field.number, formatWriter); + field.putAttribute(STORED_FIELD_FORMAT_ATTRIBUTE_KEY, format.getName()); + + return formatWriter; + } + } + + record StoredFieldsWriterAndMetadata(String formatName, Set fileExtensions, StoredFieldsWriter writer) implements Closeable { + @Override + public void close() throws IOException { + writer.close(); + } + } + + public static class PerFieldStoredFieldsReader extends StoredFieldsReader { + private final Map formatToStoredFieldReaders; + private final Map fieldToFormat; + + PerFieldStoredFieldsReader(Directory directory, SegmentInfo si, FieldInfos fn, IOContext context) throws IOException { + HashMap formatStoredFieldReaders = new HashMap<>(); + HashMap fieldToFormat = new HashMap<>(); + boolean success = false; + try { + for (FieldInfo fi : fn) { + final String formatName = fi.getAttribute(STORED_FIELD_FORMAT_ATTRIBUTE_KEY); + // Can the format name be null if we're reading a segment from this codec? 
+ if (formatName != null) { + var storedFieldsReader = formatStoredFieldReaders.get(formatName); + if (storedFieldsReader == null) { + ESStoredFieldsFormat format = ESStoredFieldsFormat.forName(formatName); + storedFieldsReader = format.fieldsReader(directory, si, fn, context); + var previous = formatStoredFieldReaders.put(formatName, storedFieldsReader); + assert previous == null; + } + fieldToFormat.put(fi.name, formatName); + } + } + success = true; + } finally { + if (success == false) { + IOUtils.close(formatStoredFieldReaders.values()); + } + } + this.formatToStoredFieldReaders = Collections.unmodifiableMap(formatStoredFieldReaders); + this.fieldToFormat = Collections.unmodifiableMap(fieldToFormat); + } + + PerFieldStoredFieldsReader(Map formatToStoredFieldReaders, Map fieldToFormat) { + this.formatToStoredFieldReaders = Collections.unmodifiableMap(formatToStoredFieldReaders); + this.fieldToFormat = Collections.unmodifiableMap(fieldToFormat); + } + + @Override + public StoredFieldsReader clone() { + Map clonedFormats = Maps.newMapWithExpectedSize(formatToStoredFieldReaders.size()); + for (Map.Entry entry : formatToStoredFieldReaders.entrySet()) { + clonedFormats.put(entry.getKey(), entry.getValue().clone()); + } + return new PerFieldStoredFieldsReader(clonedFormats, fieldToFormat); + } + + @Override + public StoredFieldsReader getMergeInstance() { + Map mergeFormats = Maps.newMapWithExpectedSize(formatToStoredFieldReaders.size()); + for (Map.Entry entry : formatToStoredFieldReaders.entrySet()) { + mergeFormats.put(entry.getKey(), entry.getValue().getMergeInstance()); + } + return new PerFieldStoredFieldsReader(mergeFormats, fieldToFormat); + } + + @Override + public void checkIntegrity() throws IOException { + for (StoredFieldsReader storedFieldsReader : formatToStoredFieldReaders.values()) { + storedFieldsReader.checkIntegrity(); + } + } + + @Override + public void close() throws IOException { + IOUtils.close(formatToStoredFieldReaders.values()); + } + + 
@Override + public void document(int docID, StoredFieldVisitor visitor) throws IOException { + for (StoredFieldsReader storedFieldsReader : formatToStoredFieldReaders.values()) { + storedFieldsReader.document(docID, visitor); + } + } + + @Nullable + public StoredFieldsReader getReaderForField(String fieldName) { + String formatName = fieldToFormat.get(fieldName); + return formatName != null ? formatToStoredFieldReaders.get(formatName) : null; + } + + private Map getFormatToStoredFieldReaders() { + return formatToStoredFieldReaders; + } + } +} diff --git a/server/src/main/resources/META-INF/services/org.elasticsearch.index.codec.storedfields.ESStoredFieldsFormat b/server/src/main/resources/META-INF/services/org.elasticsearch.index.codec.storedfields.ESStoredFieldsFormat new file mode 100644 index 0000000000000..8687fd881249e --- /dev/null +++ b/server/src/main/resources/META-INF/services/org.elasticsearch.index.codec.storedfields.ESStoredFieldsFormat @@ -0,0 +1,3 @@ +org.elasticsearch.index.codec.storedfields.ESZstd814StoredFieldsFormat +org.elasticsearch.index.codec.storedfields.ESLucene90StoredFieldsFormat +org.elasticsearch.index.codec.bloomfilter.ES93BloomFilterStoredFieldsFormat diff --git a/server/src/test/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormatTests.java index 17bd1c9c13675..d8fdd14ab38eb 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormatTests.java @@ -10,12 +10,13 @@ package org.elasticsearch.index.codec.bloomfilter; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.StoredFieldsFormat; +import org.apache.lucene.codecs.StoredFieldsReader; import 
org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.LongField; import org.apache.lucene.document.StringField; +import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.FilterMergePolicy; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; @@ -29,14 +30,15 @@ import org.apache.lucene.store.IOContext; import org.apache.lucene.tests.analysis.MockAnalyzer; import org.apache.lucene.tests.codecs.asserting.AssertingCodec; -import org.apache.lucene.tests.index.BaseStoredFieldsFormatTestCase; -import org.apache.lucene.tests.util.TestUtil; import org.apache.lucene.util.BytesRef; import org.elasticsearch.common.UUIDs; -import org.elasticsearch.common.logging.LogConfigurator; import org.elasticsearch.common.unit.ByteSizeValue; import org.elasticsearch.common.util.BigArrays; +import org.elasticsearch.index.codec.storedfields.ESLucene90StoredFieldsFormat; +import org.elasticsearch.index.codec.storedfields.ESStoredFieldsFormat; +import org.elasticsearch.index.codec.storedfields.PerFieldStoredFieldsFormat; import org.elasticsearch.index.mapper.IdFieldMapper; +import org.elasticsearch.test.ESTestCase; import java.io.IOException; import java.nio.charset.StandardCharsets; @@ -49,40 +51,22 @@ import static org.hamcrest.Matchers.not; import static org.hamcrest.Matchers.nullValue; -public class ES93BloomFilterStoredFieldsFormatTests extends BaseStoredFieldsFormatTestCase { - - static { - LogConfigurator.loadLog4jPlugins(); - LogConfigurator.configureESLogging(); // native access requires logging to be initialized - } - - @Override - protected Codec getCodec() { - return new AssertingCodec() { - @Override - public StoredFieldsFormat storedFieldsFormat() { - var bloomFilterSizeInKb = atLeast(2); - return new ES93BloomFilterStoredFieldsFormat( - BigArrays.NON_RECYCLING_INSTANCE, - "", - TestUtil.getDefaultCodec().storedFieldsFormat(), - 
ByteSizeValue.ofKb(bloomFilterSizeInKb), - IdFieldMapper.NAME - ); - } - }; - } - - @Override - protected void addRandomFields(Document doc) { - - } - +public class ES93BloomFilterStoredFieldsFormatTests extends ESTestCase { public void testBloomFilterFieldIsNotStoredAndBloomFilterCanBeChecked() throws IOException { try (var directory = newDirectory()) { Analyzer analyzer = new MockAnalyzer(random()); IndexWriterConfig conf = newIndexWriterConfig(analyzer); - conf.setCodec(getCodec()); + var bloomFilterSizeInKb = atLeast(2); + conf.setCodec( + new TestCodec( + IdFieldMapper.NAME, + new ES93BloomFilterStoredFieldsFormat( + BigArrays.NON_RECYCLING_INSTANCE, + ByteSizeValue.ofKb(bloomFilterSizeInKb), + IdFieldMapper.NAME + ) + ) + ); conf.setMergePolicy(newLogMergePolicy()); // We want to have at most 1 segment conf.setMaxBufferedDocs(200); @@ -112,8 +96,6 @@ public StoredFieldsFormat storedFieldsFormat() { var bloomFilterSizeInKb = atLeast(2); return new ES93BloomFilterStoredFieldsFormat( BigArrays.NON_RECYCLING_INSTANCE, - "", - TestUtil.getDefaultCodec().storedFieldsFormat(), ByteSizeValue.ofKb(bloomFilterSizeInKb), IdFieldMapper.NAME ) { @@ -132,6 +114,31 @@ int getBloomFilterSizeInBits() { }; } }); + var bloomFilterSizeInKb = atLeast(2); + conf.setCodec( + new TestCodec( + IdFieldMapper.NAME, + new ES93BloomFilterStoredFieldsFormat( + BigArrays.NON_RECYCLING_INSTANCE, + ByteSizeValue.ofKb(bloomFilterSizeInKb), + IdFieldMapper.NAME + ) { + @Override + int getBloomFilterSizeInBits() { + if (randomBloomFilterSizes) { + // Use different power of 2 values so we rebuild the bloom filter from the _id terms + var bloomFilterSizeInBytes = ByteSizeValue.ofKb(1).getBytes() << atLeast(5); + + return ES93BloomFilterStoredFieldsFormat.closestPowerOfTwoBloomFilterSizeInBits( + ByteSizeValue.ofBytes(bloomFilterSizeInBytes) + ); + } + return super.getBloomFilterSizeInBits(); + } + } + + ) + ); conf.setMergePolicy(new FilterMergePolicy(newLogMergePolicy()) { @Override public 
boolean useCompoundFile(SegmentInfos infos, SegmentCommitInfo mergedInfo, MergeContext mergeContext) { @@ -174,8 +181,7 @@ private static List indexDocs(IndexWriter writer) throws IOException { private void assertBloomFilterTestsPositiveForExistingDocs(IndexWriter writer, List indexedIds) throws IOException { try (var directoryReader = StandardDirectoryReader.open(writer)) { for (LeafReaderContext leaf : directoryReader.leaves()) { - try (ES93BloomFilterStoredFieldsFormat.BloomFilterProvider fieldReader = getBloomFilterProvider(leaf)) { - var bloomFilter = fieldReader.getBloomFilter(); + try (ES93BloomFilterStoredFieldsFormat.BloomFilter bloomFilter = getBloomFilterProvider(leaf)) { // the bloom filter reader is null only if the _id field is not stored during indexing assertThat(bloomFilter, is(not(nullValue()))); @@ -201,20 +207,65 @@ private void assertBloomFilterTestsPositiveForExistingDocs(IndexWriter writer, L } } - private static BytesRef getBytesRefFromString(String random) { - return new BytesRef(random.getBytes(StandardCharsets.UTF_8)); + private static BytesRef getBytesRefFromString(String string) { + return new BytesRef(string.getBytes(StandardCharsets.UTF_8)); } - private ES93BloomFilterStoredFieldsFormat.BloomFilterProvider getBloomFilterProvider(LeafReaderContext leafReaderContext) - throws IOException { + private ES93BloomFilterStoredFieldsFormat.BloomFilter getBloomFilterProvider(LeafReaderContext leafReaderContext) throws IOException { LeafReader reader = leafReaderContext.reader(); - var fieldInfos = reader.getFieldInfos(); + FieldInfos fieldInfos = reader.getFieldInfos(); assertThat(reader, is(instanceOf(SegmentReader.class))); SegmentReader segmentReader = (SegmentReader) reader; SegmentInfo si = segmentReader.getSegmentInfo().info; - var storedFieldsReader = si.getCodec().storedFieldsFormat().fieldsReader(si.dir, si, fieldInfos, IOContext.DEFAULT); - assertThat(storedFieldsReader, 
is(instanceOf(ES93BloomFilterStoredFieldsFormat.BloomFilterProvider.class))); - return ((ES93BloomFilterStoredFieldsFormat.BloomFilterProvider) storedFieldsReader); + StoredFieldsReader storedFieldsReader = si.getCodec().storedFieldsFormat().fieldsReader(si.dir, si, fieldInfos, IOContext.DEFAULT); + + assertThat(storedFieldsReader, is(instanceOf(PerFieldStoredFieldsFormat.PerFieldStoredFieldsReader.class))); + + PerFieldStoredFieldsFormat.PerFieldStoredFieldsReader perFieldStoredFieldsReader = + (PerFieldStoredFieldsFormat.PerFieldStoredFieldsReader) storedFieldsReader; + + StoredFieldsReader bloomFilterReader = perFieldStoredFieldsReader.getReaderForField(IdFieldMapper.NAME); + + assertThat(bloomFilterReader, is(instanceOf(ES93BloomFilterStoredFieldsFormat.BloomFilterProvider.class))); + ES93BloomFilterStoredFieldsFormat.BloomFilterProvider bloomFilterProvider = + (ES93BloomFilterStoredFieldsFormat.BloomFilterProvider) bloomFilterReader; + var bloomFilter = bloomFilterProvider.getBloomFilter(); + // Wrap the reader in a bloom filter so we can close it after we're done with it + return new ES93BloomFilterStoredFieldsFormat.BloomFilter() { + @Override + public boolean mayContainTerm(String field, BytesRef term) throws IOException { + return bloomFilter.mayContainTerm(field, term); + } + + @Override + public void close() throws IOException { + storedFieldsReader.close(); + } + }; + } + + static class TestCodec extends AssertingCodec { + private final String bloomFilterField; + private final ES93BloomFilterStoredFieldsFormat bloomFilterStoredFieldsFormat; + private final ESStoredFieldsFormat defaultStoredFieldsFormat = new ESLucene90StoredFieldsFormat(); + + TestCodec(String bloomFilterField, ES93BloomFilterStoredFieldsFormat bloomFilterStoredFieldsFormat) { + this.bloomFilterField = bloomFilterField; + this.bloomFilterStoredFieldsFormat = bloomFilterStoredFieldsFormat; + } + + @Override + public StoredFieldsFormat storedFieldsFormat() { + return new 
PerFieldStoredFieldsFormat() { + @Override + protected ESStoredFieldsFormat getStoredFieldsFormatForField(String field) { + if (field.equals(bloomFilterField)) { + return bloomFilterStoredFieldsFormat; + } + return defaultStoredFieldsFormat; + } + }; + } } } diff --git a/server/src/test/java/org/elasticsearch/index/codec/storedfields/PerFieldStoredFieldsFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/storedfields/PerFieldStoredFieldsFormatTests.java new file mode 100644 index 0000000000000..3a208295edbf4 --- /dev/null +++ b/server/src/test/java/org/elasticsearch/index/codec/storedfields/PerFieldStoredFieldsFormatTests.java @@ -0,0 +1,119 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". 
+ */ + +package org.elasticsearch.index.codec.storedfields; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.StoredFieldsFormat; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.LongField; +import org.apache.lucene.document.StringField; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.tests.analysis.MockAnalyzer; +import org.apache.lucene.tests.codecs.asserting.AssertingCodec; +import org.apache.lucene.tests.index.BaseStoredFieldsFormatTestCase; +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.common.UUIDs; +import org.elasticsearch.common.logging.LogConfigurator; +import org.elasticsearch.common.unit.ByteSizeValue; +import org.elasticsearch.common.util.BigArrays; +import org.elasticsearch.index.codec.bloomfilter.ES93BloomFilterStoredFieldsFormat; +import org.elasticsearch.index.mapper.IdFieldMapper; + +import java.nio.charset.StandardCharsets; +import java.util.Set; + +import static org.hamcrest.Matchers.equalTo; + +public class PerFieldStoredFieldsFormatTests extends BaseStoredFieldsFormatTestCase { + + static { + LogConfigurator.loadLog4jPlugins(); + LogConfigurator.configureESLogging(); // native access requires logging to be initialized + } + + @Override + protected Codec getCodec() { + var bloomFilterSizeInKb = atLeast(1); + var bloomFilterStoredFieldsFormat = new ES93BloomFilterStoredFieldsFormat( + BigArrays.NON_RECYCLING_INSTANCE, + ByteSizeValue.ofKb(bloomFilterSizeInKb), + IdFieldMapper.NAME + ); + var defaultStoredFields = new ESLucene90StoredFieldsFormat(); + + return new AssertingCodec() { + @Override + public StoredFieldsFormat storedFieldsFormat() { + return new PerFieldStoredFieldsFormat() { + @Override + protected ESStoredFieldsFormat getStoredFieldsFormatForField(String field) { + if 
(field.equals(IdFieldMapper.NAME)) { + return bloomFilterStoredFieldsFormat; + } + return defaultStoredFields; + } + }; + } + }; + } + + public void testConflictingFileExtensionsThrowAnException() throws Exception { + try (var directory = newDirectory()) { + Analyzer analyzer = new MockAnalyzer(random()); + IndexWriterConfig conf = newIndexWriterConfig(analyzer); + var bloomFilterStoredFieldsFormat = new ES93BloomFilterStoredFieldsFormat( + BigArrays.NON_RECYCLING_INSTANCE, + ByteSizeValue.ofKb(1), + IdFieldMapper.NAME + ); + + var defaultStoredFields = new ESLucene90StoredFieldsFormat() { + @Override + protected Set getFileExtensions() { + return Set.of(ES93BloomFilterStoredFieldsFormat.STORED_FIELDS_METADATA_BLOOM_FILTER_EXTENSION); + } + }; + + conf.setCodec(new AssertingCodec() { + @Override + public StoredFieldsFormat storedFieldsFormat() { + return new PerFieldStoredFieldsFormat() { + @Override + protected ESStoredFieldsFormat getStoredFieldsFormatForField(String field) { + if (field.equals(IdFieldMapper.NAME)) { + return bloomFilterStoredFieldsFormat; + } + return defaultStoredFields; + } + }; + } + }); + conf.setMergePolicy(newLogMergePolicy()); + try (IndexWriter writer = new IndexWriter(directory, conf)) { + Document doc = new Document(); + var id = UUIDs.randomBase64UUID(); + doc.add(new StringField(IdFieldMapper.NAME, new BytesRef(id.getBytes(StandardCharsets.UTF_8)), Field.Store.YES)); + doc.add(new StringField("host", "host", Field.Store.YES)); + doc.add(new LongField("counter", 1, Field.Store.YES)); + var exception = expectThrows(IllegalStateException.class, () -> writer.addDocument(doc)); + assertThat( + exception.getMessage(), + equalTo( + "File extension conflict for field 'host': format ESLucene90StoredFieldsFormat " + + "has overlapping fileExtensions with existing format" + ) + ); + } + } + } +} From 83e22ee47ce316c2e53dd3d7335a35fee4a77479 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Fern=C3=A1ndez=20Casta=C3=B1o?= Date: Wed, 19 Nov 
2025 16:03:52 +0100 Subject: [PATCH 2/7] Update docs/changelog/138299.yaml --- docs/changelog/138299.yaml | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 docs/changelog/138299.yaml diff --git a/docs/changelog/138299.yaml b/docs/changelog/138299.yaml new file mode 100644 index 0000000000000..8b2dad06c8eca --- /dev/null +++ b/docs/changelog/138299.yaml @@ -0,0 +1,5 @@ +pr: 138299 +summary: Add `PerFieldStoredFieldsFormat` to allow multiple stored field formats +area: Codec +type: enhancement +issues: [] From 914650a95f6f48978a9c0cea5555872299a738a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Fern=C3=A1ndez=20Casta=C3=B1o?= Date: Thu, 20 Nov 2025 10:26:53 +0100 Subject: [PATCH 3/7] Remove the dependency to NamedSPI --- server/src/main/java/module-info.java | 7 ------ .../ES93BloomFilterStoredFieldsFormat.java | 22 +++++++------------ .../ESLucene90StoredFieldsFormat.java | 3 ++- .../storedfields/ESStoredFieldsFormat.java | 22 +------------------ .../ESZstd814StoredFieldsFormat.java | 3 ++- .../PerFieldStoredFieldsFormat.java | 14 +++++++++++- ...ex.codec.storedfields.ESStoredFieldsFormat | 3 --- 7 files changed, 26 insertions(+), 48 deletions(-) delete mode 100644 server/src/main/resources/META-INF/services/org.elasticsearch.index.codec.storedfields.ESStoredFieldsFormat diff --git a/server/src/main/java/module-info.java b/server/src/main/java/module-info.java index 33d512b046a99..032128b0f2609 100644 --- a/server/src/main/java/module-info.java +++ b/server/src/main/java/module-info.java @@ -469,13 +469,6 @@ org.elasticsearch.index.codec.vectors.es93.ES93BinaryQuantizedVectorsFormat, org.elasticsearch.index.codec.vectors.es93.ES93HnswVectorsFormat, org.elasticsearch.index.codec.vectors.es93.ES93HnswBinaryQuantizedVectorsFormat; - provides org.elasticsearch.index.codec.storedfields.ESStoredFieldsFormat - with - org.elasticsearch.index.codec.storedfields.ESZstd814StoredFieldsFormat, - 
org.elasticsearch.index.codec.storedfields.ESLucene90StoredFieldsFormat, - org.elasticsearch.index.codec.bloomfilter.ES93BloomFilterStoredFieldsFormat; - - uses org.elasticsearch.index.codec.storedfields.ESStoredFieldsFormat; provides org.apache.lucene.codecs.Codec with diff --git a/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormat.java b/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormat.java index f547d6cd4b7c1..224619950baed 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormat.java +++ b/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormat.java @@ -74,12 +74,12 @@ * */ public class ES93BloomFilterStoredFieldsFormat extends ESStoredFieldsFormat { - public static final String STORED_FIELDS_BLOOM_FILTER_FORMAT_NAME = "ES93BloomFilterStoredFieldsFormat"; + public static final String FORMAT_NAME = "ES93BloomFilterStoredFieldsFormat"; public static final String STORED_FIELDS_BLOOM_FILTER_EXTENSION = "sfbf"; public static final String STORED_FIELDS_METADATA_BLOOM_FILTER_EXTENSION = "sfbfm"; private static final Set FILE_EXTENSIONS = Set.of( STORED_FIELDS_METADATA_BLOOM_FILTER_EXTENSION, - STORED_FIELDS_BLOOM_FILTER_FORMAT_NAME + STORED_FIELDS_BLOOM_FILTER_EXTENSION ); private static final int VERSION_START = 0; private static final int VERSION_CURRENT = VERSION_START; @@ -100,7 +100,7 @@ public class ES93BloomFilterStoredFieldsFormat extends ESStoredFieldsFormat { // Public constructor SPI use for reads only public ES93BloomFilterStoredFieldsFormat() { - super(STORED_FIELDS_BLOOM_FILTER_FORMAT_NAME); + super(FORMAT_NAME); bigArrays = null; bloomFilterFieldName = null; numHashFunctions = 0; @@ -108,7 +108,7 @@ public ES93BloomFilterStoredFieldsFormat() { } public ES93BloomFilterStoredFieldsFormat(BigArrays bigArrays, ByteSizeValue bloomFilterSize, String bloomFilterFieldName) { - 
super(STORED_FIELDS_BLOOM_FILTER_FORMAT_NAME); + super(FORMAT_NAME); this.bigArrays = bigArrays; this.bloomFilterFieldName = bloomFilterFieldName; this.numHashFunctions = DEFAULT_NUM_HASH_FUNCTIONS; @@ -196,13 +196,7 @@ static class Writer extends StoredFieldsWriter { try { metadataOut = directory.createOutput(bloomFilterMetadataFileName(segmentInfo), context); toClose.add(metadataOut); - CodecUtil.writeIndexHeader( - metadataOut, - STORED_FIELDS_BLOOM_FILTER_FORMAT_NAME, - VERSION_CURRENT, - segmentInfo.getId(), - DEFAULT_SEGMENT_SUFFIX - ); + CodecUtil.writeIndexHeader(metadataOut, FORMAT_NAME, VERSION_CURRENT, segmentInfo.getId(), DEFAULT_SEGMENT_SUFFIX); success = true; } finally { @@ -463,7 +457,7 @@ class BloomFilterWriter implements Closeable { try { CodecUtil.writeIndexHeader( bloomFilterDataOut, - STORED_FIELDS_BLOOM_FILTER_FORMAT_NAME, + FORMAT_NAME, VERSION_CURRENT, segmentInfo.getId(), DEFAULT_SEGMENT_SUFFIX @@ -629,7 +623,7 @@ static BloomFilterFieldReader open(Directory directory, SegmentInfo si, FieldInf try (var metaInput = directory.openChecksumInput(bloomFilterMetadataFileName(si))) { var metadataVersion = CodecUtil.checkIndexHeader( metaInput, - STORED_FIELDS_BLOOM_FILTER_FORMAT_NAME, + FORMAT_NAME, VERSION_START, VERSION_CURRENT, si.getId(), @@ -646,7 +640,7 @@ static BloomFilterFieldReader open(Directory directory, SegmentInfo si, FieldInf toClose.add(bloomFilterData); var bloomFilterDataVersion = CodecUtil.checkIndexHeader( bloomFilterData, - STORED_FIELDS_BLOOM_FILTER_FORMAT_NAME, + FORMAT_NAME, VERSION_START, VERSION_CURRENT, si.getId(), diff --git a/server/src/main/java/org/elasticsearch/index/codec/storedfields/ESLucene90StoredFieldsFormat.java b/server/src/main/java/org/elasticsearch/index/codec/storedfields/ESLucene90StoredFieldsFormat.java index 2b639da2627ff..b82eedf369c09 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/storedfields/ESLucene90StoredFieldsFormat.java +++ 
b/server/src/main/java/org/elasticsearch/index/codec/storedfields/ESLucene90StoredFieldsFormat.java @@ -24,6 +24,7 @@ public class ESLucene90StoredFieldsFormat extends FilterESStoredFieldsFormat { Lucene90CompressingStoredFieldsWriter.INDEX_EXTENSION, Lucene90CompressingStoredFieldsWriter.META_EXTENSION ); + public static final String FORMAT_NAME = "ESLucene90StoredFieldsFormat"; public ESLucene90StoredFieldsFormat() { this(Lucene103Codec.Mode.BEST_SPEED); @@ -31,7 +32,7 @@ public ESLucene90StoredFieldsFormat() { public ESLucene90StoredFieldsFormat(Lucene103Codec.Mode mode) { super( - "ESLucene90StoredFieldsFormat", + FORMAT_NAME, new Lucene90StoredFieldsFormat( mode == Lucene103Codec.Mode.BEST_COMPRESSION ? Lucene90StoredFieldsFormat.Mode.BEST_COMPRESSION diff --git a/server/src/main/java/org/elasticsearch/index/codec/storedfields/ESStoredFieldsFormat.java b/server/src/main/java/org/elasticsearch/index/codec/storedfields/ESStoredFieldsFormat.java index 369f6a9bb029e..654552ca34028 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/storedfields/ESStoredFieldsFormat.java +++ b/server/src/main/java/org/elasticsearch/index/codec/storedfields/ESStoredFieldsFormat.java @@ -19,26 +19,7 @@ * This is required because {@link PerFieldStoredFieldsFormat} uses SPI to load stored field formats * when reading fields. */ -public abstract class ESStoredFieldsFormat extends StoredFieldsFormat implements NamedSPILoader.NamedSPI { - private static final class Holder { - public static final NamedSPILoader LOADER = new NamedSPILoader<>(ESStoredFieldsFormat.class); - - private Holder() {} - - static NamedSPILoader getLoader() { - if (LOADER == null) { - throw new IllegalStateException( - "You tried to lookup a ESStoredFieldsFormat by name before all formats could be initialized." 
- ); - } - return LOADER; - } - } - - public static ESStoredFieldsFormat forName(String name) { - return Holder.getLoader().lookup(name); - } - +public abstract class ESStoredFieldsFormat extends StoredFieldsFormat { /** * Unique name that's used to retrieve this format when reading the index. */ @@ -49,7 +30,6 @@ protected ESStoredFieldsFormat(String name) { this.name = name; } - @Override public String getName() { return name; } diff --git a/server/src/main/java/org/elasticsearch/index/codec/storedfields/ESZstd814StoredFieldsFormat.java b/server/src/main/java/org/elasticsearch/index/codec/storedfields/ESZstd814StoredFieldsFormat.java index 084fc2d2c8f8b..dce8f0f89b80a 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/storedfields/ESZstd814StoredFieldsFormat.java +++ b/server/src/main/java/org/elasticsearch/index/codec/storedfields/ESZstd814StoredFieldsFormat.java @@ -23,13 +23,14 @@ public class ESZstd814StoredFieldsFormat extends FilterESStoredFieldsFormat { Lucene90CompressingStoredFieldsWriter.INDEX_EXTENSION, Lucene90CompressingStoredFieldsWriter.META_EXTENSION ); + public static final String FORMAT_NAME = "ESZstd814StoredFieldsFormat"; public ESZstd814StoredFieldsFormat() { this(Zstd814StoredFieldsFormat.Mode.BEST_SPEED); } public ESZstd814StoredFieldsFormat(Zstd814StoredFieldsFormat.Mode mode) { - super("ESZstd814StoredFieldsFormat", mode.getFormat()); + super(FORMAT_NAME, mode.getFormat()); } @Override diff --git a/server/src/main/java/org/elasticsearch/index/codec/storedfields/PerFieldStoredFieldsFormat.java b/server/src/main/java/org/elasticsearch/index/codec/storedfields/PerFieldStoredFieldsFormat.java index cf3abf9ef52a8..c2af091b1657c 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/storedfields/PerFieldStoredFieldsFormat.java +++ b/server/src/main/java/org/elasticsearch/index/codec/storedfields/PerFieldStoredFieldsFormat.java @@ -25,6 +25,7 @@ import org.elasticsearch.common.util.set.Sets; import 
org.elasticsearch.core.IOUtils; import org.elasticsearch.core.Nullable; +import org.elasticsearch.index.codec.bloomfilter.ES93BloomFilterStoredFieldsFormat; import java.io.Closeable; import java.io.IOException; @@ -44,6 +45,17 @@ public abstract class PerFieldStoredFieldsFormat extends StoredFieldsFormat { public static final String STORED_FIELD_FORMAT_ATTRIBUTE_KEY = "stored_field_format"; + // We don't need to add support for loading stored fields format through SPI since we control + // all the implementations, thus a simple static map is enough to load them on read time. + private static final Map AVAILABLE_FORMATS = Map.of( + ESLucene90StoredFieldsFormat.FORMAT_NAME, + new ESLucene90StoredFieldsFormat(), + ESZstd814StoredFieldsFormat.FORMAT_NAME, + new ESZstd814StoredFieldsFormat(), + ES93BloomFilterStoredFieldsFormat.FORMAT_NAME, + new ES93BloomFilterStoredFieldsFormat() + ); + @Override public StoredFieldsReader fieldsReader(Directory directory, SegmentInfo si, FieldInfos fn, IOContext context) throws IOException { return new PerFieldStoredFieldsReader(directory, si, fn, context); @@ -269,7 +281,7 @@ public static class PerFieldStoredFieldsReader extends StoredFieldsReader { if (formatName != null) { var storedFieldsReader = formatStoredFieldReaders.get(formatName); if (storedFieldsReader == null) { - ESStoredFieldsFormat format = ESStoredFieldsFormat.forName(formatName); + ESStoredFieldsFormat format = AVAILABLE_FORMATS.get(formatName); storedFieldsReader = format.fieldsReader(directory, si, fn, context); var previous = formatStoredFieldReaders.put(formatName, storedFieldsReader); assert previous == null; diff --git a/server/src/main/resources/META-INF/services/org.elasticsearch.index.codec.storedfields.ESStoredFieldsFormat b/server/src/main/resources/META-INF/services/org.elasticsearch.index.codec.storedfields.ESStoredFieldsFormat deleted file mode 100644 index 8687fd881249e..0000000000000 --- 
a/server/src/main/resources/META-INF/services/org.elasticsearch.index.codec.storedfields.ESStoredFieldsFormat +++ /dev/null @@ -1,3 +0,0 @@ -org.elasticsearch.index.codec.storedfields.ESZstd814StoredFieldsFormat -org.elasticsearch.index.codec.storedfields.ESLucene90StoredFieldsFormat -org.elasticsearch.index.codec.bloomfilter.ES93BloomFilterStoredFieldsFormat From b651ee866bb41fce474ffc3c8cb04bcd6b353b50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Fern=C3=A1ndez=20Casta=C3=B1o?= Date: Wed, 19 Nov 2025 15:58:05 +0100 Subject: [PATCH 4/7] Add new codecs --- server/src/main/java/module-info.java | 4 +- .../common/settings/IndexScopedSettings.java | 3 + .../elasticsearch/index/IndexSettings.java | 65 ++++++++ .../elasticsearch/index/IndexVersions.java | 1 + .../index/codec/CodecService.java | 15 +- ...icsearch93DefaultCompressionLucene103.java | 135 +++++++++++++++++ ...search93ZstdCompressionLucene103Codec.java | 143 ++++++++++++++++++ .../index/codec/PerFieldFormatSupplier.java | 37 +++++ ...PerFieldMapperCodecDefaultCompression.java | 54 +++++++ .../PerFieldMapperCodecZstdCompression.java | 58 +++++++ .../index/store/LuceneFilesExtensions.java | 4 +- .../services/org.apache.lucene.codecs.Codec | 2 + .../index/codec/PerFieldMapperCodecTests.java | 41 ++++- 13 files changed, 556 insertions(+), 6 deletions(-) create mode 100644 server/src/main/java/org/elasticsearch/index/codec/Elasticsearch93DefaultCompressionLucene103.java create mode 100644 server/src/main/java/org/elasticsearch/index/codec/Elasticsearch93ZstdCompressionLucene103Codec.java create mode 100644 server/src/main/java/org/elasticsearch/index/codec/PerFieldMapperCodecDefaultCompression.java create mode 100644 server/src/main/java/org/elasticsearch/index/codec/PerFieldMapperCodecZstdCompression.java diff --git a/server/src/main/java/module-info.java b/server/src/main/java/module-info.java index 032128b0f2609..1221708cb6e1b 100644 --- a/server/src/main/java/module-info.java +++ 
b/server/src/main/java/module-info.java @@ -476,7 +476,9 @@ org.elasticsearch.index.codec.Elasticsearch816Codec, org.elasticsearch.index.codec.Elasticsearch900Codec, org.elasticsearch.index.codec.Elasticsearch900Lucene101Codec, - org.elasticsearch.index.codec.Elasticsearch92Lucene103Codec; + org.elasticsearch.index.codec.Elasticsearch92Lucene103Codec, + org.elasticsearch.index.codec.Elasticsearch93DefaultCompressionLucene103, + org.elasticsearch.index.codec.Elasticsearch93ZstdCompressionLucene103Codec; provides org.apache.logging.log4j.core.util.ContextDataProvider with org.elasticsearch.common.logging.DynamicContextDataProvider; diff --git a/server/src/main/java/org/elasticsearch/common/settings/IndexScopedSettings.java b/server/src/main/java/org/elasticsearch/common/settings/IndexScopedSettings.java index 1b8f9cdcde489..7abcb9f381e72 100644 --- a/server/src/main/java/org/elasticsearch/common/settings/IndexScopedSettings.java +++ b/server/src/main/java/org/elasticsearch/common/settings/IndexScopedSettings.java @@ -252,6 +252,9 @@ public final class IndexScopedSettings extends AbstractScopedSettings { if (IndexSettings.TSDB_SYNTHETIC_ID_FEATURE_FLAG) { settings.add(IndexSettings.USE_SYNTHETIC_ID); } + if (IndexSettings.USE_STORED_FIELDS_BLOOM_FILTER_FOR_ID_FEATURE_FLAG) { + settings.add(IndexSettings.USE_STORED_FIELD_BLOOM_FILTER_ID); + } settings.add(IndexSettings.INDEX_MAPPING_EXCLUDE_SOURCE_VECTORS_SETTING); BUILT_IN_INDEX_SETTINGS = Collections.unmodifiableSet(settings); }; diff --git a/server/src/main/java/org/elasticsearch/index/IndexSettings.java b/server/src/main/java/org/elasticsearch/index/IndexSettings.java index 994f8097399a8..703aecd073799 100644 --- a/server/src/main/java/org/elasticsearch/index/IndexSettings.java +++ b/server/src/main/java/org/elasticsearch/index/IndexSettings.java @@ -735,6 +735,57 @@ public Iterator> settings() { Property.Final ); + public static final boolean USE_STORED_FIELDS_BLOOM_FILTER_FOR_ID_FEATURE_FLAG = new 
FeatureFlag("stored_field_bloom_filter") + .isEnabled(); + public static final Setting USE_STORED_FIELD_BLOOM_FILTER_ID = Setting.boolSetting( + "index.mapping.use_stored_field_bloom_filter_id", + false, + new Setting.Validator<>() { + @Override + public void validate(Boolean enabled) { + if (enabled) { + if (USE_STORED_FIELDS_BLOOM_FILTER_FOR_ID_FEATURE_FLAG == false) { + throw new IllegalArgumentException( + String.format( + Locale.ROOT, + "The setting [%s] is only permitted when the feature flag is enabled.", + USE_STORED_FIELD_BLOOM_FILTER_ID.getKey() + ) + ); + } + } + } + + @Override + public void validate(Boolean enabled, Map, Object> settings) { + if (enabled) { + // Verify if index mode is TIME_SERIES + var indexMode = (IndexMode) settings.get(MODE); + if (indexMode != IndexMode.TIME_SERIES) { + throw new IllegalArgumentException( + String.format( + Locale.ROOT, + "The setting [%s] is only permitted when [%s] is set to [%s]. Current mode: [%s].", + USE_STORED_FIELD_BLOOM_FILTER_ID.getKey(), + MODE.getKey(), + IndexMode.TIME_SERIES.name(), + indexMode.name() + ) + ); + } + } + } + + @Override + public Iterator> settings() { + List> list = List.of(MODE); + return list.iterator(); + } + }, + Property.IndexScope, + Property.Final + ); + /** * The {@link IndexMode "mode"} of the index. */ @@ -1020,6 +1071,7 @@ private void setRetentionLeaseMillis(final TimeValue retentionLease) { private final boolean useTimeSeriesSyntheticId; private final boolean useTimeSeriesDocValuesFormat; private final boolean useEs812PostingsFormat; + private final boolean useStoredFieldsBloomFilterForId; /** * The maximum number of refresh listeners allows on this shard. 
@@ -1230,6 +1282,12 @@ public IndexSettings(final IndexMetadata indexMetadata, final Settings nodeSetti } else { useTimeSeriesSyntheticId = false; } + useStoredFieldsBloomFilterForId = IndexSettings.USE_STORED_FIELDS_BLOOM_FILTER_FOR_ID_FEATURE_FLAG + && scopedSettings.get(USE_STORED_FIELD_BLOOM_FILTER_ID); + if (useStoredFieldsBloomFilterForId) { + assert indexMetadata.getIndexMode() == IndexMode.TIME_SERIES : indexMetadata.getIndexMode(); + assert indexMetadata.getCreationVersion().onOrAfter(IndexVersions.TIME_SERIES_USE_STORED_FIELDS_BLOOM_FILTER_FOR_ID); + } if (recoverySourceSyntheticEnabled) { if (DiscoveryNode.isStateless(settings)) { throw new IllegalArgumentException("synthetic recovery source is only allowed in stateful"); @@ -1969,6 +2027,13 @@ public boolean useTimeSeriesSyntheticId() { return useTimeSeriesSyntheticId; } + /** + * @return whether _id fields are stored as bloom filters in time-series indices for fast lookups. + */ + public boolean useStoredFieldsBloomFilterForId() { + return useStoredFieldsBloomFilterForId; + } + /** * @return Whether the time series doc value format should be used. */ diff --git a/server/src/main/java/org/elasticsearch/index/IndexVersions.java b/server/src/main/java/org/elasticsearch/index/IndexVersions.java index ef0cee140cf96..a80297b7f2c8c 100644 --- a/server/src/main/java/org/elasticsearch/index/IndexVersions.java +++ b/server/src/main/java/org/elasticsearch/index/IndexVersions.java @@ -194,6 +194,7 @@ private static Version parseUnchecked(String version) { public static final IndexVersion TIME_SERIES_USE_SYNTHETIC_ID = def(9_044_0_00, Version.LUCENE_10_3_1); public static final IndexVersion TIME_SERIES_DIMENSIONS_USE_SKIPPERS = def(9_045_0_00, Version.LUCENE_10_3_1); public static final IndexVersion TIME_SERIES_ALL_FIELDS_USE_SKIPPERS = def(9_046_0_00, Version.LUCENE_10_3_1); + public static final IndexVersion TIME_SERIES_USE_STORED_FIELDS_BLOOM_FILTER_FOR_ID = def(9_047_0_00, Version.LUCENE_10_3_1); /* * STOP! 
READ THIS FIRST! No, really, diff --git a/server/src/main/java/org/elasticsearch/index/codec/CodecService.java b/server/src/main/java/org/elasticsearch/index/codec/CodecService.java index 5d6e377d57db9..f96ea3e2a93f4 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/CodecService.java +++ b/server/src/main/java/org/elasticsearch/index/codec/CodecService.java @@ -17,6 +17,7 @@ import org.elasticsearch.common.util.FeatureFlag; import org.elasticsearch.core.Nullable; import org.elasticsearch.index.IndexMode; +import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.codec.tsdb.TSDBSyntheticIdCodec; import org.elasticsearch.index.codec.zstd.Zstd814StoredFieldsFormat; import org.elasticsearch.index.mapper.MapperService; @@ -49,7 +50,19 @@ public CodecService(@Nullable MapperService mapperService, BigArrays bigArrays) final var codecs = new HashMap(); Codec legacyBestSpeedCodec = new LegacyPerFieldMapperCodec(Lucene103Codec.Mode.BEST_SPEED, mapperService, bigArrays); - if (ZSTD_STORED_FIELDS_FEATURE_FLAG) { + if (IndexSettings.USE_STORED_FIELDS_BLOOM_FILTER_FOR_ID_FEATURE_FLAG) { + if (ZSTD_STORED_FIELDS_FEATURE_FLAG) { + codecs.put( + DEFAULT_CODEC, + new PerFieldMapperCodecZstdCompression(Zstd814StoredFieldsFormat.Mode.BEST_SPEED, mapperService, bigArrays) + ); + } else { + codecs.put( + DEFAULT_CODEC, + new PerFieldMapperCodecDefaultCompression(Lucene103Codec.Mode.BEST_SPEED, mapperService, bigArrays) + ); + } + } else if (ZSTD_STORED_FIELDS_FEATURE_FLAG) { codecs.put(DEFAULT_CODEC, new PerFieldMapperCodec(Zstd814StoredFieldsFormat.Mode.BEST_SPEED, mapperService, bigArrays)); } else { codecs.put(DEFAULT_CODEC, legacyBestSpeedCodec); diff --git a/server/src/main/java/org/elasticsearch/index/codec/Elasticsearch93DefaultCompressionLucene103.java b/server/src/main/java/org/elasticsearch/index/codec/Elasticsearch93DefaultCompressionLucene103.java new file mode 100644 index 0000000000000..e9cd8df5bdaf3 --- /dev/null +++ 
b/server/src/main/java/org/elasticsearch/index/codec/Elasticsearch93DefaultCompressionLucene103.java @@ -0,0 +1,135 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.index.codec; + +import org.apache.lucene.codecs.DocValuesFormat; +import org.apache.lucene.codecs.FilterCodec; +import org.apache.lucene.codecs.KnnVectorsFormat; +import org.apache.lucene.codecs.PostingsFormat; +import org.apache.lucene.codecs.StoredFieldsFormat; +import org.apache.lucene.codecs.lucene103.Lucene103Codec; +import org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat; +import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat; +import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat; +import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat; +import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; +import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; +import org.elasticsearch.index.codec.storedfields.ESLucene90StoredFieldsFormat; +import org.elasticsearch.index.codec.storedfields.ESStoredFieldsFormat; +import org.elasticsearch.index.codec.storedfields.PerFieldStoredFieldsFormat; + +public class Elasticsearch93DefaultCompressionLucene103 extends FilterCodec { + private final PostingsFormat defaultPostingsFormat; + private final PostingsFormat postingsFormat = new PerFieldPostingsFormat() { + @Override + public PostingsFormat getPostingsFormatForField(String field) { + return Elasticsearch93DefaultCompressionLucene103.this.getPostingsFormatForField(field); + } + }; + + private final 
DocValuesFormat defaultDVFormat; + private final DocValuesFormat docValuesFormat = new PerFieldDocValuesFormat() { + @Override + public DocValuesFormat getDocValuesFormatForField(String field) { + return Elasticsearch93DefaultCompressionLucene103.this.getDocValuesFormatForField(field); + } + }; + + private final KnnVectorsFormat defaultKnnVectorsFormat; + private final KnnVectorsFormat knnVectorsFormat = new PerFieldKnnVectorsFormat() { + @Override + public KnnVectorsFormat getKnnVectorsFormatForField(String field) { + return Elasticsearch93DefaultCompressionLucene103.this.getKnnVectorsFormatForField(field); + } + }; + + protected final ESStoredFieldsFormat defaultStoredFieldsFormat; + private final StoredFieldsFormat storedFieldsFormat = new PerFieldStoredFieldsFormat() { + @Override + protected ESStoredFieldsFormat getStoredFieldsFormatForField(String field) { + return Elasticsearch93DefaultCompressionLucene103.this.getStoredFieldsFormatForField(field); + } + }; + + /** Public no-arg constructor, needed for SPI loading at read-time. 
*/ + public Elasticsearch93DefaultCompressionLucene103() { + this(Lucene103Codec.Mode.BEST_SPEED); + } + + public Elasticsearch93DefaultCompressionLucene103(Lucene103Codec.Mode mode) { + super("Elasticsearch93Lucene103", new Lucene103Codec()); + this.defaultStoredFieldsFormat = new ESLucene90StoredFieldsFormat(mode); + this.defaultPostingsFormat = new Lucene103PostingsFormat(); + this.defaultDVFormat = new Lucene90DocValuesFormat(); + this.defaultKnnVectorsFormat = new Lucene99HnswVectorsFormat(); + } + + @Override + public StoredFieldsFormat storedFieldsFormat() { + return storedFieldsFormat; + } + + @Override + public final PostingsFormat postingsFormat() { + return postingsFormat; + } + + @Override + public final DocValuesFormat docValuesFormat() { + return docValuesFormat; + } + + @Override + public final KnnVectorsFormat knnVectorsFormat() { + return knnVectorsFormat; + } + + /** + * Returns the postings format that should be used for writing new segments of field. + * + *

The default implementation always returns "Lucene103". + * + *

WARNING: if you subclass, you are responsible for index backwards compatibility: + * future version of Lucene are only guaranteed to be able to read the default implementation, + */ + public PostingsFormat getPostingsFormatForField(String field) { + return defaultPostingsFormat; + } + + /** + * Returns the docvalues format that should be used for writing new segments of field + * . + * + *

The default implementation always returns "Lucene90". + * + *

WARNING: if you subclass, you are responsible for index backwards compatibility: + * future version of Lucene are only guaranteed to be able to read the default implementation. + */ + public DocValuesFormat getDocValuesFormatForField(String field) { + return defaultDVFormat; + } + + /** + * Returns the vectors format that should be used for writing new segments of field + * + *

The default implementation always returns "Lucene99HnswVectorsFormat". + * + *

WARNING: if you subclass, you are responsible for index backwards compatibility: + * future version of Lucene are only guaranteed to be able to read the default implementation. + */ + public KnnVectorsFormat getKnnVectorsFormatForField(String field) { + return defaultKnnVectorsFormat; + } + + // We need to return ES... for the SPI loading + public ESStoredFieldsFormat getStoredFieldsFormatForField(String field) { + return defaultStoredFieldsFormat; + } +} diff --git a/server/src/main/java/org/elasticsearch/index/codec/Elasticsearch93ZstdCompressionLucene103Codec.java b/server/src/main/java/org/elasticsearch/index/codec/Elasticsearch93ZstdCompressionLucene103Codec.java new file mode 100644 index 0000000000000..938a03e1a2c5a --- /dev/null +++ b/server/src/main/java/org/elasticsearch/index/codec/Elasticsearch93ZstdCompressionLucene103Codec.java @@ -0,0 +1,143 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". 
+ */ + +package org.elasticsearch.index.codec; + +import org.apache.lucene.codecs.DocValuesFormat; +import org.apache.lucene.codecs.KnnVectorsFormat; +import org.apache.lucene.codecs.PostingsFormat; +import org.apache.lucene.codecs.StoredFieldsFormat; +import org.apache.lucene.codecs.lucene103.Lucene103Codec; +import org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat; +import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat; +import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat; +import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; +import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; +import org.elasticsearch.index.codec.perfield.XPerFieldDocValuesFormat; +import org.elasticsearch.index.codec.storedfields.ESStoredFieldsFormat; +import org.elasticsearch.index.codec.storedfields.ESZstd814StoredFieldsFormat; +import org.elasticsearch.index.codec.storedfields.PerFieldStoredFieldsFormat; +import org.elasticsearch.index.codec.zstd.Zstd814StoredFieldsFormat; + +public class Elasticsearch93ZstdCompressionLucene103Codec extends CodecService.DeduplicateFieldInfosCodec { + + static final PostingsFormat DEFAULT_POSTINGS_FORMAT = new Lucene103PostingsFormat(); + + private final PostingsFormat defaultPostingsFormat; + private final PostingsFormat postingsFormat = new PerFieldPostingsFormat() { + @Override + public PostingsFormat getPostingsFormatForField(String field) { + return Elasticsearch93ZstdCompressionLucene103Codec.this.getPostingsFormatForField(field); + } + }; + + private final DocValuesFormat defaultDVFormat; + private final DocValuesFormat docValuesFormat = new XPerFieldDocValuesFormat() { + @Override + public DocValuesFormat getDocValuesFormatForField(String field) { + return Elasticsearch93ZstdCompressionLucene103Codec.this.getDocValuesFormatForField(field); + } + }; + + private final KnnVectorsFormat defaultKnnVectorsFormat; + private final KnnVectorsFormat knnVectorsFormat = new PerFieldKnnVectorsFormat() 
{ + @Override + public KnnVectorsFormat getKnnVectorsFormatForField(String field) { + return Elasticsearch93ZstdCompressionLucene103Codec.this.getKnnVectorsFormatForField(field); + } + }; + + // TODO: fix this + protected final ESStoredFieldsFormat defaultStoredFieldsFormat; + private final StoredFieldsFormat storedFieldsFormat = new PerFieldStoredFieldsFormat() { + @Override + protected ESStoredFieldsFormat getStoredFieldsFormatForField(String field) { + return Elasticsearch93ZstdCompressionLucene103Codec.this.getStoredFieldsFormatForField(field); + } + }; + + /** Public no-arg constructor, needed for SPI loading at read-time. */ + public Elasticsearch93ZstdCompressionLucene103Codec() { + this(Zstd814StoredFieldsFormat.Mode.BEST_SPEED); + } + + /** + * Constructor. Takes a {@link Zstd814StoredFieldsFormat.Mode} that describes whether to optimize for retrieval speed at the expense of + * worse space-efficiency or vice-versa. + */ + public Elasticsearch93ZstdCompressionLucene103Codec(Zstd814StoredFieldsFormat.Mode mode) { + super("Elasticsearch93ZstdStoredFieldsLucene103Codec", new Lucene103Codec()); + this.defaultStoredFieldsFormat = new ESZstd814StoredFieldsFormat(mode); + this.defaultPostingsFormat = DEFAULT_POSTINGS_FORMAT; + this.defaultDVFormat = new Lucene90DocValuesFormat(); + this.defaultKnnVectorsFormat = new Lucene99HnswVectorsFormat(); + } + + @Override + public StoredFieldsFormat storedFieldsFormat() { + return storedFieldsFormat; + } + + @Override + public final PostingsFormat postingsFormat() { + return postingsFormat; + } + + @Override + public final DocValuesFormat docValuesFormat() { + return docValuesFormat; + } + + @Override + public final KnnVectorsFormat knnVectorsFormat() { + return knnVectorsFormat; + } + + /** + * Returns the postings format that should be used for writing new segments of field. + * + *

The default implementation always returns "Lucene103". + * + *

WARNING: if you subclass, you are responsible for index backwards compatibility: + * future version of Lucene are only guaranteed to be able to read the default implementation, + */ + public PostingsFormat getPostingsFormatForField(String field) { + return defaultPostingsFormat; + } + + /** + * Returns the docvalues format that should be used for writing new segments of field + * . + * + *

The default implementation always returns "Lucene90". + * + *

WARNING: if you subclass, you are responsible for index backwards compatibility: + * future version of Lucene are only guaranteed to be able to read the default implementation. + */ + public DocValuesFormat getDocValuesFormatForField(String field) { + return defaultDVFormat; + } + + /** + * Returns the vectors format that should be used for writing new segments of field + * + *

The default implementation always returns "Lucene99HnswVectorsFormat". + * + *

WARNING: if you subclass, you are responsible for index backwards compatibility: + * future version of Lucene are only guaranteed to be able to read the default implementation. + */ + public KnnVectorsFormat getKnnVectorsFormatForField(String field) { + return defaultKnnVectorsFormat; + } + + // We need to return ES... for the SPI loading + public ESStoredFieldsFormat getStoredFieldsFormatForField(String field) { + return defaultStoredFieldsFormat; + } +} diff --git a/server/src/main/java/org/elasticsearch/index/codec/PerFieldFormatSupplier.java b/server/src/main/java/org/elasticsearch/index/codec/PerFieldFormatSupplier.java index 43eb47e55324c..da3507bbca44f 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/PerFieldFormatSupplier.java +++ b/server/src/main/java/org/elasticsearch/index/codec/PerFieldFormatSupplier.java @@ -15,11 +15,14 @@ import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat; import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat; import org.elasticsearch.common.util.BigArrays; +import org.elasticsearch.core.Nullable; import org.elasticsearch.index.IndexMode; import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.IndexVersions; import org.elasticsearch.index.codec.bloomfilter.ES87BloomFilterPostingsFormat; +import org.elasticsearch.index.codec.bloomfilter.ES93BloomFilterStoredFieldsFormat; import org.elasticsearch.index.codec.postings.ES812PostingsFormat; +import org.elasticsearch.index.codec.storedfields.ESStoredFieldsFormat; import org.elasticsearch.index.codec.tsdb.es819.ES819TSDBDocValuesFormat; import org.elasticsearch.index.mapper.CompletionFieldMapper; import org.elasticsearch.index.mapper.IdFieldMapper; @@ -34,6 +37,8 @@ import java.util.HashSet; import java.util.Set; +import static org.elasticsearch.index.codec.bloomfilter.ES93BloomFilterStoredFieldsFormat.DEFAULT_BLOOM_FILTER_SIZE; + /** * Class that encapsulates the logic of figuring out the most appropriate file format for a 
given field, across postings, doc values and * vectors. @@ -66,10 +71,25 @@ public class PerFieldFormatSupplier { private final PostingsFormat defaultPostingsFormat; + private final ES93BloomFilterStoredFieldsFormat bloomFilterStoredFieldsFormat; + private final ESStoredFieldsFormat defaultStoredFieldsFormat; + public PerFieldFormatSupplier(MapperService mapperService, BigArrays bigArrays) { + this(mapperService, bigArrays, null); + } + + public PerFieldFormatSupplier( + MapperService mapperService, + BigArrays bigArrays, + @Nullable ESStoredFieldsFormat defaultStoredFieldsFormat + ) { this.mapperService = mapperService; this.bloomFilterPostingsFormat = new ES87BloomFilterPostingsFormat(bigArrays, this::internalGetPostingsFormatForField); this.defaultPostingsFormat = getDefaultPostingsFormat(mapperService); + this.bloomFilterStoredFieldsFormat = defaultStoredFieldsFormat == null + ? null + : new ES93BloomFilterStoredFieldsFormat(bigArrays, DEFAULT_BLOOM_FILTER_SIZE, IdFieldMapper.NAME); + this.defaultStoredFieldsFormat = defaultStoredFieldsFormat; } private static PostingsFormat getDefaultPostingsFormat(final MapperService mapperService) { @@ -140,6 +160,23 @@ public DocValuesFormat getDocValuesFormatForField(String field) { return docValuesFormat; } + public ESStoredFieldsFormat getStoredFieldsFormatForField(String field) { + if (defaultStoredFieldsFormat == null) { + throw new IllegalStateException("No default stored fields format available"); + } + + if (useStoredFieldsBloomFilter(field)) { + return bloomFilterStoredFieldsFormat; + } + return defaultStoredFieldsFormat; + } + + private boolean useStoredFieldsBloomFilter(String field) { + return field.equals(IdFieldMapper.NAME) + && mapperService != null + && mapperService.getIndexSettings().useStoredFieldsBloomFilterForId(); + } + boolean useTSDBDocValuesFormat(final String field) { if (excludeFields(field)) { return false; diff --git 
a/server/src/main/java/org/elasticsearch/index/codec/PerFieldMapperCodecDefaultCompression.java b/server/src/main/java/org/elasticsearch/index/codec/PerFieldMapperCodecDefaultCompression.java new file mode 100644 index 0000000000000..12a43ca7d6927 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/index/codec/PerFieldMapperCodecDefaultCompression.java @@ -0,0 +1,54 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.index.codec; + +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.DocValuesFormat; +import org.apache.lucene.codecs.KnnVectorsFormat; +import org.apache.lucene.codecs.PostingsFormat; +import org.apache.lucene.codecs.lucene103.Lucene103Codec; +import org.elasticsearch.common.lucene.Lucene; +import org.elasticsearch.common.util.BigArrays; +import org.elasticsearch.index.codec.storedfields.ESLucene90StoredFieldsFormat; +import org.elasticsearch.index.codec.storedfields.ESStoredFieldsFormat; +import org.elasticsearch.index.mapper.MapperService; + +public class PerFieldMapperCodecDefaultCompression extends Elasticsearch93DefaultCompressionLucene103 { + private final PerFieldFormatSupplier formatSupplier; + + public PerFieldMapperCodecDefaultCompression(Lucene103Codec.Mode compressionMode, MapperService mapperService, BigArrays bigArrays) { + super(compressionMode); + this.formatSupplier = new PerFieldFormatSupplier(mapperService, bigArrays, new ESLucene90StoredFieldsFormat(compressionMode)); + // If the below assertion fails, it is a sign that Lucene released a new codec. 
You must create a copy of the current Elasticsearch + // codec that delegates to this new Lucene codec, and make PerFieldMapperCodec extend this new Elasticsearch codec. + assert Codec.forName(Lucene.LATEST_CODEC).getClass() == delegate.getClass() + : "PerFieldMapperCodecDefaultCompression must be on the latest lucene codec: " + Lucene.LATEST_CODEC; + } + + @Override + public PostingsFormat getPostingsFormatForField(String field) { + return formatSupplier.getPostingsFormatForField(field); + } + + @Override + public KnnVectorsFormat getKnnVectorsFormatForField(String field) { + return formatSupplier.getKnnVectorsFormatForField(field); + } + + @Override + public DocValuesFormat getDocValuesFormatForField(String field) { + return formatSupplier.getDocValuesFormatForField(field); + } + + @Override + public ESStoredFieldsFormat getStoredFieldsFormatForField(String field) { + return formatSupplier.getStoredFieldsFormatForField(field); + } +} diff --git a/server/src/main/java/org/elasticsearch/index/codec/PerFieldMapperCodecZstdCompression.java b/server/src/main/java/org/elasticsearch/index/codec/PerFieldMapperCodecZstdCompression.java new file mode 100644 index 0000000000000..bb762f45f4657 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/index/codec/PerFieldMapperCodecZstdCompression.java @@ -0,0 +1,58 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1".
+ */ + +package org.elasticsearch.index.codec; + +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.DocValuesFormat; +import org.apache.lucene.codecs.KnnVectorsFormat; +import org.apache.lucene.codecs.PostingsFormat; +import org.elasticsearch.common.lucene.Lucene; +import org.elasticsearch.common.util.BigArrays; +import org.elasticsearch.index.codec.storedfields.ESStoredFieldsFormat; +import org.elasticsearch.index.codec.zstd.Zstd814StoredFieldsFormat; +import org.elasticsearch.index.mapper.MapperService; + +public class PerFieldMapperCodecZstdCompression extends Elasticsearch93ZstdCompressionLucene103Codec { + + private final PerFieldFormatSupplier formatSupplier; + + public PerFieldMapperCodecZstdCompression( + Zstd814StoredFieldsFormat.Mode compressionMode, + MapperService mapperService, + BigArrays bigArrays + ) { + super(compressionMode); + this.formatSupplier = new PerFieldFormatSupplier(mapperService, bigArrays, defaultStoredFieldsFormat); + // If the below assertion fails, it is a sign that Lucene released a new codec. You must create a copy of the current Elasticsearch + // codec that delegates to this new Lucene codec, and make PerFieldMapperCodec extend this new Elasticsearch codec. 
+ assert Codec.forName(Lucene.LATEST_CODEC).getClass() == delegate.getClass() + : "PerFieldMapperCodec must be on the latest lucene codec: " + Lucene.LATEST_CODEC; + } + + @Override + public PostingsFormat getPostingsFormatForField(String field) { + return formatSupplier.getPostingsFormatForField(field); + } + + @Override + public KnnVectorsFormat getKnnVectorsFormatForField(String field) { + return formatSupplier.getKnnVectorsFormatForField(field); + } + + @Override + public DocValuesFormat getDocValuesFormatForField(String field) { + return formatSupplier.getDocValuesFormatForField(field); + } + + @Override + public ESStoredFieldsFormat getStoredFieldsFormatForField(String field) { + return formatSupplier.getStoredFieldsFormatForField(field); + } +} diff --git a/server/src/main/java/org/elasticsearch/index/store/LuceneFilesExtensions.java b/server/src/main/java/org/elasticsearch/index/store/LuceneFilesExtensions.java index 16ea550095b68..1f37464721de1 100644 --- a/server/src/main/java/org/elasticsearch/index/store/LuceneFilesExtensions.java +++ b/server/src/main/java/org/elasticsearch/index/store/LuceneFilesExtensions.java @@ -89,7 +89,9 @@ public enum LuceneFilesExtensions { // ivf vectors format MIVF("mivf", "IVF Metadata", true, false), CENIVF("cenivf", "IVF Centroid Data", false, true), - CLIVF("clivf", "IVF Cluster Data", false, true); + CLIVF("clivf", "IVF Cluster Data", false, true), + SFBFM("sfbfm", "Stored field bloom filter metadata", true, false), + SFBF("sfbf", "Stored field bloom filter bitset", false, true); /** * Allow plugin developers of custom codecs to opt out of the assertion in {@link #fromExtension} diff --git a/server/src/main/resources/META-INF/services/org.apache.lucene.codecs.Codec b/server/src/main/resources/META-INF/services/org.apache.lucene.codecs.Codec index 971db6dcc032c..e2ca7c1b5a130 100644 --- a/server/src/main/resources/META-INF/services/org.apache.lucene.codecs.Codec +++ 
b/server/src/main/resources/META-INF/services/org.apache.lucene.codecs.Codec @@ -3,3 +3,5 @@ org.elasticsearch.index.codec.Elasticsearch816Codec org.elasticsearch.index.codec.Elasticsearch900Codec org.elasticsearch.index.codec.Elasticsearch900Lucene101Codec org.elasticsearch.index.codec.Elasticsearch92Lucene103Codec +org.elasticsearch.index.codec.Elasticsearch93DefaultCompressionLucene103 +org.elasticsearch.index.codec.Elasticsearch93ZstdCompressionLucene103Codec diff --git a/server/src/test/java/org/elasticsearch/index/codec/PerFieldMapperCodecTests.java b/server/src/test/java/org/elasticsearch/index/codec/PerFieldMapperCodecTests.java index 7ae6275257c0b..6409aeb7b7885 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/PerFieldMapperCodecTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/PerFieldMapperCodecTests.java @@ -19,7 +19,11 @@ import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.MapperTestUtils; import org.elasticsearch.index.codec.bloomfilter.ES87BloomFilterPostingsFormat; +import org.elasticsearch.index.codec.bloomfilter.ES93BloomFilterStoredFieldsFormat; import org.elasticsearch.index.codec.postings.ES812PostingsFormat; +import org.elasticsearch.index.codec.storedfields.ESLucene90StoredFieldsFormat; +import org.elasticsearch.index.codec.storedfields.ESStoredFieldsFormat; +import org.elasticsearch.index.mapper.IdFieldMapper; import org.elasticsearch.index.mapper.MapperService; import org.elasticsearch.index.mapper.SeqNoFieldMapper; import org.elasticsearch.index.mapper.SourceFieldMapper; @@ -219,7 +223,7 @@ public void testUseTimeSeriesModeDisabledCodecDisabled() throws IOException { } public void testUseTimeSeriesDocValuesCodecSetting() throws IOException { - PerFieldFormatSupplier perFieldMapperCodec = createFormatSupplier(true, null, false, IndexMode.STANDARD, MAPPING_2); + PerFieldFormatSupplier perFieldMapperCodec = createFormatSupplier(true, null, false, null, IndexMode.STANDARD, MAPPING_2); 
assertThat((perFieldMapperCodec.useTSDBDocValuesFormat("@timestamp")), is(true)); assertThat((perFieldMapperCodec.useTSDBDocValuesFormat("counter")), is(true)); assertThat((perFieldMapperCodec.useTSDBDocValuesFormat("gauge")), is(true)); @@ -253,6 +257,31 @@ public void testSeqnoField() throws IOException { assertThat((perFieldMapperCodec.useTSDBDocValuesFormat(SeqNoFieldMapper.NAME)), is(true)); } + public void testUseStoredFieldBloomFilterForIdFieldOnTimeSeriesModeIfEnabled() throws IOException { + PerFieldFormatSupplier perFieldMapperCodec = createFormatSupplier(null, null, false, true, IndexMode.TIME_SERIES, MAPPING_2); + assertThat( + perFieldMapperCodec.getStoredFieldsFormatForField(IdFieldMapper.NAME), + is(instanceOf(ES93BloomFilterStoredFieldsFormat.class)) + ); + // For other fields, it uses the default one + assertThat(perFieldMapperCodec.getStoredFieldsFormatForField("hostname"), is(instanceOf(ESLucene90StoredFieldsFormat.class))); + } + + public void testUseDefaultStoredFieldsForIdFieldOnTimeSeriesModeIfDisabled() throws IOException { + PerFieldFormatSupplier perFieldMapperCodec = createFormatSupplier(null, null, false, false, IndexMode.TIME_SERIES, MAPPING_2); + assertThat( + perFieldMapperCodec.getStoredFieldsFormatForField(IdFieldMapper.NAME), + is(instanceOf(ESLucene90StoredFieldsFormat.class)) + ); + assertThat(perFieldMapperCodec.getStoredFieldsFormatForField("hostname"), is(instanceOf(ESLucene90StoredFieldsFormat.class))); + } + + public void testGetStoredFieldsFormatForFieldThrowsIfDefaultIsNotProvided() throws IOException { + PerFieldFormatSupplier perFieldMapperCodec = createFormatSupplier(IndexMode.TIME_SERIES, MAPPING_2); + expectThrows(IllegalStateException.class, () -> perFieldMapperCodec.getStoredFieldsFormatForField(IdFieldMapper.NAME)); + expectThrows(IllegalStateException.class, () -> perFieldMapperCodec.getStoredFieldsFormatForField("hostname")); + } + private PerFieldFormatSupplier createFormatSupplier(IndexMode mode, String 
mapping) throws IOException { return createFormatSupplier(null, false, mode, mapping); } @@ -263,13 +292,14 @@ private PerFieldFormatSupplier createFormatSupplier( IndexMode mode, String mapping ) throws IOException { - return createFormatSupplier(null, enableES87TSDBCodec, useEs812PostingsFormat, mode, mapping); + return createFormatSupplier(null, enableES87TSDBCodec, useEs812PostingsFormat, null, mode, mapping); } private PerFieldFormatSupplier createFormatSupplier( Boolean useTimeSeriesDocValuesFormatSetting, Boolean enableES87TSDBCodec, Boolean useEs812PostingsFormat, + Boolean enableIdStoredFieldBloomFilter, IndexMode mode, String mapping ) throws IOException { @@ -287,9 +317,14 @@ private PerFieldFormatSupplier createFormatSupplier( if (useEs812PostingsFormat) { settings.put(IndexSettings.USE_ES_812_POSTINGS_FORMAT.getKey(), true); } + ESStoredFieldsFormat defaultStoredFieldsFormat = null; + if (enableIdStoredFieldBloomFilter != null) { + settings.put(IndexSettings.USE_STORED_FIELD_BLOOM_FILTER_ID.getKey(), enableIdStoredFieldBloomFilter); + defaultStoredFieldsFormat = new ESLucene90StoredFieldsFormat(); + } MapperService mapperService = MapperTestUtils.newMapperService(xContentRegistry(), createTempDir(), settings.build(), "test"); mapperService.merge("type", new CompressedXContent(mapping), MapperService.MergeReason.MAPPING_UPDATE); - return new PerFieldFormatSupplier(mapperService, BigArrays.NON_RECYCLING_INSTANCE); + return new PerFieldFormatSupplier(mapperService, BigArrays.NON_RECYCLING_INSTANCE, defaultStoredFieldsFormat); } } From 25473eb74385bbecbe4119ba9d0be2bf6829f1a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Fern=C3=A1ndez=20Casta=C3=B1o?= Date: Thu, 20 Nov 2025 10:11:37 +0100 Subject: [PATCH 5/7] Integrate bloom filter checks with TSDBSyntheticIdPostingsFormat --- .../datastreams/TSDBSyntheticIdsIT.java | 29 +++++-- .../index/codec/bloomfilter/BloomFilter.java | 71 ++++++++++++++++ .../BloomFilterFieldsProducer.java | 83 
+++++++++++++++++++ .../bloomfilter/BloomFilterProvider.java | 20 +++++ .../ES87BloomFilterPostingsFormat.java | 65 +-------------- .../ES93BloomFilterStoredFieldsFormat.java | 29 ++----- .../bloomfilter/LazyFilterTermsEnum.java | 78 +++++++++++++++++ .../tsdb/TSDBSyntheticIdPostingsFormat.java | 7 +- ...S93BloomFilterStoredFieldsFormatTests.java | 17 ++-- 9 files changed, 301 insertions(+), 98 deletions(-) create mode 100644 server/src/main/java/org/elasticsearch/index/codec/bloomfilter/BloomFilter.java create mode 100644 server/src/main/java/org/elasticsearch/index/codec/bloomfilter/BloomFilterFieldsProducer.java create mode 100644 server/src/main/java/org/elasticsearch/index/codec/bloomfilter/BloomFilterProvider.java create mode 100644 server/src/main/java/org/elasticsearch/index/codec/bloomfilter/LazyFilterTermsEnum.java diff --git a/modules/data-streams/src/internalClusterTest/java/org/elasticsearch/datastreams/TSDBSyntheticIdsIT.java b/modules/data-streams/src/internalClusterTest/java/org/elasticsearch/datastreams/TSDBSyntheticIdsIT.java index 654051b9e13f5..4ef7a3f6b286a 100644 --- a/modules/data-streams/src/internalClusterTest/java/org/elasticsearch/datastreams/TSDBSyntheticIdsIT.java +++ b/modules/data-streams/src/internalClusterTest/java/org/elasticsearch/datastreams/TSDBSyntheticIdsIT.java @@ -54,6 +54,7 @@ import static org.hamcrest.Matchers.containsString; import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.notNullValue; +import static org.hamcrest.Matchers.nullValue; /** * Test suite for time series indices that use synthetic ids for documents. 
@@ -103,8 +104,10 @@ public void testInvalidIndexMode() { public void testSyntheticId() throws Exception { assumeTrue("Test should only run with feature flag", IndexSettings.TSDB_SYNTHETIC_ID_FEATURE_FLAG); + assumeTrue("Test should only run with feature flag", IndexSettings.USE_STORED_FIELDS_BLOOM_FILTER_FOR_ID_FEATURE_FLAG); final var dataStreamName = randomIdentifier(); - putDataStreamTemplate(dataStreamName, randomIntBetween(1, 5)); + final var enableStoredFieldsBloomFilter = randomBoolean(); + putDataStreamTemplate(dataStreamName, randomIntBetween(1, 5), enableStoredFieldsBloomFilter); final var docs = new HashMap(); final var unit = randomFrom(ChronoUnit.SECONDS, ChronoUnit.MINUTES); @@ -265,14 +268,21 @@ enum Operation { for (var index : indices) { var diskUsage = diskUsage(index); var diskUsageIdField = AnalyzeIndexDiskUsageTestUtils.getPerFieldDiskUsage(diskUsage, IdFieldMapper.NAME); - assertThat("_id field should not have postings on disk", diskUsageIdField.getInvertedIndexBytes(), equalTo(0L)); + // If the _id stored fields bloom filter is enabled, IndexDiskUsageStats won't account for anything since + // the bloom filter is not exposed through the Reader API. 
+ if (enableStoredFieldsBloomFilter) { + assertThat(diskUsageIdField, nullValue()); + } else { + assertThat("_id field should not have postings on disk", diskUsageIdField.getInvertedIndexBytes(), equalTo(0L)); + } } } public void testGetFromTranslogBySyntheticId() throws Exception { assumeTrue("Test should only run with feature flag", IndexSettings.TSDB_SYNTHETIC_ID_FEATURE_FLAG); final var dataStreamName = randomIdentifier(); - putDataStreamTemplate(dataStreamName, 1); + final var enableStoredFieldsBloomFilter = randomBoolean(); + putDataStreamTemplate(dataStreamName, 1, enableStoredFieldsBloomFilter); final var docs = new HashMap(); final var unit = randomFrom(ChronoUnit.SECONDS, ChronoUnit.MINUTES); @@ -376,7 +386,13 @@ public void testGetFromTranslogBySyntheticId() throws Exception { for (var index : indices) { var diskUsage = diskUsage(index); var diskUsageIdField = AnalyzeIndexDiskUsageTestUtils.getPerFieldDiskUsage(diskUsage, IdFieldMapper.NAME); - assertThat("_id field should not have postings on disk", diskUsageIdField.getInvertedIndexBytes(), equalTo(0L)); + // If the _id stored fields bloom filter is enabled, IndexDiskUsageStats won't account for anything since + // the bloom filter is not exposed through the Reader API. 
+ if (enableStoredFieldsBloomFilter) { + assertThat(diskUsageIdField, nullValue()); + } else { + assertThat("_id field should not have postings on disk", diskUsageIdField.getInvertedIndexBytes(), equalTo(0L)); + } } assertHitCount(client().prepareSearch(dataStreamName).setSize(0), 10L); @@ -413,11 +429,12 @@ private static BulkItemResponse[] createDocuments(String indexName, XContentBuil return bulkResponse.getItems(); } - private static void putDataStreamTemplate(String indexPattern, int shards) throws IOException { + private static void putDataStreamTemplate(String indexPattern, int shards, boolean enableStoredFieldsBloomFilter) throws IOException { final var settings = indexSettings(shards, 0).put(IndexSettings.MODE.getKey(), IndexMode.TIME_SERIES.getName()) .put(IndexSettings.BLOOM_FILTER_ID_FIELD_ENABLED_SETTING.getKey(), false) .put(IndexSettings.INDEX_REFRESH_INTERVAL_SETTING.getKey(), -1) - .put(IndexSettings.USE_SYNTHETIC_ID.getKey(), true); + .put(IndexSettings.USE_SYNTHETIC_ID.getKey(), true) + .put(IndexSettings.USE_STORED_FIELD_BLOOM_FILTER_ID.getKey(), enableStoredFieldsBloomFilter); final var mappings = """ { diff --git a/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/BloomFilter.java b/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/BloomFilter.java new file mode 100644 index 0000000000000..5a011e7c8f86d --- /dev/null +++ b/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/BloomFilter.java @@ -0,0 +1,71 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". 
+ */ + +package org.elasticsearch.index.codec.bloomfilter; + +import org.apache.lucene.codecs.StoredFieldsReader; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.core.Nullable; +import org.elasticsearch.index.codec.storedfields.PerFieldStoredFieldsFormat; + +import java.io.Closeable; +import java.io.IOException; + +public interface BloomFilter extends Closeable { + /** + * Tests whether the given term may exist in the specified field. + * + * @param field the field name to check + * @param term the term to test for membership + * @return true if term may be present, false if definitely absent + */ + boolean mayContainTerm(String field, BytesRef term) throws IOException; + + boolean isFilterAvailable(); + + @Nullable + static BloomFilter maybeGetBloomFilterForField(String field, SegmentReadState state) throws IOException { + var codec = state.segmentInfo.getCodec(); + StoredFieldsReader storedFieldsReader = codec.storedFieldsFormat() + .fieldsReader(state.directory, state.segmentInfo, state.fieldInfos, state.context); + + boolean success = false; + try { + if (storedFieldsReader instanceof PerFieldStoredFieldsFormat.PerFieldStoredFieldsReader perFieldStoredFieldsReader) { + StoredFieldsReader idStoredFieldsReader = perFieldStoredFieldsReader.getReaderForField(field); + if (idStoredFieldsReader instanceof BloomFilter bloomFilter && bloomFilter.isFilterAvailable()) { + success = true; + // We need to close the PerFieldStoredFieldsFormatReader otherwise we'll leak the reader for other fields + return new BloomFilter() { + @Override + public boolean mayContainTerm(String field, BytesRef term) throws IOException { + return bloomFilter.mayContainTerm(field, term); + } + + @Override + public boolean isFilterAvailable() { + return bloomFilter.isFilterAvailable(); + } + + @Override + public void close() throws IOException { + storedFieldsReader.close(); + } + }; + } + } + } finally { + if (success == false) 
{ + storedFieldsReader.close(); + } + } + return null; + } +} diff --git a/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/BloomFilterFieldsProducer.java b/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/BloomFilterFieldsProducer.java new file mode 100644 index 0000000000000..c08cba09b9b6e --- /dev/null +++ b/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/BloomFilterFieldsProducer.java @@ -0,0 +1,83 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.index.codec.bloomfilter; + +import org.apache.lucene.codecs.FieldsProducer; +import org.apache.lucene.index.FilterLeafReader; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.core.IOUtils; +import org.elasticsearch.index.mapper.IdFieldMapper; + +import java.io.IOException; +import java.util.Iterator; +import java.util.Set; + +public class BloomFilterFieldsProducer extends FieldsProducer { + private static final Set FIELD_NAMES = Set.of(IdFieldMapper.NAME); + private final FieldsProducer delegate; + private final BloomFilter bloomFilter; + + public BloomFilterFieldsProducer(FieldsProducer delegate, BloomFilter bloomFilter) { + assert bloomFilter.isFilterAvailable(); + this.delegate = delegate; + this.bloomFilter = bloomFilter; + } + + @Override + public void close() throws IOException { + IOUtils.close(delegate, bloomFilter); + } + + @Override + public void checkIntegrity() throws IOException { + delegate.checkIntegrity(); + } + 
+ @Override + public Iterator iterator() { + return delegate.iterator(); + } + + @Override + public Terms terms(String field) throws IOException { + assert FIELD_NAMES.contains(field) : "Expected one of " + FIELD_NAMES + " but got " + field; + return new FilterLeafReader.FilterTerms(delegate.terms(field)) { + @Override + public TermsEnum iterator() throws IOException { + return new LazyFilterTermsEnum() { + private TermsEnum delegate; + + @Override + protected TermsEnum getDelegate() throws IOException { + if (delegate == null) { + delegate = in.iterator(); + } + return delegate; + } + + @Override + public boolean seekExact(BytesRef text) throws IOException { + if (bloomFilter.mayContainTerm(field, text) == false) { + return false; + } + return getDelegate().seekExact(text); + } + }; + } + }; + } + + @Override + public int size() { + return delegate.size(); + } +} diff --git a/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/BloomFilterProvider.java b/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/BloomFilterProvider.java new file mode 100644 index 0000000000000..71653e9d92bbc --- /dev/null +++ b/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/BloomFilterProvider.java @@ -0,0 +1,20 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". 
+ */ + +package org.elasticsearch.index.codec.bloomfilter; + +import org.elasticsearch.core.Nullable; + +import java.io.Closeable; +import java.io.IOException; + +public interface BloomFilterProvider extends Closeable { + @Nullable + BloomFilter getBloomFilter() throws IOException; +} diff --git a/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/ES87BloomFilterPostingsFormat.java b/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/ES87BloomFilterPostingsFormat.java index 65d2c1317b86e..ad454b6b73e1f 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/ES87BloomFilterPostingsFormat.java +++ b/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/ES87BloomFilterPostingsFormat.java @@ -24,13 +24,10 @@ import org.apache.lucene.codecs.FieldsProducer; import org.apache.lucene.codecs.NormsProducer; import org.apache.lucene.codecs.PostingsFormat; -import org.apache.lucene.index.BaseTermsEnum; import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.Fields; import org.apache.lucene.index.FilterLeafReader; -import org.apache.lucene.index.ImpactsEnum; import org.apache.lucene.index.IndexFileNames; -import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.SegmentInfo; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; @@ -41,7 +38,6 @@ import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.RandomAccessInput; -import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.BytesRef; import org.elasticsearch.common.bytes.BytesReference; import org.elasticsearch.common.lucene.store.IndexOutputOutputStream; @@ -51,7 +47,6 @@ import java.io.Closeable; import java.io.IOException; -import java.io.UncheckedIOException; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; @@ -437,7 +432,7 @@ public TermsEnum iterator() throws IOException { 
private TermsEnum delegate; @Override - TermsEnum getDelegate() throws IOException { + protected TermsEnum getDelegate() throws IOException { if (delegate == null) { delegate = in.iterator(); } @@ -467,64 +462,6 @@ public TermState termState() throws IOException { } } - private abstract static class LazyFilterTermsEnum extends BaseTermsEnum { - abstract TermsEnum getDelegate() throws IOException; - - @Override - public SeekStatus seekCeil(BytesRef text) throws IOException { - return getDelegate().seekCeil(text); - } - - @Override - public void seekExact(long ord) throws IOException { - getDelegate().seekExact(ord); - } - - @Override - public BytesRef term() throws IOException { - return getDelegate().term(); - } - - @Override - public long ord() throws IOException { - return getDelegate().ord(); - } - - @Override - public int docFreq() throws IOException { - return getDelegate().docFreq(); - } - - @Override - public long totalTermFreq() throws IOException { - return getDelegate().totalTermFreq(); - } - - @Override - public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException { - return getDelegate().postings(reuse, flags); - } - - @Override - public ImpactsEnum impacts(int flags) throws IOException { - return getDelegate().impacts(flags); - } - - @Override - public BytesRef next() throws IOException { - return getDelegate().next(); - } - - @Override - public AttributeSource attributes() { - try { - return getDelegate().attributes(); - } catch (IOException e) { - throw new UncheckedIOException(e); - } - } - } - static int bloomFilterSize(int maxDocs) { if (maxDocs < 1) { throw new IllegalStateException("maxDocs must be greater than or equal to 1, got " + maxDocs); diff --git a/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormat.java b/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormat.java index 224619950baed..f6122b4d7a96b 100644 --- 
a/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormat.java +++ b/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormat.java @@ -547,7 +547,7 @@ public void close() throws IOException { } } - private static class Reader extends StoredFieldsReader implements BloomFilterProvider { + private static class Reader extends StoredFieldsReader implements BloomFilter { @Nullable private final BloomFilterFieldReader bloomFilterFieldReader; @@ -578,8 +578,13 @@ public void document(int docID, StoredFieldVisitor visitor) throws IOException { } @Override - public BloomFilter getBloomFilter() throws IOException { - return bloomFilterFieldReader; + public boolean mayContainTerm(String field, BytesRef term) throws IOException { + return bloomFilterFieldReader == null || bloomFilterFieldReader.mayContainTerm(field, term); + } + + @Override + public boolean isFilterAvailable() { + return bloomFilterFieldReader != null; } } @@ -609,7 +614,7 @@ static BloomFilterMetadata readFrom(IndexInput in, FieldInfos fieldInfos) throws } } - static class BloomFilterFieldReader implements BloomFilter { + static class BloomFilterFieldReader implements Closeable { private final FieldInfo fieldInfo; private final IndexInput bloomFilterData; private final RandomAccessInput bloomFilterIn; @@ -740,20 +745,4 @@ private static String bloomFilterMetadataFileName(SegmentInfo segmentInfo) { private static String bloomFilterFileName(SegmentInfo segmentInfo) { return IndexFileNames.segmentFileName(segmentInfo.name, DEFAULT_SEGMENT_SUFFIX, STORED_FIELDS_BLOOM_FILTER_EXTENSION); } - - public interface BloomFilter extends Closeable { - /** - * Tests whether the given term may exist in the specified field. 
- * - * @param field the field name to check - * @param term the term to test for membership - * @return true if term may be present, false if definitely absent - */ - boolean mayContainTerm(String field, BytesRef term) throws IOException; - } - - public interface BloomFilterProvider extends Closeable { - @Nullable - BloomFilter getBloomFilter() throws IOException; - } } diff --git a/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/LazyFilterTermsEnum.java b/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/LazyFilterTermsEnum.java new file mode 100644 index 0000000000000..b8baf48037177 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/LazyFilterTermsEnum.java @@ -0,0 +1,78 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". 
+ */ + +package org.elasticsearch.index.codec.bloomfilter; + +import org.apache.lucene.index.BaseTermsEnum; +import org.apache.lucene.index.ImpactsEnum; +import org.apache.lucene.index.PostingsEnum; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.BytesRef; + +import java.io.IOException; +import java.io.UncheckedIOException; + +public abstract class LazyFilterTermsEnum extends BaseTermsEnum { + protected abstract TermsEnum getDelegate() throws IOException; + + @Override + public SeekStatus seekCeil(BytesRef text) throws IOException { + return getDelegate().seekCeil(text); + } + + @Override + public void seekExact(long ord) throws IOException { + getDelegate().seekExact(ord); + } + + @Override + public BytesRef term() throws IOException { + return getDelegate().term(); + } + + @Override + public long ord() throws IOException { + return getDelegate().ord(); + } + + @Override + public int docFreq() throws IOException { + return getDelegate().docFreq(); + } + + @Override + public long totalTermFreq() throws IOException { + return getDelegate().totalTermFreq(); + } + + @Override + public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException { + return getDelegate().postings(reuse, flags); + } + + @Override + public ImpactsEnum impacts(int flags) throws IOException { + return getDelegate().impacts(flags); + } + + @Override + public BytesRef next() throws IOException { + return getDelegate().next(); + } + + @Override + public AttributeSource attributes() { + try { + return getDelegate().attributes(); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } +} diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/TSDBSyntheticIdPostingsFormat.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/TSDBSyntheticIdPostingsFormat.java index 66a6aa7151c6b..7eb35fc4427bb 100644 --- 
a/server/src/main/java/org/elasticsearch/index/codec/tsdb/TSDBSyntheticIdPostingsFormat.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/TSDBSyntheticIdPostingsFormat.java @@ -16,7 +16,10 @@ import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; import org.elasticsearch.core.IOUtils; +import org.elasticsearch.index.codec.bloomfilter.BloomFilter; +import org.elasticsearch.index.codec.bloomfilter.BloomFilterFieldsProducer; import org.elasticsearch.index.mapper.DataStreamTimestampFieldMapper; +import org.elasticsearch.index.mapper.IdFieldMapper; import org.elasticsearch.index.mapper.SyntheticIdField; import org.elasticsearch.index.mapper.TimeSeriesIdFieldMapper; import org.elasticsearch.index.mapper.TimeSeriesRoutingHashFieldMapper; @@ -43,11 +46,13 @@ public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException boolean success = false; try { var codec = state.segmentInfo.getCodec(); + BloomFilter bloomFilter = BloomFilter.maybeGetBloomFilterForField(IdFieldMapper.NAME, state); + // Erase the segment suffix (used only for reading postings) docValuesProducer = codec.docValuesFormat().fieldsProducer(new SegmentReadState(state, "")); var fieldsProducer = new TSDBSyntheticIdFieldsProducer(state, docValuesProducer); success = true; - return fieldsProducer; + return bloomFilter == null ? 
fieldsProducer : new BloomFilterFieldsProducer(fieldsProducer, bloomFilter); } finally { if (success == false) { IOUtils.close(docValuesProducer); diff --git a/server/src/test/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormatTests.java b/server/src/test/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormatTests.java index d8fdd14ab38eb..8113e574de898 100644 --- a/server/src/test/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormatTests.java +++ b/server/src/test/java/org/elasticsearch/index/codec/bloomfilter/ES93BloomFilterStoredFieldsFormatTests.java @@ -181,7 +181,7 @@ private static List indexDocs(IndexWriter writer) throws IOException { private void assertBloomFilterTestsPositiveForExistingDocs(IndexWriter writer, List indexedIds) throws IOException { try (var directoryReader = StandardDirectoryReader.open(writer)) { for (LeafReaderContext leaf : directoryReader.leaves()) { - try (ES93BloomFilterStoredFieldsFormat.BloomFilter bloomFilter = getBloomFilterProvider(leaf)) { + try (BloomFilter bloomFilter = getBloomFilter(leaf)) { // the bloom filter reader is null only if the _id field is not stored during indexing assertThat(bloomFilter, is(not(nullValue()))); @@ -211,7 +211,7 @@ private static BytesRef getBytesRefFromString(String string) { return new BytesRef(string.getBytes(StandardCharsets.UTF_8)); } - private ES93BloomFilterStoredFieldsFormat.BloomFilter getBloomFilterProvider(LeafReaderContext leafReaderContext) throws IOException { + private BloomFilter getBloomFilter(LeafReaderContext leafReaderContext) throws IOException { LeafReader reader = leafReaderContext.reader(); FieldInfos fieldInfos = reader.getFieldInfos(); assertThat(reader, is(instanceOf(SegmentReader.class))); @@ -227,17 +227,20 @@ private ES93BloomFilterStoredFieldsFormat.BloomFilter getBloomFilterProvider(Lea StoredFieldsReader bloomFilterReader = 
perFieldStoredFieldsReader.getReaderForField(IdFieldMapper.NAME); - assertThat(bloomFilterReader, is(instanceOf(ES93BloomFilterStoredFieldsFormat.BloomFilterProvider.class))); - ES93BloomFilterStoredFieldsFormat.BloomFilterProvider bloomFilterProvider = - (ES93BloomFilterStoredFieldsFormat.BloomFilterProvider) bloomFilterReader; - var bloomFilter = bloomFilterProvider.getBloomFilter(); + assertThat(bloomFilterReader, is(instanceOf(BloomFilter.class))); + BloomFilter bloomFilter = (BloomFilter) bloomFilterReader; // Wrap the reader in a bloom filter so we can close it after we're done with it - return new ES93BloomFilterStoredFieldsFormat.BloomFilter() { + return new BloomFilter() { @Override public boolean mayContainTerm(String field, BytesRef term) throws IOException { return bloomFilter.mayContainTerm(field, term); } + @Override + public boolean isFilterAvailable() { + return bloomFilter.isFilterAvailable(); + } + @Override public void close() throws IOException { storedFieldsReader.close(); From 19a699271902c98d6624e06818b3e03c4d61b752 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Fern=C3=A1ndez=20Casta=C3=B1o?= Date: Thu, 20 Nov 2025 12:25:51 +0100 Subject: [PATCH 6/7] Update docs/changelog/138357.yaml --- docs/changelog/138357.yaml | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 docs/changelog/138357.yaml diff --git a/docs/changelog/138357.yaml b/docs/changelog/138357.yaml new file mode 100644 index 0000000000000..e8f3f23c47983 --- /dev/null +++ b/docs/changelog/138357.yaml @@ -0,0 +1,5 @@ +pr: 138357 +summary: Integrate bloom filter checks with TSDBSyntheticIdPostingsFormat +area: Codec +type: enhancement +issues: [] From 0c7a397008b420544d2b257357bbaeeb1bb50ee4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Fern=C3=A1ndez=20Casta=C3=B1o?= Date: Thu, 20 Nov 2025 12:28:37 +0100 Subject: [PATCH 7/7] Remove unused interface --- .../bloomfilter/BloomFilterProvider.java | 20 ------------------- 1 file changed, 20 deletions(-) delete mode 
100644 server/src/main/java/org/elasticsearch/index/codec/bloomfilter/BloomFilterProvider.java diff --git a/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/BloomFilterProvider.java b/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/BloomFilterProvider.java deleted file mode 100644 index 71653e9d92bbc..0000000000000 --- a/server/src/main/java/org/elasticsearch/index/codec/bloomfilter/BloomFilterProvider.java +++ /dev/null @@ -1,20 +0,0 @@ -/* - * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one - * or more contributor license agreements. Licensed under the "Elastic License - * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side - * Public License v 1"; you may not use this file except in compliance with, at - * your election, the "Elastic License 2.0", the "GNU Affero General Public - * License v3.0 only", or the "Server Side Public License, v 1". - */ - -package org.elasticsearch.index.codec.bloomfilter; - -import org.elasticsearch.core.Nullable; - -import java.io.Closeable; -import java.io.IOException; - -public interface BloomFilterProvider extends Closeable { - @Nullable - BloomFilter getBloomFilter() throws IOException; -}