-
Notifications
You must be signed in to change notification settings - Fork 25.6k
Integrate stored fields format bloom filter with synthetic _id #138515
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
429c14a
38984b6
24fc7e8
60b1eb7
7f6d6f3
a72a66a
9d0a1f8
799fb3a
089f6f5
d0d94ef
e01019b
e1cbce6
680edf1
3e632b8
279cff3
acc82c1
cfec7f9
92a6daa
ad711c3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,5 @@ | ||
| pr: 138515 | ||
| summary: Integrate stored fields format bloom filter with synthetic `_id` | ||
| area: Codec | ||
| type: enhancement | ||
| issues: [] |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -16,8 +16,7 @@ | |
| import org.elasticsearch.common.util.BigArrays; | ||
| import org.elasticsearch.common.util.FeatureFlag; | ||
| import org.elasticsearch.core.Nullable; | ||
| import org.elasticsearch.index.IndexMode; | ||
| import org.elasticsearch.index.codec.tsdb.TSDBSyntheticIdCodec; | ||
| import org.elasticsearch.index.IndexVersions; | ||
| import org.elasticsearch.index.codec.zstd.Zstd814StoredFieldsFormat; | ||
| import org.elasticsearch.index.mapper.MapperService; | ||
|
|
||
|
|
@@ -48,27 +47,56 @@ public class CodecService implements CodecProvider { | |
| public CodecService(@Nullable MapperService mapperService, BigArrays bigArrays) { | ||
| final var codecs = new HashMap<String, Codec>(); | ||
|
|
||
| Codec legacyBestSpeedCodec = new LegacyPerFieldMapperCodec(Lucene103Codec.Mode.BEST_SPEED, mapperService, bigArrays); | ||
| boolean useSyntheticId = mapperService != null | ||
| && mapperService.getIndexSettings().useTimeSeriesSyntheticId() | ||
| && mapperService.getIndexSettings() | ||
| .getIndexVersionCreated() | ||
| .onOrAfter(IndexVersions.TIME_SERIES_USE_STORED_FIELDS_BLOOM_FILTER_FOR_ID); | ||
|
|
||
| var legacyBestSpeedCodec = new LegacyPerFieldMapperCodec(Lucene103Codec.Mode.BEST_SPEED, mapperService, bigArrays); | ||
| if (ZSTD_STORED_FIELDS_FEATURE_FLAG) { | ||
| codecs.put(DEFAULT_CODEC, new PerFieldMapperCodec(Zstd814StoredFieldsFormat.Mode.BEST_SPEED, mapperService, bigArrays)); | ||
| PerFieldMapperCodec defaultZstdCodec = new PerFieldMapperCodec( | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If we want to reduce the scope of this change, we could create our own default_code_with_synthetic_id and hard-coded this in INDEX_CODEC_SETTING for all time-series with use_synthetic_id enabled. Here we go for the complete solution immediately, for which I'm ok too.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't have a strong opinion about this. I'm ok with both approaches. The downside of an extra codec is that we need to maintain it indefinitely whereas with this change as long as the feature flag is off we keep the current behaviour. |
||
| Zstd814StoredFieldsFormat.Mode.BEST_SPEED, | ||
| mapperService, | ||
| bigArrays | ||
| ); | ||
| codecs.put( | ||
| DEFAULT_CODEC, | ||
| useSyntheticId ? new ES93TSDBZSTDCompressionLucene103Codec(defaultZstdCodec, bigArrays) : defaultZstdCodec | ||
| ); | ||
| } else { | ||
| codecs.put(DEFAULT_CODEC, legacyBestSpeedCodec); | ||
| codecs.put( | ||
| DEFAULT_CODEC, | ||
| useSyntheticId ? new ES93TSDBDefaultCompressionLucene103Codec(legacyBestSpeedCodec, bigArrays) : legacyBestSpeedCodec | ||
| ); | ||
| } | ||
| codecs.put(LEGACY_DEFAULT_CODEC, legacyBestSpeedCodec); | ||
|
|
||
| codecs.put( | ||
| LEGACY_DEFAULT_CODEC, | ||
| useSyntheticId ? new ES93TSDBDefaultCompressionLucene103Codec(legacyBestSpeedCodec, bigArrays) : legacyBestSpeedCodec | ||
| ); | ||
|
|
||
| var bestCompressionCodec = new PerFieldMapperCodec(Zstd814StoredFieldsFormat.Mode.BEST_COMPRESSION, mapperService, bigArrays); | ||
| codecs.put( | ||
| BEST_COMPRESSION_CODEC, | ||
| new PerFieldMapperCodec(Zstd814StoredFieldsFormat.Mode.BEST_COMPRESSION, mapperService, bigArrays) | ||
| useSyntheticId ? new ES93TSDBZSTDCompressionLucene103Codec(bestCompressionCodec, bigArrays) : bestCompressionCodec | ||
| ); | ||
|
|
||
| var legacyBestCompressionCodec = new LegacyPerFieldMapperCodec(Lucene103Codec.Mode.BEST_COMPRESSION, mapperService, bigArrays); | ||
| codecs.put( | ||
| LEGACY_BEST_COMPRESSION_CODEC, | ||
| useSyntheticId | ||
| ? new ES93TSDBDefaultCompressionLucene103Codec(legacyBestCompressionCodec, bigArrays) | ||
| : legacyBestCompressionCodec | ||
| ); | ||
| Codec legacyBestCompressionCodec = new LegacyPerFieldMapperCodec(Lucene103Codec.Mode.BEST_COMPRESSION, mapperService, bigArrays); | ||
| codecs.put(LEGACY_BEST_COMPRESSION_CODEC, legacyBestCompressionCodec); | ||
|
|
||
| codecs.put(LUCENE_DEFAULT_CODEC, Codec.getDefault()); | ||
| codecs.put( | ||
| LUCENE_DEFAULT_CODEC, | ||
| useSyntheticId ? new ES93TSDBLuceneDefaultCodec(Codec.getDefault(), bigArrays) : Codec.getDefault() | ||
| ); | ||
| for (String codec : Codec.availableCodecs()) { | ||
| codecs.put(codec, Codec.forName(codec)); | ||
| } | ||
| final boolean useTsdbSyntheticId = mapperService != null && mapperService.getIndexSettings().useTimeSeriesSyntheticId(); | ||
| assert useTsdbSyntheticId == false || mapperService.getIndexSettings().getMode() == IndexMode.TIME_SERIES; | ||
|
|
||
| this.codecs = codecs.entrySet().stream().collect(Collectors.toUnmodifiableMap(Map.Entry::getKey, e -> { | ||
| Codec codec; | ||
|
|
@@ -77,9 +105,6 @@ public CodecService(@Nullable MapperService mapperService, BigArrays bigArrays) | |
| } else { | ||
| codec = new DeduplicateFieldInfosCodec(e.getValue().getName(), e.getValue()); | ||
| } | ||
| if (useTsdbSyntheticId && codec instanceof TSDBSyntheticIdCodec == false) { | ||
| codec = new TSDBSyntheticIdCodec(codec.getName(), codec); | ||
| } | ||
| return codec; | ||
| })); | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,24 @@ | ||
| /* | ||
| * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one | ||
| * or more contributor license agreements. Licensed under the "Elastic License | ||
| * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side | ||
| * Public License v 1"; you may not use this file except in compliance with, at | ||
| * your election, the "Elastic License 2.0", the "GNU Affero General Public | ||
| * License v3.0 only", or the "Server Side Public License, v 1". | ||
| */ | ||
|
|
||
| package org.elasticsearch.index.codec; | ||
|
|
||
| import org.apache.lucene.codecs.lucene103.Lucene103Codec; | ||
| import org.elasticsearch.common.util.BigArrays; | ||
|
|
||
| public class ES93TSDBDefaultCompressionLucene103Codec extends TSDBCodecWithSyntheticId { | ||
| /** Public no-arg constructor, needed for SPI loading at read-time. */ | ||
| public ES93TSDBDefaultCompressionLucene103Codec() { | ||
| this(new Lucene103Codec(), null); | ||
| } | ||
|
|
||
| ES93TSDBDefaultCompressionLucene103Codec(Lucene103Codec delegate, BigArrays bigArrays) { | ||
| super("ES93TSDBDefaultCompressionLucene103Codec", delegate, bigArrays); | ||
| } | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,25 @@ | ||
| /* | ||
| * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one | ||
| * or more contributor license agreements. Licensed under the "Elastic License | ||
| * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side | ||
| * Public License v 1"; you may not use this file except in compliance with, at | ||
| * your election, the "Elastic License 2.0", the "GNU Affero General Public | ||
| * License v3.0 only", or the "Server Side Public License, v 1". | ||
| */ | ||
|
|
||
| package org.elasticsearch.index.codec; | ||
|
|
||
| import org.apache.lucene.codecs.Codec; | ||
| import org.apache.lucene.codecs.lucene103.Lucene103Codec; | ||
| import org.elasticsearch.common.util.BigArrays; | ||
|
|
||
| public class ES93TSDBLuceneDefaultCodec extends TSDBCodecWithSyntheticId { | ||
| /** Public no-arg constructor, needed for SPI loading at read-time. */ | ||
| public ES93TSDBLuceneDefaultCodec() { | ||
| this(new Lucene103Codec(), null); | ||
| } | ||
|
|
||
| ES93TSDBLuceneDefaultCodec(Codec delegate, BigArrays bigArrays) { | ||
| super("ES93TSDBLuceneDefaultCodec", delegate, bigArrays); | ||
| } | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,23 @@ | ||
| /* | ||
| * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one | ||
| * or more contributor license agreements. Licensed under the "Elastic License | ||
| * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side | ||
| * Public License v 1"; you may not use this file except in compliance with, at | ||
| * your election, the "Elastic License 2.0", the "GNU Affero General Public | ||
| * License v3.0 only", or the "Server Side Public License, v 1". | ||
| */ | ||
|
|
||
| package org.elasticsearch.index.codec; | ||
|
|
||
| import org.elasticsearch.common.util.BigArrays; | ||
|
|
||
| public class ES93TSDBZSTDCompressionLucene103Codec extends TSDBCodecWithSyntheticId { | ||
| /** Public no-arg constructor, needed for SPI loading at read-time. */ | ||
| public ES93TSDBZSTDCompressionLucene103Codec() { | ||
| this(new Elasticsearch92Lucene103Codec(), null); | ||
| } | ||
|
|
||
| ES93TSDBZSTDCompressionLucene103Codec(Elasticsearch92Lucene103Codec delegate, BigArrays bigArrays) { | ||
| super("ES93TSDBZSTDCompressionLucene103Codec", delegate, bigArrays); | ||
| } | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,63 @@ | ||
| /* | ||
| * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one | ||
| * or more contributor license agreements. Licensed under the "Elastic License | ||
| * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side | ||
| * Public License v 1"; you may not use this file except in compliance with, at | ||
| * your election, the "Elastic License 2.0", the "GNU Affero General Public | ||
| * License v3.0 only", or the "Server Side Public License, v 1". | ||
| */ | ||
|
|
||
| package org.elasticsearch.index.codec; | ||
|
|
||
| import org.apache.lucene.codecs.Codec; | ||
| import org.apache.lucene.codecs.FilterCodec; | ||
| import org.apache.lucene.codecs.StoredFieldsFormat; | ||
| import org.elasticsearch.common.util.BigArrays; | ||
| import org.elasticsearch.index.codec.bloomfilter.ES93BloomFilterStoredFieldsFormat; | ||
| import org.elasticsearch.index.codec.storedfields.TSDBStoredFieldsFormat; | ||
| import org.elasticsearch.index.codec.tsdb.TSDBSyntheticIdCodec; | ||
| import org.elasticsearch.index.mapper.IdFieldMapper; | ||
|
|
||
| /** | ||
| * Abstract base class for ES codecs used with time-series ({@code TIME_SERIES}) indices | ||
| * that employ synthetic document IDs for storage optimization. | ||
| * | ||
| * <p>This class configures the codec to use the following formats: | ||
| * <ul> | ||
| * <li> | ||
| * Use {@link TSDBSyntheticIdCodec} as the underlying codec for synthesizing the `_id` field from | ||
| * the values of other fields of the document (ex: _tsid, @timestamp, etc.) so that no inverted index | ||
| * or stored field are required for the `_id`. As such, looking up documents by `_id` might be very | ||
| * slow and that's why it is used along with a Bloom filter. | ||
| * </li> | ||
| * <li> | ||
| * Apply {@link TSDBStoredFieldsFormat} with bloom filter optimization for efficient ID lookups | ||
| * </li> | ||
| * </ul> | ||
| * | ||
| * <p>Synthetic IDs in TSDB indices are generated from the document's dimensions and timestamp, | ||
| * replacing the standard {@code _id} field to reduce storage overhead. | ||
| * | ||
| * @see TSDBSyntheticIdCodec | ||
| * @see TSDBStoredFieldsFormat | ||
| */ | ||
| abstract class TSDBCodecWithSyntheticId extends FilterCodec { | ||
| private final TSDBStoredFieldsFormat storedFieldsFormat; | ||
|
|
||
| TSDBCodecWithSyntheticId(String name, Codec delegate, BigArrays bigArrays) { | ||
| super(name, new TSDBSyntheticIdCodec(delegate)); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We can merge TSDBSyntheticIdCodec and TSDBCodecWithSyntheticId together in a follow up.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok, I just saw your #138515 (comment) 👍
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm planning to incorporate the code from |
||
| this.storedFieldsFormat = new TSDBStoredFieldsFormat( | ||
| delegate.storedFieldsFormat(), | ||
| new ES93BloomFilterStoredFieldsFormat( | ||
| bigArrays, | ||
| ES93BloomFilterStoredFieldsFormat.DEFAULT_BLOOM_FILTER_SIZE, | ||
| IdFieldMapper.NAME | ||
| ) | ||
| ); | ||
| } | ||
|
|
||
| @Override | ||
| public StoredFieldsFormat storedFieldsFormat() { | ||
| return storedFieldsFormat; | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,28 @@ | ||
| /* | ||
| * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one | ||
| * or more contributor license agreements. Licensed under the "Elastic License | ||
| * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side | ||
| * Public License v 1"; you may not use this file except in compliance with, at | ||
| * your election, the "Elastic License 2.0", the "GNU Affero General Public | ||
| * License v3.0 only", or the "Server Side Public License, v 1". | ||
| */ | ||
|
|
||
| package org.elasticsearch.index.codec.bloomfilter; | ||
|
|
||
| import org.apache.lucene.util.BytesRef; | ||
|
|
||
| import java.io.Closeable; | ||
| import java.io.IOException; | ||
|
|
||
| public interface BloomFilter extends Closeable { | ||
| /** | ||
| * Tests whether the given term may exist in the specified field. | ||
| * | ||
| * @param field the field name to check | ||
| * @param term the term to test for membership | ||
| * @return true if term may be present, false if definitely absent | ||
| */ | ||
| boolean mayContainTerm(String field, BytesRef term) throws IOException; | ||
|
|
||
| boolean isFilterAvailable(); | ||
| } |
Uh oh!
There was an error while loading. Please reload this page.