From e562e8c6f4ea97d210c80800d92ea6b5945aeab2 Mon Sep 17 00:00:00 2001 From: tlrx Date: Fri, 24 Oct 2025 18:34:18 +0200 Subject: [PATCH 01/20] Change document _id format for time series datastreams --- .../datastreams/TSDBSyntheticIdsIT.java | 261 +++++++-- .../cluster/routing/IndexRouting.java | 26 +- .../lucene/uid/VersionsAndSeqNoResolver.java | 31 +- .../elasticsearch/index/IndexVersions.java | 1 + .../codec/tsdb/TSDBSyntheticIdCodec.java | 51 ++ .../tsdb/TSDBSyntheticIdFieldsProducer.java | 505 +++++++++++++----- .../tsdb/TSDBSyntheticIdPostingsFormat.java | 2 + .../index/engine/InternalEngine.java | 13 +- .../index/mapper/ParsedDocument.java | 29 +- .../mapper/TsidExtractingIdFieldMapper.java | 56 +- 10 files changed, 767 insertions(+), 208 deletions(-) diff --git a/modules/data-streams/src/internalClusterTest/java/org/elasticsearch/datastreams/TSDBSyntheticIdsIT.java b/modules/data-streams/src/internalClusterTest/java/org/elasticsearch/datastreams/TSDBSyntheticIdsIT.java index b0d14d0d80221..81e99d154060d 100644 --- a/modules/data-streams/src/internalClusterTest/java/org/elasticsearch/datastreams/TSDBSyntheticIdsIT.java +++ b/modules/data-streams/src/internalClusterTest/java/org/elasticsearch/datastreams/TSDBSyntheticIdsIT.java @@ -34,15 +34,18 @@ import java.io.IOException; import java.time.Instant; +import java.time.temporal.ChronoUnit; import java.util.ArrayList; import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Locale; import java.util.Map; -import java.util.Random; import static org.elasticsearch.common.time.FormatNames.STRICT_DATE_OPTIONAL_TIME; import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked; +import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHitCount; import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertNoFailures; import static org.hamcrest.Matchers.containsString; import static 
org.hamcrest.Matchers.equalTo; @@ -97,51 +100,56 @@ public void testInvalidIndexMode() { @TestLogging(reason = "debug", value = "org.elasticsearch.index.engine.Engine:TRACE") public void testSyntheticId() throws Exception { assumeTrue("Test should only run with feature flag", IndexSettings.TSDB_SYNTHETIC_ID_FEATURE_FLAG); - final var indexName = randomIdentifier(); - putDataStreamTemplate(random(), indexName); + final var dataStreamName = randomIdentifier(); + putDataStreamTemplate(dataStreamName, randomIntBetween(1, 3)); + final var docs = new HashMap(); + final var unit = randomFrom(ChronoUnit.SECONDS, ChronoUnit.MINUTES); final var timestamp = Instant.now(); - // Index 5 docs in datastream + // Index 10 docs in datastream + // + // For convenience, the metric value maps the index in the bulk response items var results = createDocuments( - indexName, - document(timestamp, "vm-dev01", "cpu-load", 0), // will be updated - document(timestamp.plusSeconds(2), "vm-dev01", "cpu-load", 1), // will be deleted - document(timestamp, "vm-dev02", "cpu-load", 2), - document(timestamp.plusSeconds(2), "vm-dev03", "cpu-load", 3), - document(timestamp.plusSeconds(3), "vm-dev03", "cpu-load", 4) + dataStreamName, + // t + 0s + document(timestamp, "vm-dev01", "cpu-load", 0), + document(timestamp, "vm-dev02", "cpu-load", 1), + // t + 1s + document(timestamp.plus(1, unit), "vm-dev01", "cpu-load", 2), + document(timestamp.plus(1, unit), "vm-dev02", "cpu-load", 3), + // t + 0s out-of-order doc + document(timestamp, "vm-dev03", "cpu-load", 4), + // t + 2s + document(timestamp.plus(2, unit), "vm-dev01", "cpu-load", 5), + document(timestamp.plus(2, unit), "vm-dev02", "cpu-load", 6), + // t - 1s out-of-order doc + document(timestamp.minus(1, unit), "vm-dev01", "cpu-load", 7), + // t + 3s + document(timestamp.plus(3, unit), "vm-dev01", "cpu-load", 8), + document(timestamp.plus(3, unit), "vm-dev02", "cpu-load", 9) ); - // Verify documents - assertThat(results[0].getResponse().getResult(), 
equalTo(DocWriteResponse.Result.CREATED)); - assertThat(results[0].getVersion(), equalTo(1L)); - - assertThat(results[1].getResponse().getResult(), equalTo(DocWriteResponse.Result.CREATED)); - assertThat(results[1].getVersion(), equalTo(1L)); - - assertThat(results[2].getResponse().getResult(), equalTo(DocWriteResponse.Result.CREATED)); - assertThat(results[2].getVersion(), equalTo(1L)); - - assertThat(results[3].getResponse().getResult(), equalTo(DocWriteResponse.Result.CREATED)); - assertThat(results[3].getVersion(), equalTo(1L)); - - assertThat(results[4].getResponse().getResult(), equalTo(DocWriteResponse.Result.CREATED)); - assertThat(results[4].getVersion(), equalTo(1L)); - - final var docIndex = results[1].getIndex(); - final var docId = results[1].getId(); + // Verify that documents are created + for (var result : results) { + assertThat(result.getResponse().getResult(), equalTo(DocWriteResponse.Result.CREATED)); + assertThat(result.getVersion(), equalTo(1L)); + docs.put(result.getId(), result.getIndex()); + } enum Operation { FLUSH, REFRESH, NONE } + + // Random flush or refresh or nothing, so that the next GETs are executed on flushed segments or in memory segments. 
switch (randomFrom(Operation.values())) { case FLUSH: - flush(indexName); + flush(dataStreamName); break; case REFRESH: - refresh(indexName); + refresh(dataStreamName); break; case NONE: default: @@ -149,46 +157,183 @@ enum Operation { } // Get by synthetic _id - // Note: before synthetic _id this would have required postings on disks - var getResponse = client().prepareGet(docIndex, docId).setFetchSource(true).execute().actionGet(); - assertThat(getResponse.isExists(), equalTo(true)); - assertThat(getResponse.getVersion(), equalTo(1L)); - var source = asInstanceOf(Map.class, getResponse.getSourceAsMap().get("metric")); - assertThat(asInstanceOf(Integer.class, source.get("value")), equalTo(1)); + var randomDocs = randomSubsetOf(randomIntBetween(0, results.length), results); + for (var doc : randomDocs) { + boolean fetchSource = randomBoolean(); + var getResponse = client().prepareGet(doc.getIndex(), doc.getId()).setFetchSource(fetchSource).execute().actionGet(); + assertThat(getResponse.isExists(), equalTo(true)); + assertThat(getResponse.getVersion(), equalTo(1L)); + + if (fetchSource) { + var source = asInstanceOf(Map.class, getResponse.getSourceAsMap().get("metric")); + assertThat(asInstanceOf(Integer.class, source.get("value")), equalTo(doc.getItemId())); + } + } // Update by synthetic _id + // // Note: it doesn't work, is that expected? 
Is is blocked by IndexRouting.ExtractFromSource.updateShard + var updateDocId = randomFrom(docs.keySet()); + var updateDocIndex = docs.get(updateDocId); var exception = expectThrows(IllegalArgumentException.class, () -> { var doc = document(timestamp, "vm-dev01", "cpu-load", 10); // update - client().prepareUpdate(docIndex, docId).setDoc(doc).get(); + client().prepareUpdate(updateDocIndex, updateDocId).setDoc(doc).get(); }); assertThat( exception.getMessage(), - containsString("update is not supported because the destination index [" + docIndex + "] is in time_series mode") + containsString("update is not supported because the destination index [" + updateDocIndex + "] is in time_series mode") ); + // Random flush or refresh or nothing, so that the next DELETEs are executed on flushed segments or in memory segments. + switch (randomFrom(Operation.values())) { + case FLUSH: + flush(dataStreamName); + break; + case REFRESH: + refresh(dataStreamName); + break; + case NONE: + default: + break; + } + // Delete by synthetic _id - var deleteResponse = client().prepareDelete(docIndex, docId).get(); - assertThat(deleteResponse.getId(), equalTo(docId)); - assertThat(deleteResponse.getResult(), equalTo(DocWriteResponse.Result.DELETED)); - assertThat(deleteResponse.getVersion(), equalTo(2L)); - - // Index more docs - // TODO Randomize this to have segments only composed of deleted docs - createDocuments( - indexName, - document(timestamp.plusSeconds(4), "vm-dev03", "cpu-load", 5), - document(timestamp.plusSeconds(5), "vm-dev03", "cpu-load", 6) + var deletedDocs = randomSubsetOf(randomIntBetween(1, docs.size()), docs.keySet()); + for (var deletedDocId : deletedDocs) { + var deletedDocIndex = docs.get(deletedDocId); + + // Delete + var deleteResponse = client().prepareDelete(deletedDocIndex, deletedDocId).get(); + assertThat(deleteResponse.getId(), equalTo(deletedDocId)); + assertThat(deleteResponse.getIndex(), equalTo(deletedDocIndex)); + assertThat(deleteResponse.getResult(), 
equalTo(DocWriteResponse.Result.DELETED)); + assertThat(deleteResponse.getVersion(), equalTo(2L)); + + // Get returns "not found" + var getResponse = client().prepareGet(deletedDocIndex, deletedDocId).get(); + assertThat(getResponse.getId(), equalTo(deletedDocId)); + assertThat(getResponse.getIndex(), equalTo(deletedDocIndex)); + assertThat(getResponse.isExists(), equalTo(false)); + } + + flushAndRefresh(dataStreamName); + + // Check that synthetic _id field has no postings on disk + var indices = new HashSet<>(docs.values()); + for (var index : indices) { + var diskUsage = diskUsage(index); + var diskUsageIdField = AnalyzeIndexDiskUsageTestUtils.getPerFieldDiskUsage(diskUsage, IdFieldMapper.NAME); + assertThat("_id field should not have postings on disk", diskUsageIdField.getInvertedIndexBytes(), equalTo(0L)); + } + + /* This does not work :-( + assertCheckedResponse( + client().prepareSearch(dataStreamName).setTrackTotalHits(true), + searchResponse -> { + assertHitCount(searchResponse, docs.size() - deletedDocs.size()); + + // Verify that search response does not contain deleted docs + for (var searchHit : searchResponse.getHits()) { + assertThat(deletedDocs.contains(searchHit.getId()), equalTo(false)); + } + } + );*/ + } + + public void testGetFromTranslogBySyntheticId() throws Exception { + assumeTrue("Test should only run with feature flag", IndexSettings.TSDB_SYNTHETIC_ID_FEATURE_FLAG); + final var datastreamName = randomIdentifier(); + putDataStreamTemplate(datastreamName, 1); + + final var docs = new HashMap(); + final var unit = randomFrom(ChronoUnit.SECONDS, ChronoUnit.MINUTES); + final var timestamp = Instant.now(); + + // Index 5 docs in datastream + // + // For convenience, the metric value maps to the index in the bulk response items + var results = createDocuments( + datastreamName, + // t + 0s + document(timestamp, "vm-dev01", "cpu-load", 0), + document(timestamp, "vm-dev02", "cpu-load", 1), + // t + 1s + document(timestamp.plus(1, unit), "vm-dev01", 
"cpu-load", 2), + document(timestamp.plus(1, unit), "vm-dev02", "cpu-load", 3), + // t + 0s out-of-order doc + document(timestamp, "vm-dev03", "cpu-load", 4) ); - flushAndRefresh(indexName); + // Verify that documents are created + for (var result : results) { + assertThat(result.getResponse().getResult(), equalTo(DocWriteResponse.Result.CREATED)); + assertThat(result.getVersion(), equalTo(1L)); + docs.put(result.getId(), result.getIndex()); + } - // Check that synthetic _id field has no postings on disk - var diskUsage = diskUsage(docIndex); - var diskUsageIdField = AnalyzeIndexDiskUsageTestUtils.getPerFieldDiskUsage(diskUsage, IdFieldMapper.NAME); - assertThat("_id field should not have postings on disk", diskUsageIdField.getInvertedIndexBytes(), equalTo(0L)); + // Get by synthetic _id + // + // The documents are in memory buffers: the first GET will trigger the refresh of the internal reader + // (see InternalEngine.REAL_TIME_GET_REFRESH_SOURCE) to have an up-to-date searcher to resolve documents ids and versions. It will + // also enable the tracking of the locations of documents in the translog (see InternalEngine.trackTranslogLocation) so that next + // GETs will be resolved with the translog. 
+ var randomDocs = randomSubsetOf(randomIntBetween(1, results.length), results); + for (var doc : randomDocs) { + var getResponse = client().prepareGet(doc.getIndex(), doc.getId()).setRealtime(true).setFetchSource(true).execute().actionGet(); + assertThat(getResponse.isExists(), equalTo(true)); + assertThat(getResponse.getVersion(), equalTo(1L)); + + var source = asInstanceOf(Map.class, getResponse.getSourceAsMap().get("metric")); + assertThat(asInstanceOf(Integer.class, source.get("value")), equalTo(doc.getItemId())); + } + + int metricOffset = results.length; + + // Index 5 more docs + results = createDocuments( + datastreamName, + // t + 2s + document(timestamp.plus(2, unit), "vm-dev01", "cpu-load", metricOffset), + document(timestamp.plus(2, unit), "vm-dev02", "cpu-load", metricOffset + 1), + // t - 1s out-of-order doc + document(timestamp.minus(1, unit), "vm-dev01", "cpu-load", metricOffset + 2), + // t + 3s + document(timestamp.plus(3, unit), "vm-dev01", "cpu-load", metricOffset + 3), + document(timestamp.plus(3, unit), "vm-dev02", "cpu-load", metricOffset + 4) + ); + + // Verify that documents are created + for (var result : results) { + assertThat(result.getResponse().getResult(), equalTo(DocWriteResponse.Result.CREATED)); + assertThat(result.getVersion(), equalTo(1L)); + docs.put(result.getId(), result.getIndex()); + } + + // Get by synthetic _id + // + // Documents ids and versions are resolved using the translog. Here we exercise the get-from-translog (that uses the + // TranslogDirectoryReader) and VersionsAndSeqNoResolver.loadDocIdAndVersionUncached paths. 
+ randomDocs = randomSubsetOf(randomIntBetween(1, results.length), results); + for (var doc : randomDocs) { + var getResponse = client().prepareGet(doc.getIndex(), doc.getId()).setRealtime(true).setFetchSource(true).execute().actionGet(); + assertThat(getResponse.isExists(), equalTo(true)); + assertThat(getResponse.getVersion(), equalTo(1L)); + + var source = asInstanceOf(Map.class, getResponse.getSourceAsMap().get("metric")); + assertThat(asInstanceOf(Integer.class, source.get("value")), equalTo(metricOffset + doc.getItemId())); + } + + flushAndRefresh(datastreamName); + + // Check that synthetic _id field has no postings on disk + var indices = new HashSet<>(docs.values()); + for (var index : indices) { + var diskUsage = diskUsage(index); + var diskUsageIdField = AnalyzeIndexDiskUsageTestUtils.getPerFieldDiskUsage(diskUsage, IdFieldMapper.NAME); + assertThat("_id field should not have postings on disk", diskUsageIdField.getInvertedIndexBytes(), equalTo(0L)); + } - // TODO Search datastream and count hits + assertHitCount(client().prepareSearch(datastreamName).setSize(0), 10L); } private static XContentBuilder document(Instant timestamp, String hostName, String metricField, Integer metricValue) @@ -210,7 +355,7 @@ private static XContentBuilder document(Instant timestamp, String hostName, Stri return source; } - private static BulkItemResponse[] createDocuments(String indexName, XContentBuilder... docs) throws IOException { + private static BulkItemResponse[] createDocuments(String indexName, XContentBuilder... 
docs) { assertThat(docs, notNullValue()); final var client = client(); var bulkRequest = client.prepareBulk(); @@ -222,8 +367,8 @@ private static BulkItemResponse[] createDocuments(String indexName, XContentBuil return bulkResponse.getItems(); } - private static void putDataStreamTemplate(Random random, String indexPattern) throws IOException { - final var settings = indexSettings(1, 0).put(IndexSettings.MODE.getKey(), IndexMode.TIME_SERIES.getName()) + private static void putDataStreamTemplate(String indexPattern, int shards) throws IOException { + final var settings = indexSettings(shards, 0).put(IndexSettings.MODE.getKey(), IndexMode.TIME_SERIES.getName()) .put(IndexSettings.BLOOM_FILTER_ID_FIELD_ENABLED_SETTING.getKey(), false) .put(IndexSettings.INDEX_REFRESH_INTERVAL_SETTING.getKey(), -1) .put(IndexSettings.USE_SYNTHETIC_ID.getKey(), true); diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/IndexRouting.java b/server/src/main/java/org/elasticsearch/cluster/routing/IndexRouting.java index 12d45898bfba5..3c1a7c8fcbaed 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/IndexRouting.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/IndexRouting.java @@ -26,9 +26,11 @@ import org.elasticsearch.core.Nullable; import org.elasticsearch.features.NodeFeature; import org.elasticsearch.index.IndexMode; +import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.IndexVersion; import org.elasticsearch.index.IndexVersions; import org.elasticsearch.index.mapper.TimeSeriesRoutingHashFieldMapper; +import org.elasticsearch.index.mapper.TsidExtractingIdFieldMapper; import org.elasticsearch.transport.Transports; import org.elasticsearch.xcontent.XContentParser; import org.elasticsearch.xcontent.XContentParserConfiguration; @@ -321,6 +323,7 @@ public abstract static class ExtractFromSource extends IndexRouting { protected final XContentParserConfiguration parserConfig; private final IndexMode indexMode; private 
final boolean trackTimeSeriesRoutingHash; + private final boolean useTimeSeriesSyntheticId; private final boolean addIdWithRoutingHash; private int hash = Integer.MAX_VALUE; @@ -333,6 +336,9 @@ public abstract static class ExtractFromSource extends IndexRouting { assert indexMode != null : "Index mode must be set for ExtractFromSource routing"; this.trackTimeSeriesRoutingHash = indexMode == IndexMode.TIME_SERIES && metadata.getCreationVersion().onOrAfter(IndexVersions.TIME_SERIES_ROUTING_HASH_IN_ID); + this.useTimeSeriesSyntheticId = trackTimeSeriesRoutingHash + && metadata.getCreationVersion().onOrAfter(IndexVersions.TIME_SERIES_USE_SYNTHETIC_ID) + && IndexSettings.USE_SYNTHETIC_ID.get(metadata.getSettings()); addIdWithRoutingHash = indexMode == IndexMode.LOGSDB; this.parserConfig = XContentParserConfiguration.EMPTY.withFiltering(null, Set.copyOf(includePaths), null, true); } @@ -391,6 +397,7 @@ public int updateShard(String id, @Nullable String routing) { public int deleteShard(String id, @Nullable String routing) { checkNoRouting(routing); int shardId = idToHash(id); + System.out.println("id " + id + " routed to " + shardId); return rerouteWritesIfResharding(shardId); } @@ -417,10 +424,19 @@ private int idToHash(String id) { if (idBytes.length < 4) { throw new ResourceNotFoundException("invalid id [{}] for index [{}] in " + indexMode.getName() + " mode", id, indexName); } - // For TSDB, the hash is stored as the id prefix. - // For LogsDB with routing on sort fields, the routing hash is stored in the range[id.length - 9, id.length - 5] of the id, - // see IndexRequest#autoGenerateTimeBasedId. - return hashToShardId(ByteUtils.readIntLE(idBytes, addIdWithRoutingHash ? idBytes.length - 9 : 0)); + int hash; + if (addIdWithRoutingHash) { + // For LogsDB with routing on sort fields, the routing hash is stored in the range[id.length - 9, id.length - 5] of the id, + // see IndexRequest#autoGenerateTimeBasedId. 
+ hash = ByteUtils.readIntLE(idBytes, idBytes.length - 9); + } else if (useTimeSeriesSyntheticId) { + // For TSDB with synthetic ids, the hash is stored as the id suffix. + hash = TsidExtractingIdFieldMapper.extractRoutingHashFromSyntheticId(idBytes); + } else { + // For TSDB, the hash is stored as the id prefix. + hash = ByteUtils.readIntLE(idBytes, 0); + } + return hashToShardId(hash); } @Override @@ -510,7 +526,7 @@ public static class ForIndexDimensions extends ExtractFromSource { @Override protected int hashSource(IndexRequest indexRequest) { - // System.out.println("hashSource for tsid"); + System.out.println("hashSource for tsid"); BytesRef tsid = indexRequest.tsid(); if (tsid == null) { tsid = buildTsid(indexRequest.getContentType(), indexRequest.indexSource().bytes()); diff --git a/server/src/main/java/org/elasticsearch/common/lucene/uid/VersionsAndSeqNoResolver.java b/server/src/main/java/org/elasticsearch/common/lucene/uid/VersionsAndSeqNoResolver.java index 08a8e28457159..b5485f3cbf1f9 100644 --- a/server/src/main/java/org/elasticsearch/common/lucene/uid/VersionsAndSeqNoResolver.java +++ b/server/src/main/java/org/elasticsearch/common/lucene/uid/VersionsAndSeqNoResolver.java @@ -14,9 +14,9 @@ import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CloseableThreadLocal; -import org.elasticsearch.common.util.ByteUtils; import org.elasticsearch.common.util.concurrent.ConcurrentCollections; import org.elasticsearch.core.Assertions; +import org.elasticsearch.index.mapper.TsidExtractingIdFieldMapper; import java.io.IOException; import java.util.Base64; @@ -153,22 +153,29 @@ public static DocIdAndVersion timeSeriesLoadDocIdAndVersion(IndexReader reader, * This allows this method to know whether there is no document with the specified id without loading the docid for * the specified id. * - * @param reader The reader load docid, version and seqno from. 
- * @param uid The term that describes the uid of the document to load docid, version and seqno for. - * @param id The id that contains the encoded timestamp. The timestamp is used to skip checking the id for entire segments. - * @param loadSeqNo Whether to load sequence number from _seq_no doc values field. + * @param reader The reader to load docid, version and seqno from. + * @param uid The term that describes the uid of the document to load docid, version and seqno for. + * @param id The id that contains the encoded timestamp. The timestamp is used to skip checking the id for entire segments. + * @param loadSeqNo Whether to load sequence number from _seq_no doc values field. + * @param useSyntheticId Whether the id is a synthetic (true) or standard (false) document id. * @return the internal doc ID and version for the specified term from the specified reader or * returning null if no document was found for the specified id * @throws IOException In case of an i/o related failure */ - public static DocIdAndVersion timeSeriesLoadDocIdAndVersion(IndexReader reader, BytesRef uid, String id, boolean loadSeqNo) - throws IOException { + public static DocIdAndVersion timeSeriesLoadDocIdAndVersion( + IndexReader reader, + BytesRef uid, + String id, + boolean loadSeqNo, + boolean useSyntheticId + ) throws IOException { byte[] idAsBytes = Base64.getUrlDecoder().decode(id); - assert idAsBytes.length == 20; - // id format: [4 bytes (basic hash routing fields), 8 bytes prefix of 128 murmurhash dimension fields, 8 bytes - // @timestamp) - long timestamp = ByteUtils.readLongBE(idAsBytes, 12); - + final long timestamp; + if (useSyntheticId) { + timestamp = TsidExtractingIdFieldMapper.extractTimestampFromSyntheticId(idAsBytes); + } else { + timestamp = TsidExtractingIdFieldMapper.extractTimestampFromId(idAsBytes); + } PerThreadIDVersionAndSeqNoLookup[] lookups = getLookupState(reader, true); List leaves = reader.leaves(); // iterate in default order, the segments should be sorted 
by DataStream#TIMESERIES_LEAF_READERS_SORTER diff --git a/server/src/main/java/org/elasticsearch/index/IndexVersions.java b/server/src/main/java/org/elasticsearch/index/IndexVersions.java index e63b655e2ce8d..172bdc67e7872 100644 --- a/server/src/main/java/org/elasticsearch/index/IndexVersions.java +++ b/server/src/main/java/org/elasticsearch/index/IndexVersions.java @@ -192,6 +192,7 @@ private static Version parseUnchecked(String version) { public static final IndexVersion REENABLED_TIMESTAMP_DOC_VALUES_SPARSE_INDEX = def(9_042_0_00, Version.LUCENE_10_3_1); public static final IndexVersion SKIPPERS_ENABLED_BY_DEFAULT = def(9_043_0_00, Version.LUCENE_10_3_1); + public static final IndexVersion TIME_SERIES_USE_SYNTHETIC_ID = def(9_044_0_00, Version.LUCENE_10_3_1); /* * STOP! READ THIS FIRST! No, really, diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/TSDBSyntheticIdCodec.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/TSDBSyntheticIdCodec.java index 970664844631a..4d885fbc88e1a 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/TSDBSyntheticIdCodec.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/TSDBSyntheticIdCodec.java @@ -27,6 +27,7 @@ import static org.elasticsearch.index.codec.tsdb.TSDBSyntheticIdPostingsFormat.SYNTHETIC_ID; import static org.elasticsearch.index.codec.tsdb.TSDBSyntheticIdPostingsFormat.TIMESTAMP; import static org.elasticsearch.index.codec.tsdb.TSDBSyntheticIdPostingsFormat.TS_ID; +import static org.elasticsearch.index.codec.tsdb.TSDBSyntheticIdPostingsFormat.TS_ROUTING_HASH; /** * Special codec for time-series datastreams that use synthetic ids. 
@@ -83,6 +84,13 @@ private void ensureSyntheticIdFields(FieldInfos fieldInfos) { assert false : message; throw new IllegalArgumentException(message); } + // Ensure _ts_routing_hash exists + fi = fieldInfos.fieldInfo(TS_ROUTING_HASH); + if (fi == null) { + var message = "Field [" + TS_ROUTING_HASH + "] does not exist"; + assert false : message; + throw new IllegalArgumentException(message); + } // Ensure _id exists and not indexed fi = fieldInfos.fieldInfo(SYNTHETIC_ID); if (fi == null) { @@ -102,6 +110,49 @@ private void ensureSyntheticIdFields(FieldInfos fieldInfos) { @Override public void write(Directory directory, SegmentInfo segmentInfo, String segmentSuffix, FieldInfos fieldInfos, IOContext context) throws IOException { + + // Change the _id field index options from IndexOptions.DOCS to IndexOptions.NONE + final var infos = new FieldInfo[fieldInfos.size()]; + int i = 0; + for (FieldInfo fi : fieldInfos) { + if (SYNTHETIC_ID.equals(fi.getName())) { + final var attributes = new HashMap<>(fi.attributes()); + + // Assert that PerFieldPostingsFormat are not present or have the expected format and suffix + assert attributes.get(PerFieldPostingsFormat.PER_FIELD_FORMAT_KEY) == null + || TSDBSyntheticIdPostingsFormat.FORMAT_NAME.equals(attributes.get(PerFieldPostingsFormat.PER_FIELD_FORMAT_KEY)); + assert attributes.get(PerFieldPostingsFormat.PER_FIELD_SUFFIX_KEY) == null + || TSDBSyntheticIdPostingsFormat.SUFFIX.equals(attributes.get(PerFieldPostingsFormat.PER_FIELD_SUFFIX_KEY)); + + // Remove attributes if present + attributes.remove(PerFieldPostingsFormat.PER_FIELD_FORMAT_KEY); + attributes.remove(PerFieldPostingsFormat.PER_FIELD_SUFFIX_KEY); + + fi = new FieldInfo( + fi.getName(), + fi.getFieldNumber(), + fi.hasTermVectors(), + true, + fi.hasPayloads(), + IndexOptions.NONE, + fi.getDocValuesType(), + fi.docValuesSkipIndexType(), + fi.getDocValuesGen(), + attributes, + fi.getPointDimensionCount(), + fi.getPointIndexDimensionCount(), + fi.getPointNumBytes(), + 
fi.getVectorDimension(), + fi.getVectorEncoding(), + fi.getVectorSimilarityFunction(), + fi.isSoftDeletesField(), + fi.isParentField() + ); + } + infos[i++] = fi; + } + + fieldInfos = new FieldInfos(infos); ensureSyntheticIdFields(fieldInfos); delegate.write(directory, segmentInfo, segmentSuffix, fieldInfos, context); } diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/TSDBSyntheticIdFieldsProducer.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/TSDBSyntheticIdFieldsProducer.java index 2f624fd2d9cd0..2b70d7a9adfef 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/TSDBSyntheticIdFieldsProducer.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/TSDBSyntheticIdFieldsProducer.java @@ -12,6 +12,7 @@ import org.apache.lucene.codecs.DocValuesProducer; import org.apache.lucene.codecs.FieldsProducer; import org.apache.lucene.index.BaseTermsEnum; +import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.ImpactsEnum; import org.apache.lucene.index.PostingsEnum; @@ -28,7 +29,6 @@ import org.elasticsearch.index.mapper.Uid; import java.io.IOException; -import java.io.UncheckedIOException; import java.util.Iterator; import java.util.Objects; import java.util.Set; @@ -85,7 +85,7 @@ public Terms terms(String field) throws IOException { return new Terms() { @Override public TermsEnum iterator() { - return new FakeTermsEnum(); + return new SyntheticIdTermsEnum(); } @Override @@ -131,97 +131,377 @@ public boolean hasPayloads() { } /** - * This is a fake TermsEnum that scans all documents for find docs matching a specific _id. This implementation is only here to show - * that the synthetic _id terms is used when applying doc values updates during soft-updates. 
It is buggy and should not be used besides - * some carefully crafted integration tests, because it relies on the current _id format for TSDB indices that has limitations: - * - it is composed of a routing hash, a @timestamp and a tsid that cannot be un-hashed so all docs must be scanned to find matchings - * - it is not sorted on _id in the Lucene segments so doc values updates stop too early when applying DV updates - * - * This fake terms enumeration will be changed to support a different _id format in a short future. + * Holds all the doc values used in the {@link TermsEnum} and {@link PostingsEnum} to lookup and to build synthetic _ids, along with + * some utility methods to access doc values. + *

+ * It holds the instance of {@link DocValuesProducer} used to create the sorted doc values for _tsid, @timestamp and + * _ts_routing_hash. Because doc values can only advance, they are re-created from the {@link DocValuesProducer} when we need to + * seek backward. + *

*/ - private class FakeTermsEnum extends BaseTermsEnum { + private static class DocValuesHolder { + + private final FieldInfo tsIdFieldInfo; + private final FieldInfo timestampFieldInfo; + private final FieldInfo routingHashFieldInfo; + private final DocValuesProducer docValuesProducer; + + private SortedNumericDocValues timestampDocValues; // sorted desc. order + private SortedDocValues routingHashDocValues; // sorted asc. order + private SortedDocValues tsIdDocValues; // sorted asc. order + // Keep around the latest tsId ordinal and value + private int cachedTsIdOrd = -1; + private BytesRef cachedTsId; + + private DocValuesHolder(FieldInfos fieldInfos, DocValuesProducer docValuesProducer) { + this.tsIdFieldInfo = safeFieldInfo(fieldInfos, TSDBSyntheticIdPostingsFormat.TS_ID); + this.timestampFieldInfo = safeFieldInfo(fieldInfos, TSDBSyntheticIdPostingsFormat.TIMESTAMP); + this.routingHashFieldInfo = safeFieldInfo(fieldInfos, TSDBSyntheticIdPostingsFormat.TS_ROUTING_HASH); + this.docValuesProducer = docValuesProducer; + } - private BytesRef term = null; - private int docID = -1; + private FieldInfo safeFieldInfo(FieldInfos fieldInfos, String fieldName) { + var fi = fieldInfos.fieldInfo(fieldName); + if (fi == null) { + var message = "Field [" + fieldName + "] does not exist"; + assert false : message; + throw new IllegalArgumentException(message); + } + return fi; + } + + /** + * Returns the _tsid ordinal value for a given docID. The document ID must exist and must have a value for the field. 
+ * + * @param docID the docID + * @return the _tsid ordinal value + * @throws IOException if any I/O exception occurs + */ + private int docTsIdOrdinal(int docID) throws IOException { + if (tsIdDocValues == null || tsIdDocValues.docID() > docID) { + tsIdDocValues = docValuesProducer.getSorted(tsIdFieldInfo); + cachedTsIdOrd = -1; + cachedTsId = null; + } + boolean found = tsIdDocValues.advanceExact(docID); + assert found : "No value found for field [" + tsIdFieldInfo.getName() + " and docID " + docID; + return tsIdDocValues.ordValue(); + } + + /** + * Returns the timestamp value for a given docID. The document ID must exist and must have a value for the field. + * + * @param docID the docID + * @return the timestamp value + * @throws IOException if any I/O exception occurs + */ + private long docTimestamp(int docID) throws IOException { + if (timestampDocValues == null || timestampDocValues.docID() > docID) { + timestampDocValues = docValuesProducer.getSortedNumeric(timestampFieldInfo); + } + boolean found = timestampDocValues.advanceExact(docID); + assert found : "No value found for field [" + timestampFieldInfo.getName() + " and docID " + docID; + assert timestampDocValues.docValueCount() == 1; + return timestampDocValues.nextValue(); + } + + /** + * Returns the routing hash value for a given docID. The document ID must exist and must have a value for the field. 
+ * + * @param docID the docID + * @return the routing hash value + * @throws IOException if any I/O exception occurs + */ + private BytesRef docRoutingHash(int docID) throws IOException { + if (routingHashDocValues == null || routingHashDocValues.docID() > docID) { + routingHashDocValues = docValuesProducer.getSorted(routingHashFieldInfo); + } + boolean found = routingHashDocValues.advanceExact(docID); + assert found : "No value found for field [" + routingHashFieldInfo.getName() + " and docID " + docID; + return routingHashDocValues.lookupOrd(routingHashDocValues.ordValue()); + } + + /** + * Lookup if a given _tsid exists, returning a positive ordinal if it exists otherwise it returns -insertionPoint-1. + * + * @param tsId the _tsid to look up + * @return a positive ordinal if the _tsid exists, else returns -insertionPoint-1. + * @throws IOException if any I/O exception occurs + */ + private int lookupTsIdTerm(BytesRef tsId) throws IOException { + int compare = Integer.MAX_VALUE; + if (cachedTsId != null) { + compare = cachedTsId.compareTo(tsId); + if (compare == 0) { + return cachedTsIdOrd; + } + } + if (tsIdDocValues == null || compare > 0) { + tsIdDocValues = docValuesProducer.getSorted(tsIdFieldInfo); + cachedTsIdOrd = -1; + cachedTsId = null; + } + int ordinal = tsIdDocValues.lookupTerm(tsId); + if (0 <= ordinal) { + cachedTsIdOrd = ordinal; + cachedTsId = tsId; + } + return ordinal; + } - private BytesRef latestTsId = null; - private long latestTimestamp = -1L; + /** + * Lookup the _tsid value for the given ordinal. 
+ * + * @param tsIdOrdinal the _tsid ordinal + * @return the _tsid value + * @throws IOException if any I/O exception occurs + */ + private BytesRef lookupTsIdOrd(int tsIdOrdinal) throws IOException { + if (cachedTsIdOrd != -1 && cachedTsIdOrd == tsIdOrdinal) { + return cachedTsId; + } + if (tsIdDocValues == null || tsIdDocValues.ordValue() > tsIdOrdinal) { + tsIdDocValues = docValuesProducer.getSorted(tsIdFieldInfo); + cachedTsIdOrd = -1; + cachedTsId = null; + } + assert 0 <= tsIdOrdinal : tsIdOrdinal; + assert tsIdOrdinal < tsIdDocValues.getValueCount() : tsIdOrdinal; + var tsId = tsIdDocValues.lookupOrd(tsIdOrdinal); + if (tsId != null) { + cachedTsIdOrd = tsIdOrdinal; + cachedTsId = tsId; + } + return tsId; + } - private FakeTermsEnum() {} + /** + * Scan all documents to find the first document that has a _tsid equal or greater than the provided _tsid ordinal, returning its + * document ID. If no document is found, the method returns {@link DocIdSetIterator#NO_MORE_DOCS}. + * + * Warning: This method is very slow because it potentially scans all documents in the segment. + */ + private int slowScanToFirstDocWithTsIdOrdinalEqualOrGreaterThan(int tsIdOrd) throws IOException { + // recreate even if doc values are already on the same ordinal, to ensure the method returns the first doc + if (tsIdDocValues == null || (cachedTsIdOrd != -1 && cachedTsIdOrd >= tsIdOrd)) { // can't use tsIdDocValues.ordValue() here?? 
+ tsIdDocValues = docValuesProducer.getSorted(tsIdFieldInfo); + cachedTsIdOrd = -1; + cachedTsId = null; + } + assert 0 <= tsIdOrd : tsIdOrd; + assert tsIdOrd < tsIdDocValues.getValueCount() : tsIdOrd; + + for (int docID = 0; docID != DocIdSetIterator.NO_MORE_DOCS; docID = tsIdDocValues.nextDoc()) { + boolean found = tsIdDocValues.advanceExact(docID); + assert found : "No value found for field [" + tsIdFieldInfo.getName() + " and docID " + docID; + var ord = tsIdDocValues.ordValue(); + if (ord == tsIdOrd || tsIdOrd < ord) { + if (ord != cachedTsIdOrd) { + cachedTsId = tsIdDocValues.lookupOrd(ord); + cachedTsIdOrd = ord; + } + return docID; + } + } + cachedTsIdOrd = -1; + cachedTsId = null; + return DocIdSetIterator.NO_MORE_DOCS; + } + + /** + * Scan all documents to find the first document that has a _tsid equal to the provided _tsid ordinal, returning its + * document ID. If no document is found, the method returns {@link DocIdSetIterator#NO_MORE_DOCS}. + * + * Warning: This method is very slow because it potentially scans all documents in the segment. + */ + private int slowScanToFirstDocWithTsIdOrdinalEqualTo(int tsIdOrd) throws IOException { + // recreate even if doc values are already on the same ordinal, to ensure the method returns the first doc + if (tsIdDocValues == null || (cachedTsIdOrd != -1 && cachedTsIdOrd >= tsIdOrd)) { // can't use tsIdDocValues.ordValue() here?? 
+ tsIdDocValues = docValuesProducer.getSorted(tsIdFieldInfo); + cachedTsIdOrd = -1; + cachedTsId = null; + } + assert 0 <= tsIdOrd : tsIdOrd; + assert tsIdOrd < tsIdDocValues.getValueCount() : tsIdOrd; + + for (int docID = 0; docID != DocIdSetIterator.NO_MORE_DOCS; docID = tsIdDocValues.nextDoc()) { + boolean found = tsIdDocValues.advanceExact(docID); + assert found : "No value found for field [" + tsIdFieldInfo.getName() + " and docID " + docID; + var ord = tsIdDocValues.ordValue(); + if (ord == tsIdOrd) { + if (ord != cachedTsIdOrd) { + cachedTsId = tsIdDocValues.lookupOrd(ord); + cachedTsIdOrd = ord; + } + return docID; + } else if (tsIdOrd < ord) { + break; + } + } + cachedTsIdOrd = -1; + cachedTsId = null; + assert false : "Method must be called with an existing _tsid ordinal: " + tsIdOrd; + return DocIdSetIterator.NO_MORE_DOCS; + } + + private int getTsIdValueCount() throws IOException { + if (tsIdDocValues == null) { + tsIdDocValues = docValuesProducer.getSorted(tsIdFieldInfo); + } + return tsIdDocValues.getValueCount(); + } + } + + /** + * Represents the synthetic term the {@link TermsEnum} or {@link PostingsEnum} is positioned on. It points to a given docID and its + * corresponding _tsid, @timestamp and _ts_routing_hash values. The {@link #term()} method returns the synthetic _id of the document. + */ + private record SyntheticTerm(int docID, int tsIdOrd, BytesRef tsId, long timestamp, BytesRef routingHash) { + private BytesRef term() { + assert docID >= 0 : docID; + assert tsIdOrd >= 0 : tsIdOrd; + return syntheticId(tsId, timestamp, routingHash); + } + } + + /** + * When returned by next(), seekCeil(), nextDoc() and docID() it means there are no more synthetic terms in the {@link TermsEnum} + * or {@link PostingsEnum}. + */ + private static final SyntheticTerm NO_MORE_DOCS = new SyntheticTerm(DocIdSetIterator.NO_MORE_DOCS, -1, null, -1L, null); + + /** + * {@link TermsEnum} to iterate over documents synthetic _ids. 
+ */ + private class SyntheticIdTermsEnum extends BaseTermsEnum { + + /** + * Holds all doc values that composed the synthetic _id + */ + private final DocValuesHolder docValues; + + /** + * Current synthetic term the enum is positioned on. It points to 1 document. + */ + private SyntheticTerm current; + + private SyntheticIdTermsEnum() { + this.docValues = new DocValuesHolder(fieldInfos, docValuesProducer); + this.current = null; + } + + private void ensurePositioned() { + if (current == null || current == NO_MORE_DOCS) { + assert false; + throw new IllegalStateException("Method should not be called when unpositioned"); + } + } @Override public BytesRef next() throws IOException { - if (docID == DocIdSetIterator.NO_MORE_DOCS) { - assert term == null; + if (current == NO_MORE_DOCS) { return null; } - docID += 1; + int docID = (current != null) ? current.docID + 1 : 0; if (maxDocs <= docID) { - docID = DocIdSetIterator.NO_MORE_DOCS; - latestTimestamp = -1L; - latestTsId = null; - term = null; + current = NO_MORE_DOCS; return null; } - - // Retrieve _tsid - SortedDocValues tsIdDocValues = docValuesProducer.getSorted(fieldInfos.fieldInfo(TS_ID)); - boolean found = tsIdDocValues.advanceExact(docID); - assert found; - int tsIdOrd = tsIdDocValues.ordValue(); - BytesRef tsId = tsIdDocValues.lookupOrd(tsIdOrd); - assert tsId != null; - - // Retrieve timestamp - SortedNumericDocValues timestampDocValues = docValuesProducer.getSortedNumeric(fieldInfos.fieldInfo(TIMESTAMP)); - found = timestampDocValues.advanceExact(docID); - assert found; - assert timestampDocValues.docValueCount() == 1; - long timestamp = timestampDocValues.nextValue(); - - // Retrieve routing hash - var tsRoutingHash = fieldInfos.fieldInfo(TimeSeriesRoutingHashFieldMapper.NAME); - assert tsRoutingHash != null; - SortedDocValues routingHashDocValues = docValuesProducer.getSorted(tsRoutingHash); - found = routingHashDocValues.advanceExact(docID); - assert found; - BytesRef routingHashBytes = 
routingHashDocValues.lookupOrd(routingHashDocValues.ordValue()); - - int routingHash = TimeSeriesRoutingHashFieldMapper.decode( - Uid.decodeId(routingHashBytes.bytes, routingHashBytes.offset, routingHashBytes.length) + int tsIdOrdinal = docValues.docTsIdOrdinal(docID); + current = new SyntheticTerm( + docID, + tsIdOrdinal, + docValues.lookupTsIdOrd(tsIdOrdinal), + docValues.docTimestamp(docID), + docValues.docRoutingHash(docID) ); - term = Uid.encodeId(TsidExtractingIdFieldMapper.createId(routingHash, tsId, timestamp)); - latestTimestamp = timestamp; - latestTsId = tsId; - return term; + return current.term(); } @Override - public SeekStatus seekCeil(BytesRef id) { + public SeekStatus seekCeil(BytesRef id) throws IOException { assert id != null; - if (term != null && term.equals(id)) { - return SeekStatus.FOUND; + assert Long.BYTES + Integer.BYTES < id.length : id.length; + if (id == null || id.length <= Long.BYTES + Integer.BYTES) { + return SeekStatus.NOT_FOUND; } - try { - while (next() != null) { - if (term.equals(id)) { - return SeekStatus.FOUND; + + // Extract the _tsid + final BytesRef tsId = TsidExtractingIdFieldMapper.extractTimeSeriesIdFromSyntheticId(id.bytes); + int tsIdOrd = docValues.lookupTsIdTerm(tsId); + + // _tsid not found + if (tsIdOrd < 0) { + tsIdOrd = -tsIdOrd - 1; + // set the terms enum on the first non-matching document + if (tsIdOrd < docValues.getTsIdValueCount()) { + int docID = docValues.slowScanToFirstDocWithTsIdOrdinalEqualOrGreaterThan(tsIdOrd); + if (docID != DocIdSetIterator.NO_MORE_DOCS) { + current = new SyntheticTerm( + docID, + tsIdOrd, + docValues.lookupTsIdOrd(tsIdOrd), + docValues.docTimestamp(docID), + docValues.docRoutingHash(docID) + ); + return SeekStatus.NOT_FOUND; } } - } catch (IOException e) { - throw new UncheckedIOException(e); + // no docs/terms to iterate on + current = NO_MORE_DOCS; + return SeekStatus.END; } - return SeekStatus.END; + + // _tsid found, extract the timestamp + final long timestamp = 
TsidExtractingIdFieldMapper.extractTimestampFromSyntheticId(id.bytes); + + // Slow scan to the first document matching the _tsid + final int startDocID = docValues.slowScanToFirstDocWithTsIdOrdinalEqualTo(tsIdOrd); + assert 0 <= startDocID : startDocID; + + int docID = startDocID; + int docTsIdOrd = tsIdOrd; + long docTimestamp = -1; + + // Iterate over documents to find the first one matching the timestamp + for (; docID < maxDocs; docID++) { + docTimestamp = docValues.docTimestamp(docID); + if (startDocID < docID) { + // After the first doc, we need to check again if _tsid matches + docTsIdOrd = docValues.docTsIdOrdinal(docID); + } + if (docTsIdOrd == tsIdOrd && docTimestamp == timestamp) { + // It's a match! + current = new SyntheticTerm(docID, tsIdOrd, tsId, docTimestamp, docValues.docRoutingHash(docID)); + return SeekStatus.FOUND; + } + // Remaining docs don't match, stop here + if (tsIdOrd < docTsIdOrd || docTimestamp < timestamp) { + break; + } + } + + // set the terms enum on the first non-matching document + current = new SyntheticTerm( + docID, + docTsIdOrd, + docValues.lookupTsIdOrd(docTsIdOrd), + docTimestamp, + docValues.docRoutingHash(docID) + ); + return SeekStatus.NOT_FOUND; } @Override public BytesRef term() { - return term; + ensurePositioned(); + return current.term(); } @Override public PostingsEnum postings(PostingsEnum reuse, int flags) { - return new FakePostingsEnum(docID, latestTsId, latestTimestamp, maxDocs); + ensurePositioned(); + return new SyntheticIdPostingsEnum(current); } /** @@ -258,23 +538,19 @@ public ImpactsEnum impacts(int flags) throws IOException { } } - /** - * Do not use in production. See {@link FakeTermsEnum}. 
- */ - private class FakePostingsEnum extends PostingsEnum { + private class SyntheticIdPostingsEnum extends PostingsEnum { - private final int startDocID; - private final BytesRef latestTsId; - private final long latestTimestamp; - private final int maxDocs; - private int docID; + private final DocValuesHolder docValues; - private FakePostingsEnum(int docID, BytesRef latestTsId, long latestTimestamp, int maxDocs) { - this.startDocID = docID; - this.latestTsId = latestTsId; - this.latestTimestamp = latestTimestamp; - this.maxDocs = maxDocs; - this.docID = -1; + /** + * Current synthetic term the postings is pinned on. + */ + private final SyntheticTerm term; + private int docID = -1; + + private SyntheticIdPostingsEnum(SyntheticTerm term) { + this.docValues = new DocValuesHolder(fieldInfos, docValuesProducer); + this.term = Objects.requireNonNull(term); } @Override @@ -286,61 +562,27 @@ public int docID() { public int nextDoc() throws IOException { if (docID == DocIdSetIterator.NO_MORE_DOCS) { return docID; - } else if (docID == -1) { - docID = startDocID; - } else { - docID = docID + 1; - if (maxDocs <= docID) { - docID = DocIdSetIterator.NO_MORE_DOCS; - return docID; - } - } - - // Retrieve _tsid - SortedDocValues tsIdDocValues = docValuesProducer.getSorted(fieldInfos.fieldInfo(TS_ID)); - boolean found = tsIdDocValues.advanceExact(docID); - assert found; - int tsIdOrd = tsIdDocValues.ordValue(); - BytesRef tsId = tsIdDocValues.lookupOrd(tsIdOrd); - assert tsId != null; - - if (latestTsId != null && latestTsId.equals(tsId) == false) { - // Different _tsid, stop here - docID = DocIdSetIterator.NO_MORE_DOCS; - return docID; } - - // Retrieve timestamp - SortedNumericDocValues timestampDocValues = docValuesProducer.getSortedNumeric(fieldInfos.fieldInfo(TIMESTAMP)); - found = timestampDocValues.advanceExact(docID); - assert found; - assert timestampDocValues.docValueCount() == 1; - long timestamp = timestampDocValues.nextValue(); - - if (latestTimestamp != -1L && 
latestTimestamp != timestamp) { - // Different @timestamp, stop here - docID = DocIdSetIterator.NO_MORE_DOCS; - return docID; + int nextDocID = (docID == -1) ? term.docID() : docID + 1; + if (nextDocID < maxDocs) { + int tsIdOrd = docValues.docTsIdOrdinal(nextDocID); + if (tsIdOrd == term.tsIdOrd()) { + long timestamp = docValues.docTimestamp(nextDocID); + if (timestamp == term.timestamp()) { + assert Objects.equals(docValues.docRoutingHash(nextDocID), term.routingHash()); + assert Objects.equals(docValues.lookupTsIdOrd(tsIdOrd), term.tsId()); + docID = nextDocID; + return docID; + } + } } - - // Retrieve routing hash - var tsRoutingHash = fieldInfos.fieldInfo(TimeSeriesRoutingHashFieldMapper.NAME); - assert tsRoutingHash != null; - SortedDocValues routingHashDocValues = docValuesProducer.getSorted(tsRoutingHash); - found = routingHashDocValues.advanceExact(docID); - assert found; - BytesRef routingHashBytes = routingHashDocValues.lookupOrd(routingHashDocValues.ordValue()); - assert routingHashBytes != null; + docID = DocIdSetIterator.NO_MORE_DOCS; return docID; } @Override public int advance(int target) throws IOException { - int doc; - while ((doc = nextDoc()) < target) { - // Continue - } - return doc; + return slowAdvance(target); } @Override @@ -374,6 +616,15 @@ public BytesRef getPayload() throws IOException { } } + private static BytesRef syntheticId(BytesRef tsId, long timestamp, BytesRef routingHashBytes) { + assert tsId != null; + assert timestamp > 0L; + assert routingHashBytes != null; + String routingHashString = Uid.decodeId(routingHashBytes.bytes, routingHashBytes.offset, routingHashBytes.length); + int routingHash = TimeSeriesRoutingHashFieldMapper.decode(routingHashString); + return TsidExtractingIdFieldMapper.createSyntheticIdBytesRef(tsId, timestamp, routingHash); + } + private static boolean assertFieldInfosExist(FieldInfos fieldInfos, String... 
fieldNames) { assert fieldNames != null && fieldNames.length > 0 : "fieldNames should be > 0"; for (var fieldName : fieldNames) { diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/TSDBSyntheticIdPostingsFormat.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/TSDBSyntheticIdPostingsFormat.java index cfe9975f33a1b..66a6aa7151c6b 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/TSDBSyntheticIdPostingsFormat.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/TSDBSyntheticIdPostingsFormat.java @@ -19,6 +19,7 @@ import org.elasticsearch.index.mapper.DataStreamTimestampFieldMapper; import org.elasticsearch.index.mapper.SyntheticIdField; import org.elasticsearch.index.mapper.TimeSeriesIdFieldMapper; +import org.elasticsearch.index.mapper.TimeSeriesRoutingHashFieldMapper; import java.io.IOException; @@ -27,6 +28,7 @@ public class TSDBSyntheticIdPostingsFormat extends PostingsFormat { public static final String SYNTHETIC_ID = SyntheticIdField.NAME; public static final String TIMESTAMP = DataStreamTimestampFieldMapper.DEFAULT_PATH; public static final String TS_ID = TimeSeriesIdFieldMapper.NAME; + public static final String TS_ROUTING_HASH = TimeSeriesRoutingHashFieldMapper.NAME; static final String FORMAT_NAME = "TSDBSyntheticId"; static final String SUFFIX = "0"; diff --git a/server/src/main/java/org/elasticsearch/index/engine/InternalEngine.java b/server/src/main/java/org/elasticsearch/index/engine/InternalEngine.java index 47e9ab7803a84..7a114b762f952 100644 --- a/server/src/main/java/org/elasticsearch/index/engine/InternalEngine.java +++ b/server/src/main/java/org/elasticsearch/index/engine/InternalEngine.java @@ -1067,7 +1067,13 @@ private VersionValue resolveDocVersion(final Operation op, boolean loadSeqNo) th directoryReader -> { if (engineConfig.getIndexSettings().getMode() == IndexMode.TIME_SERIES) { assert engineConfig.getLeafSorter() == DataStream.TIMESERIES_LEAF_READERS_SORTER; - return 
VersionsAndSeqNoResolver.timeSeriesLoadDocIdAndVersion(directoryReader, op.uid(), op.id(), loadSeqNo); + return VersionsAndSeqNoResolver.timeSeriesLoadDocIdAndVersion( + directoryReader, + op.uid(), + op.id(), + loadSeqNo, + useTsdbSyntheticId + ); } else { return VersionsAndSeqNoResolver.timeSeriesLoadDocIdAndVersion(directoryReader, op.uid(), loadSeqNo); } @@ -1859,8 +1865,10 @@ private DeleteResult deleteInLucene(Delete delete, DeletionStrategy plan) throws try { final ParsedDocument tombstone = ParsedDocument.deleteTombstone( engineConfig.getIndexSettings().seqNoIndexOptions(), + engineConfig.getIndexSettings().useDocValuesSkipper(), useTsdbSyntheticId, - delete.id() + delete.id(), + delete.uid() ); assert tombstone.docs().size() == 1 : "Tombstone doc should have single doc [" + tombstone + "]"; tombstone.updateSeqID(delete.seqNo(), delete.primaryTerm()); @@ -1869,6 +1877,7 @@ private DeleteResult deleteInLucene(Delete delete, DeletionStrategy plan) throws assert doc.getField(SeqNoFieldMapper.TOMBSTONE_NAME) != null : "Delete tombstone document but _tombstone field is not set [" + doc + " ]"; doc.add(softDeletesField); + logDocumentsDetails(List.of(doc)); if (plan.addStaleOpToLucene || plan.currentlyDeleted) { indexWriter.addDocument(doc); } else { diff --git a/server/src/main/java/org/elasticsearch/index/mapper/ParsedDocument.java b/server/src/main/java/org/elasticsearch/index/mapper/ParsedDocument.java index 61b26ca33b1ef..17708a2d2ad82 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/ParsedDocument.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/ParsedDocument.java @@ -10,6 +10,9 @@ package org.elasticsearch.index.mapper; import org.apache.lucene.document.Field; +import org.apache.lucene.document.LongField; +import org.apache.lucene.document.SortedDocValuesField; +import org.apache.lucene.document.SortedNumericDocValuesField; import org.apache.lucene.document.StoredField; import org.apache.lucene.util.BytesRef; import 
org.elasticsearch.common.bytes.BytesArray; @@ -73,7 +76,7 @@ public static ParsedDocument noopTombstone(SeqNoFieldMapper.SeqNoIndexOptions se * @param id the id of the deleted document */ public static ParsedDocument deleteTombstone(SeqNoFieldMapper.SeqNoIndexOptions seqNoIndexOptions, String id) { - return deleteTombstone(seqNoIndexOptions, false, id); + return deleteTombstone(seqNoIndexOptions, false /* ignored */, false, id, null /* ignored */); } /** @@ -82,7 +85,13 @@ public static ParsedDocument deleteTombstone(SeqNoFieldMapper.SeqNoIndexOptions * @param useSyntheticId whether the id is synthetic or not * @param id the id of the deleted document */ - public static ParsedDocument deleteTombstone(SeqNoFieldMapper.SeqNoIndexOptions seqNoIndexOptions, boolean useSyntheticId, String id) { + public static ParsedDocument deleteTombstone( + SeqNoFieldMapper.SeqNoIndexOptions seqNoIndexOptions, + boolean useDocValuesSkipper, + boolean useSyntheticId, + String id, + BytesRef uid + ) { LuceneDocument document = new LuceneDocument(); SeqNoFieldMapper.SequenceIDFields seqIdFields = SeqNoFieldMapper.SequenceIDFields.tombstone(seqNoIndexOptions); seqIdFields.addFields(document); @@ -91,7 +100,21 @@ public static ParsedDocument deleteTombstone(SeqNoFieldMapper.SeqNoIndexOptions if (useSyntheticId) { // Use a synthetic _id field which is not indexed nor stored document.add(IdFieldMapper.syntheticIdField(id)); - // TODO I think we also need to add the fields that compose the synthetic _id. 
+ + var timeSeriesId = TsidExtractingIdFieldMapper.extractTimeSeriesIdFromSyntheticId(uid.bytes); + var timestamp = TsidExtractingIdFieldMapper.extractTimestampFromSyntheticId(uid.bytes); + var routingHash = TsidExtractingIdFieldMapper.extractRoutingHashBytesFromSyntheticId(uid.bytes); + + if (useDocValuesSkipper) { + document.add(SortedDocValuesField.indexedField(TimeSeriesIdFieldMapper.NAME, timeSeriesId)); + document.add(SortedNumericDocValuesField.indexedField("@timestamp", timestamp)); + } else { + document.add(new SortedDocValuesField(TimeSeriesIdFieldMapper.NAME, timeSeriesId)); + document.add(new LongField("@timestamp", timestamp, Field.Store.NO)); + } + var field = new SortedDocValuesField(TimeSeriesRoutingHashFieldMapper.NAME, routingHash); + document.add(field); + } else { // Use standard _id field (indexed and stored, some indices also trim the stored field at some point) document.add(IdFieldMapper.standardIdField(id)); diff --git a/server/src/main/java/org/elasticsearch/index/mapper/TsidExtractingIdFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/TsidExtractingIdFieldMapper.java index 1bb7001b29890..07605b0603f82 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/TsidExtractingIdFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/TsidExtractingIdFieldMapper.java @@ -67,7 +67,11 @@ public static BytesRef createField(DocumentParserContext context, RoutingHashBui || id.equals(indexRouting.createId(context.sourceToParse().getXContentType(), context.sourceToParse().source(), suffix)); } else if (context.sourceToParse().routing() != null) { int routingHash = TimeSeriesRoutingHashFieldMapper.decode(context.sourceToParse().routing()); - id = createId(routingHash, tsid, timestamp); + if (context.indexSettings().useTsdbSyntheticId()) { + id = createSyntheticId(tsid, timestamp, routingHash); + } else { + id = createId(routingHash, tsid, timestamp); + } } else { if (context.sourceToParse().id() == null) { 
throw new IllegalArgumentException( @@ -118,6 +122,13 @@ public static String createId(int routingHash, BytesRef tsid, long timestamp) { return Strings.BASE_64_NO_PADDING_URL_ENCODER.encodeToString(bytes); } + public static long extractTimestampFromId(byte[] id) { + assert id.length == 20; + // id format: [4 bytes (basic hash routing fields), 8 bytes prefix of 128 murmurhash dimension fields, 8 bytes + // @timestamp) + return ByteUtils.readLongBE(id, 12); + } + public static String createId( boolean dynamicMappersExists, RoutingHashBuilder routingBuilder, @@ -141,6 +152,49 @@ public static String createId( return id; } + public static BytesRef createSyntheticIdBytesRef(BytesRef tsid, long timestamp, int routingHash) { + // A synthetic _id is the concatenation of [_tsid (non-fixed length) + timestamp (8 bytes) + routing hash (4 bytes)]. + // We dont' use hashing here because we need to be able to extract the concatenated values from the _id in various places, like + // when applying doc values updates in Lucene, or when routing GET or DELETE requests to the corresponding shard, or when replaying + // translog operations. Since the synthetic _id is not indexed and not really stored on disk we consider it fine if it is longer + // that standard ids. 
+ byte[] bytes = new byte[tsid.length + Long.BYTES + Integer.BYTES]; + System.arraycopy(tsid.bytes, 0, bytes, 0, tsid.length); + ByteUtils.writeLongBE(timestamp, bytes, tsid.length); + ByteUtils.writeIntBE(routingHash, bytes, tsid.length + Long.BYTES); + return new BytesRef(bytes); + } + + public static String createSyntheticId(BytesRef tsid, long timestamp, int routingHash) { + BytesRef id = createSyntheticIdBytesRef(tsid, timestamp, routingHash); + return Strings.BASE_64_NO_PADDING_URL_ENCODER.encodeToString(id.bytes); + } + + public static BytesRef extractTimeSeriesIdFromSyntheticId(byte[] id) { + assert id.length > Long.BYTES + Integer.BYTES; + // See #createSyntheticId + byte[] tsId = new byte[Math.toIntExact(id.length - Long.BYTES - Integer.BYTES)]; + System.arraycopy(id, 0, tsId, 0, tsId.length); + return new BytesRef(tsId); + } + + public static long extractTimestampFromSyntheticId(byte[] id) { + assert id.length > Long.BYTES + Integer.BYTES; + // See #createSyntheticId + return ByteUtils.readLongBE(id, id.length - Long.BYTES - Integer.BYTES); + } + + public static int extractRoutingHashFromSyntheticId(byte[] id) { + assert id.length > Long.BYTES + Integer.BYTES; + // See #createSyntheticId + return ByteUtils.readIntBE(id, id.length - Integer.BYTES); + } + + public static BytesRef extractRoutingHashBytesFromSyntheticId(byte[] id) { + int hash = extractRoutingHashFromSyntheticId(id); + return Uid.encodeId(TimeSeriesRoutingHashFieldMapper.encode(hash)); + } + @Override public String documentDescription(DocumentParserContext context) { /* From 9a9df492eb837d9ed7d3eed782308fe56a846279 Mon Sep 17 00:00:00 2001 From: tlrx Date: Mon, 27 Oct 2025 18:03:36 +0100 Subject: [PATCH 02/20] fix bug --- .../datastreams/TSDBSyntheticIdsIT.java | 97 +++++++++++-------- .../cluster/routing/IndexRouting.java | 4 +- .../lucene/uid/VersionsAndSeqNoResolver.java | 5 +- .../elasticsearch/index/IndexSortConfig.java | 1 + .../tsdb/TSDBSyntheticIdFieldsProducer.java | 11 ++- 
.../index/engine/InternalEngine.java | 10 +- .../elasticsearch/index/mapper/IdLoader.java | 18 +++- .../index/mapper/ParsedDocument.java | 6 +- .../mapper/TsidExtractingIdFieldMapper.java | 14 +-- .../search/DefaultSearchContext.java | 2 +- .../index/mapper/IdLoaderTests.java | 6 +- 11 files changed, 105 insertions(+), 69 deletions(-) diff --git a/modules/data-streams/src/internalClusterTest/java/org/elasticsearch/datastreams/TSDBSyntheticIdsIT.java b/modules/data-streams/src/internalClusterTest/java/org/elasticsearch/datastreams/TSDBSyntheticIdsIT.java index 81e99d154060d..9d672b2c84ccc 100644 --- a/modules/data-streams/src/internalClusterTest/java/org/elasticsearch/datastreams/TSDBSyntheticIdsIT.java +++ b/modules/data-streams/src/internalClusterTest/java/org/elasticsearch/datastreams/TSDBSyntheticIdsIT.java @@ -22,10 +22,13 @@ import org.elasticsearch.cluster.metadata.Template; import org.elasticsearch.common.compress.CompressedXContent; import org.elasticsearch.common.time.DateFormatter; +import org.elasticsearch.common.util.set.Sets; import org.elasticsearch.index.IndexMode; import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.mapper.IdFieldMapper; +import org.elasticsearch.index.query.TermQueryBuilder; import org.elasticsearch.plugins.Plugin; +import org.elasticsearch.search.builder.SearchSourceBuilder; import org.elasticsearch.test.ESIntegTestCase; import org.elasticsearch.test.InternalSettingsPlugin; import org.elasticsearch.test.junit.annotations.TestLogging; @@ -45,8 +48,10 @@ import static org.elasticsearch.common.time.FormatNames.STRICT_DATE_OPTIONAL_TIME; import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked; +import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertCheckedResponse; import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertHitCount; import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertNoFailures; +import static 
org.hamcrest.Matchers.arrayWithSize; import static org.hamcrest.Matchers.containsString; import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.notNullValue; @@ -101,7 +106,7 @@ public void testInvalidIndexMode() { public void testSyntheticId() throws Exception { assumeTrue("Test should only run with feature flag", IndexSettings.TSDB_SYNTHETIC_ID_FEATURE_FLAG); final var dataStreamName = randomIdentifier(); - putDataStreamTemplate(dataStreamName, randomIntBetween(1, 3)); + putDataStreamTemplate(dataStreamName, 1); final var docs = new HashMap(); final var unit = randomFrom(ChronoUnit.SECONDS, ChronoUnit.MINUTES); @@ -160,7 +165,7 @@ enum Operation { var randomDocs = randomSubsetOf(randomIntBetween(0, results.length), results); for (var doc : randomDocs) { boolean fetchSource = randomBoolean(); - var getResponse = client().prepareGet(doc.getIndex(), doc.getId()).setFetchSource(fetchSource).execute().actionGet(); + var getResponse = client().prepareGet(doc.getIndex(), doc.getId()).setFetchSource(fetchSource).get(); assertThat(getResponse.isExists(), equalTo(true)); assertThat(getResponse.getVersion(), equalTo(1L)); @@ -170,20 +175,6 @@ enum Operation { } } - // Update by synthetic _id - // - // Note: it doesn't work, is that expected? Is is blocked by IndexRouting.ExtractFromSource.updateShard - var updateDocId = randomFrom(docs.keySet()); - var updateDocIndex = docs.get(updateDocId); - var exception = expectThrows(IllegalArgumentException.class, () -> { - var doc = document(timestamp, "vm-dev01", "cpu-load", 10); // update - client().prepareUpdate(updateDocIndex, updateDocId).setDoc(doc).get(); - }); - assertThat( - exception.getMessage(), - containsString("update is not supported because the destination index [" + updateDocIndex + "] is in time_series mode") - ); - // Random flush or refresh or nothing, so that the next DELETEs are executed on flushed segments or in memory segments. 
switch (randomFrom(Operation.values())) { case FLUSH: @@ -199,45 +190,73 @@ enum Operation { // Delete by synthetic _id var deletedDocs = randomSubsetOf(randomIntBetween(1, docs.size()), docs.keySet()); - for (var deletedDocId : deletedDocs) { - var deletedDocIndex = docs.get(deletedDocId); + for (var docId : deletedDocs) { + var deletedDocIndex = docs.get(docId); + assertThat(deletedDocIndex, notNullValue()); // Delete - var deleteResponse = client().prepareDelete(deletedDocIndex, deletedDocId).get(); - assertThat(deleteResponse.getId(), equalTo(deletedDocId)); + var deleteResponse = client().prepareDelete(deletedDocIndex, docId).get(); + assertThat(deleteResponse.getId(), equalTo(docId)); assertThat(deleteResponse.getIndex(), equalTo(deletedDocIndex)); assertThat(deleteResponse.getResult(), equalTo(DocWriteResponse.Result.DELETED)); assertThat(deleteResponse.getVersion(), equalTo(2L)); - - // Get returns "not found" - var getResponse = client().prepareGet(deletedDocIndex, deletedDocId).get(); - assertThat(getResponse.getId(), equalTo(deletedDocId)); - assertThat(getResponse.getIndex(), equalTo(deletedDocIndex)); - assertThat(getResponse.isExists(), equalTo(false)); } - flushAndRefresh(dataStreamName); + refresh(dataStreamName); - // Check that synthetic _id field have no postings on disk - var indices = new HashSet<>(docs.values()); - for (var index : indices) { - var diskUsage = diskUsage(index); - var diskUsageIdField = AnalyzeIndexDiskUsageTestUtils.getPerFieldDiskUsage(diskUsage, IdFieldMapper.NAME); - assertThat("_id field should not have postings on disk", diskUsageIdField.getInvertedIndexBytes(), equalTo(0L)); - } - - /* This does not work :-( assertCheckedResponse( - client().prepareSearch(dataStreamName).setTrackTotalHits(true), + client().prepareSearch(dataStreamName).setTrackTotalHits(true).setSize(100), searchResponse -> { assertHitCount(searchResponse, docs.size() - deletedDocs.size()); // Verify that search response does not contain deleted docs for 
(var searchHit : searchResponse.getHits()) { - assertThat(deletedDocs.contains(searchHit.getId()), equalTo(false)); + assertThat( + "Document with id [" + searchHit.getId() + "] is deleted", + deletedDocs.contains(searchHit.getId()), + equalTo(false) + ); } } - );*/ + ); + + // Search by synthetic _id + var otherDocs = randomSubsetOf(Sets.difference(docs.keySet(), Sets.newHashSet(deletedDocs))); + for (var docId : otherDocs) { + assertCheckedResponse( + client().prepareSearch(docs.get(docId)) + .setSource(new SearchSourceBuilder().query(new TermQueryBuilder(IdFieldMapper.NAME, docId))), + searchResponse -> { + assertHitCount(searchResponse, 1L); + assertThat(searchResponse.getHits().getHits(), arrayWithSize(1)); + assertThat(searchResponse.getHits().getHits()[0].getId(), equalTo(docId)); + } + ); + } + + // Update by synthetic _id + // + // Note: it doesn't work, is that expected? It is blocked by IndexRouting.ExtractFromSource.updateShard + var updateDocId = randomFrom(docs.keySet()); + var updateDocIndex = docs.get(updateDocId); + var exception = expectThrows(IllegalArgumentException.class, () -> { + var doc = document(timestamp, "vm-dev01", "cpu-load", 10); // update + client().prepareUpdate(updateDocIndex, updateDocId).setDoc(doc).get(); + }); + assertThat( + exception.getMessage(), + containsString("update is not supported because the destination index [" + updateDocIndex + "] is in time_series mode") + ); + + flush(dataStreamName); + + // Check that synthetic _id field have no postings on disk + var indices = new HashSet<>(docs.values()); + for (var index : indices) { + var diskUsage = diskUsage(index); + var diskUsageIdField = AnalyzeIndexDiskUsageTestUtils.getPerFieldDiskUsage(diskUsage, IdFieldMapper.NAME); + assertThat("_id field should not have postings on disk", diskUsageIdField.getInvertedIndexBytes(), equalTo(0L)); + } } public void testGetFromTranslogBySyntheticId() throws Exception { diff --git 
a/server/src/main/java/org/elasticsearch/cluster/routing/IndexRouting.java b/server/src/main/java/org/elasticsearch/cluster/routing/IndexRouting.java index 3c1a7c8fcbaed..9eccb663a5362 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/IndexRouting.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/IndexRouting.java @@ -397,7 +397,6 @@ public int updateShard(String id, @Nullable String routing) { public int deleteShard(String id, @Nullable String routing) { checkNoRouting(routing); int shardId = idToHash(id); - System.out.println("id " + id + " routed to " + shardId); return rerouteWritesIfResharding(shardId); } @@ -431,7 +430,7 @@ private int idToHash(String id) { hash = ByteUtils.readIntLE(idBytes, idBytes.length - 9); } else if (useTimeSeriesSyntheticId) { // For TSDB with synthetic ids, the hash is stored as the id suffix. - hash = TsidExtractingIdFieldMapper.extractRoutingHashFromSyntheticId(idBytes); + hash = TsidExtractingIdFieldMapper.extractRoutingHashFromSyntheticId(new BytesRef(idBytes)); } else { // For TSDB, the hash is stored as the id prefix. 
hash = ByteUtils.readIntLE(idBytes, 0); @@ -526,7 +525,6 @@ public static class ForIndexDimensions extends ExtractFromSource { @Override protected int hashSource(IndexRequest indexRequest) { - System.out.println("hashSource for tsid"); BytesRef tsid = indexRequest.tsid(); if (tsid == null) { tsid = buildTsid(indexRequest.getContentType(), indexRequest.indexSource().bytes()); diff --git a/server/src/main/java/org/elasticsearch/common/lucene/uid/VersionsAndSeqNoResolver.java b/server/src/main/java/org/elasticsearch/common/lucene/uid/VersionsAndSeqNoResolver.java index b5485f3cbf1f9..5307d5d933421 100644 --- a/server/src/main/java/org/elasticsearch/common/lucene/uid/VersionsAndSeqNoResolver.java +++ b/server/src/main/java/org/elasticsearch/common/lucene/uid/VersionsAndSeqNoResolver.java @@ -169,11 +169,12 @@ public static DocIdAndVersion timeSeriesLoadDocIdAndVersion( boolean loadSeqNo, boolean useSyntheticId ) throws IOException { - byte[] idAsBytes = Base64.getUrlDecoder().decode(id); final long timestamp; if (useSyntheticId) { - timestamp = TsidExtractingIdFieldMapper.extractTimestampFromSyntheticId(idAsBytes); + assert uid.equals(new BytesRef(Base64.getUrlDecoder().decode(id))); + timestamp = TsidExtractingIdFieldMapper.extractTimestampFromSyntheticId(uid); } else { + byte[] idAsBytes = Base64.getUrlDecoder().decode(id); timestamp = TsidExtractingIdFieldMapper.extractTimestampFromId(idAsBytes); } PerThreadIDVersionAndSeqNoLookup[] lookups = getLookupState(reader, true); diff --git a/server/src/main/java/org/elasticsearch/index/IndexSortConfig.java b/server/src/main/java/org/elasticsearch/index/IndexSortConfig.java index 8235b94b4c3a9..ac2a6652f824d 100644 --- a/server/src/main/java/org/elasticsearch/index/IndexSortConfig.java +++ b/server/src/main/java/org/elasticsearch/index/IndexSortConfig.java @@ -22,6 +22,7 @@ import org.elasticsearch.index.mapper.DataStreamTimestampFieldMapper; import org.elasticsearch.index.mapper.MappedFieldType; import 
org.elasticsearch.index.mapper.TimeSeriesIdFieldMapper; +import org.elasticsearch.index.mapper.TimeSeriesRoutingHashFieldMapper; import org.elasticsearch.search.MultiValueMode; import org.elasticsearch.search.lookup.SearchLookup; import org.elasticsearch.search.sort.SortOrder; diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/TSDBSyntheticIdFieldsProducer.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/TSDBSyntheticIdFieldsProducer.java index 2b70d7a9adfef..8cfb917e3cdff 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/TSDBSyntheticIdFieldsProducer.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/TSDBSyntheticIdFieldsProducer.java @@ -427,7 +427,7 @@ public SeekStatus seekCeil(BytesRef id) throws IOException { } // Extract the _tsid - final BytesRef tsId = TsidExtractingIdFieldMapper.extractTimeSeriesIdFromSyntheticId(id.bytes); + final BytesRef tsId = TsidExtractingIdFieldMapper.extractTimeSeriesIdFromSyntheticId(id); int tsIdOrd = docValues.lookupTsIdTerm(tsId); // _tsid not found @@ -453,7 +453,9 @@ public SeekStatus seekCeil(BytesRef id) throws IOException { } // _tsid found, extract the timestamp - final long timestamp = TsidExtractingIdFieldMapper.extractTimestampFromSyntheticId(id.bytes); + final long timestamp = TsidExtractingIdFieldMapper.extractTimestampFromSyntheticId(id); + + Ici on doit chercher après le dernier doc. 
// Slow scan to the first document matching the _tsid final int startDocID = docValues.slowScanToFirstDocWithTsIdOrdinalEqualTo(tsIdOrd); @@ -481,6 +483,11 @@ public SeekStatus seekCeil(BytesRef id) throws IOException { } } + if (docID == maxDocs -1) { + current = NO_MORE_DOCS; + return SeekStatus.END; + } + // set the terms enum on the first non-matching document current = new SyntheticTerm( docID, diff --git a/server/src/main/java/org/elasticsearch/index/engine/InternalEngine.java b/server/src/main/java/org/elasticsearch/index/engine/InternalEngine.java index 7a114b762f952..a97cfca88e253 100644 --- a/server/src/main/java/org/elasticsearch/index/engine/InternalEngine.java +++ b/server/src/main/java/org/elasticsearch/index/engine/InternalEngine.java @@ -1448,7 +1448,7 @@ private IndexResult indexIntoLucene(Index index, IndexingStrategy plan) throws I index.parsedDoc().updateSeqID(index.seqNo(), index.primaryTerm()); index.parsedDoc().version().setLongValue(plan.versionForIndexing); try { - logDocumentsDetails(index.docs()); + logDocumentsDetails(index.docs(), index.id(), index.uid()); if (plan.addStaleOpToLucene) { addStaleDocs(index.docs(), indexWriter); } else if (plan.useLuceneUpdateDocument) { @@ -1484,10 +1484,10 @@ && treatDocumentFailureAsTragicError(index) == false) { } } - private void logDocumentsDetails(List docs) { + private void logDocumentsDetails(List docs, String id, BytesRef uid) { if (useTsdbSyntheticId && logger.isTraceEnabled()) { for (var doc : docs) { - logger.trace("indexing document fields [{}]", doc.getFields()); + logger.trace("indexing document [id: {}, uid: {}]:\n{}\r\n", id, uid, doc.getFields()); } } } @@ -1877,7 +1877,7 @@ private DeleteResult deleteInLucene(Delete delete, DeletionStrategy plan) throws assert doc.getField(SeqNoFieldMapper.TOMBSTONE_NAME) != null : "Delete tombstone document but _tombstone field is not set [" + doc + " ]"; doc.add(softDeletesField); - logDocumentsDetails(List.of(doc)); + 
logDocumentsDetails(List.of(doc), delete.id(), delete.uid()); if (plan.addStaleOpToLucene || plan.currentlyDeleted) { indexWriter.addDocument(doc); } else { @@ -2815,7 +2815,7 @@ private IndexWriterConfig getIndexWriterConfig() { new SoftDeletesRetentionMergePolicy( Lucene.SOFT_DELETES_FIELD, () -> softDeletesPolicy.getRetentionQuery(engineConfig.getIndexSettings().seqNoIndexOptions()), - new PrunePostingsMergePolicy(mergePolicy, IdFieldMapper.NAME) + useTsdbSyntheticId ? mergePolicy : new PrunePostingsMergePolicy(mergePolicy, IdFieldMapper.NAME) ) ); if (SHUFFLE_FORCE_MERGE) { diff --git a/server/src/main/java/org/elasticsearch/index/mapper/IdLoader.java b/server/src/main/java/org/elasticsearch/index/mapper/IdLoader.java index 9ceae1c750733..30407f8b4645f 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/IdLoader.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/IdLoader.java @@ -38,8 +38,12 @@ static IdLoader fromLeafStoredFieldLoader() { /** * @return returns an {@link IdLoader} instance that syn synthesizes _id from routing, _tsid and @timestamp fields. 
*/ - static IdLoader createTsIdLoader(IndexRouting.ExtractFromSource.ForRoutingPath indexRouting, List routingPaths) { - return new TsIdLoader(indexRouting, routingPaths); + static IdLoader createTsIdLoader( + IndexRouting.ExtractFromSource.ForRoutingPath indexRouting, + List routingPaths, + boolean useSyntheticId + ) { + return new TsIdLoader(indexRouting, routingPaths, useSyntheticId); } Leaf leaf(LeafStoredFieldLoader loader, LeafReader reader, int[] docIdsInLeaf) throws IOException; @@ -61,10 +65,12 @@ final class TsIdLoader implements IdLoader { private final IndexRouting.ExtractFromSource.ForRoutingPath indexRouting; private final List routingPaths; + private final boolean useSyntheticId; - TsIdLoader(IndexRouting.ExtractFromSource.ForRoutingPath indexRouting, List routingPaths) { + TsIdLoader(IndexRouting.ExtractFromSource.ForRoutingPath indexRouting, List routingPaths, boolean useSyntheticId) { this.routingPaths = routingPaths; this.indexRouting = indexRouting; + this.useSyntheticId = useSyntheticId; } public IdLoader.Leaf leaf(LeafStoredFieldLoader loader, LeafReader reader, int[] docIdsInLeaf) throws IOException { @@ -119,7 +125,11 @@ public IdLoader.Leaf leaf(LeafStoredFieldLoader loader, LeafReader reader, int[] int routingHash = TimeSeriesRoutingHashFieldMapper.decode( Uid.decodeId(routingHashBytes.bytes, routingHashBytes.offset, routingHashBytes.length) ); - ids[i] = TsidExtractingIdFieldMapper.createId(routingHash, tsid, timestamp); + if (useSyntheticId) { + ids[i] = TsidExtractingIdFieldMapper.createSyntheticId(tsid, timestamp, routingHash); + } else { + ids[i] = TsidExtractingIdFieldMapper.createId(routingHash, tsid, timestamp); + } } } return new TsIdLeaf(docIdsInLeaf, ids); diff --git a/server/src/main/java/org/elasticsearch/index/mapper/ParsedDocument.java b/server/src/main/java/org/elasticsearch/index/mapper/ParsedDocument.java index 17708a2d2ad82..ef56ad180e32f 100644 --- 
a/server/src/main/java/org/elasticsearch/index/mapper/ParsedDocument.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/ParsedDocument.java @@ -101,9 +101,9 @@ public static ParsedDocument deleteTombstone( // Use a synthetic _id field which is not indexed nor stored document.add(IdFieldMapper.syntheticIdField(id)); - var timeSeriesId = TsidExtractingIdFieldMapper.extractTimeSeriesIdFromSyntheticId(uid.bytes); - var timestamp = TsidExtractingIdFieldMapper.extractTimestampFromSyntheticId(uid.bytes); - var routingHash = TsidExtractingIdFieldMapper.extractRoutingHashBytesFromSyntheticId(uid.bytes); + var timeSeriesId = TsidExtractingIdFieldMapper.extractTimeSeriesIdFromSyntheticId(uid); + var timestamp = TsidExtractingIdFieldMapper.extractTimestampFromSyntheticId(uid); + var routingHash = TsidExtractingIdFieldMapper.extractRoutingHashBytesFromSyntheticId(uid); if (useDocValuesSkipper) { document.add(SortedDocValuesField.indexedField(TimeSeriesIdFieldMapper.NAME, timeSeriesId)); diff --git a/server/src/main/java/org/elasticsearch/index/mapper/TsidExtractingIdFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/TsidExtractingIdFieldMapper.java index 07605b0603f82..dba74d69fa8fb 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/TsidExtractingIdFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/TsidExtractingIdFieldMapper.java @@ -170,27 +170,27 @@ public static String createSyntheticId(BytesRef tsid, long timestamp, int routin return Strings.BASE_64_NO_PADDING_URL_ENCODER.encodeToString(id.bytes); } - public static BytesRef extractTimeSeriesIdFromSyntheticId(byte[] id) { + public static BytesRef extractTimeSeriesIdFromSyntheticId(BytesRef id) { assert id.length > Long.BYTES + Integer.BYTES; // See #createSyntheticId byte[] tsId = new byte[Math.toIntExact(id.length - Long.BYTES - Integer.BYTES)]; - System.arraycopy(id, 0, tsId, 0, tsId.length); + System.arraycopy(id.bytes, id.offset, tsId, 0, 
tsId.length); return new BytesRef(tsId); } - public static long extractTimestampFromSyntheticId(byte[] id) { + public static long extractTimestampFromSyntheticId(BytesRef id) { assert id.length > Long.BYTES + Integer.BYTES; // See #createSyntheticId - return ByteUtils.readLongBE(id, id.length - Long.BYTES - Integer.BYTES); + return ByteUtils.readLongBE(id.bytes, id.offset + id.length - Long.BYTES - Integer.BYTES); } - public static int extractRoutingHashFromSyntheticId(byte[] id) { + public static int extractRoutingHashFromSyntheticId(BytesRef id) { assert id.length > Long.BYTES + Integer.BYTES; // See #createSyntheticId - return ByteUtils.readIntBE(id, id.length - Integer.BYTES); + return ByteUtils.readIntBE(id.bytes, id.offset + id.length - Integer.BYTES); } - public static BytesRef extractRoutingHashBytesFromSyntheticId(byte[] id) { + public static BytesRef extractRoutingHashBytesFromSyntheticId(BytesRef id) { int hash = extractRoutingHashFromSyntheticId(id); return Uid.encodeId(TimeSeriesRoutingHashFieldMapper.encode(hash)); } diff --git a/server/src/main/java/org/elasticsearch/search/DefaultSearchContext.java b/server/src/main/java/org/elasticsearch/search/DefaultSearchContext.java index 475d2d1887563..bf95bc3ccf69f 100644 --- a/server/src/main/java/org/elasticsearch/search/DefaultSearchContext.java +++ b/server/src/main/java/org/elasticsearch/search/DefaultSearchContext.java @@ -971,7 +971,7 @@ public IdLoader newIdLoader() { } } } - return IdLoader.createTsIdLoader(indexRouting, routingPaths); + return IdLoader.createTsIdLoader(indexRouting, routingPaths, indexService.getIndexSettings().useTsdbSyntheticId()); } else { return IdLoader.fromLeafStoredFieldLoader(); } diff --git a/server/src/test/java/org/elasticsearch/index/mapper/IdLoaderTests.java b/server/src/test/java/org/elasticsearch/index/mapper/IdLoaderTests.java index 083efccceec16..d75a013eead28 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/IdLoaderTests.java +++ 
b/server/src/test/java/org/elasticsearch/index/mapper/IdLoaderTests.java @@ -46,7 +46,7 @@ public class IdLoaderTests extends ESTestCase { private final int routingHash = randomInt(); public void testSynthesizeIdSimple() throws Exception { - var idLoader = IdLoader.createTsIdLoader(null, null); + var idLoader = IdLoader.createTsIdLoader(null, null, indexService.getIndexSettings().useTsdbSyntheticId()); long startTime = DateFieldMapper.DEFAULT_DATE_TIME_FORMATTER.parseMillis("2023-01-01T00:00:00Z"); List docs = List.of( @@ -68,7 +68,7 @@ public void testSynthesizeIdSimple() throws Exception { } public void testSynthesizeIdMultipleSegments() throws Exception { - var idLoader = IdLoader.createTsIdLoader(null, null); + var idLoader = IdLoader.createTsIdLoader(null, null, indexService.getIndexSettings().useTsdbSyntheticId()); long startTime = DateFieldMapper.DEFAULT_DATE_TIME_FORMATTER.parseMillis("2023-01-01T00:00:00Z"); List docs1 = List.of( @@ -138,7 +138,7 @@ public void testSynthesizeIdMultipleSegments() throws Exception { } public void testSynthesizeIdRandom() throws Exception { - var idLoader = IdLoader.createTsIdLoader(null, null); + var idLoader = IdLoader.createTsIdLoader(null, null, indexService.getIndexSettings().useTsdbSyntheticId()); long startTime = DateFieldMapper.DEFAULT_DATE_TIME_FORMATTER.parseMillis("2023-01-01T00:00:00Z"); Set expectedIDs = new HashSet<>(); From f6234c34e19358d9afde1fe45927ba483958f782 Mon Sep 17 00:00:00 2001 From: tlrx Date: Tue, 28 Oct 2025 10:16:14 +0100 Subject: [PATCH 03/20] fix remaining bug --- .../datastreams/TSDBSyntheticIdsIT.java | 14 +++++----- .../tsdb/TSDBSyntheticIdFieldsProducer.java | 27 +++++++------------ 2 files changed, 16 insertions(+), 25 deletions(-) diff --git a/modules/data-streams/src/internalClusterTest/java/org/elasticsearch/datastreams/TSDBSyntheticIdsIT.java b/modules/data-streams/src/internalClusterTest/java/org/elasticsearch/datastreams/TSDBSyntheticIdsIT.java index 9d672b2c84ccc..1d62e2efe52b5 
100644 --- a/modules/data-streams/src/internalClusterTest/java/org/elasticsearch/datastreams/TSDBSyntheticIdsIT.java +++ b/modules/data-streams/src/internalClusterTest/java/org/elasticsearch/datastreams/TSDBSyntheticIdsIT.java @@ -261,8 +261,8 @@ enum Operation { public void testGetFromTranslogBySyntheticId() throws Exception { assumeTrue("Test should only run with feature flag", IndexSettings.TSDB_SYNTHETIC_ID_FEATURE_FLAG); - final var datastreamName = randomIdentifier(); - putDataStreamTemplate(datastreamName, 1); + final var dataStreamName = randomIdentifier(); + putDataStreamTemplate(dataStreamName, 1); final var docs = new HashMap(); final var unit = randomFrom(ChronoUnit.SECONDS, ChronoUnit.MINUTES); @@ -272,7 +272,7 @@ public void testGetFromTranslogBySyntheticId() throws Exception { // // For convenience, the metric value maps the index in the bulk response items var results = createDocuments( - datastreamName, + dataStreamName, // t + 0s document(timestamp, "vm-dev01", "cpu-load", 0), document(timestamp, "vm-dev02", "cpu-load", 1), @@ -295,7 +295,7 @@ public void testGetFromTranslogBySyntheticId() throws Exception { // The documents are in memory buffers: the first GET will trigger the refresh of the internal reader // (see InternalEngine.REAL_TIME_GET_REFRESH_SOURCE) to have an up-to-date searcher to resolve documents ids and versions. It will // also enable the tracking of the locations of documents in the translog (see InternalEngine.trackTranslogLocation) so that next - // GETs will be resolved with the translog. + // GETs will be resolved using the translog. 
var randomDocs = randomSubsetOf(randomIntBetween(1, results.length), results); for (var doc : randomDocs) { var getResponse = client().prepareGet(doc.getIndex(), doc.getId()).setRealtime(true).setFetchSource(true).execute().actionGet(); @@ -310,7 +310,7 @@ public void testGetFromTranslogBySyntheticId() throws Exception { // Index 5 more docs results = createDocuments( - datastreamName, + dataStreamName, // t + 2s document(timestamp.plus(2, unit), "vm-dev01", "cpu-load", metricOffset), document(timestamp.plus(2, unit), "vm-dev02", "cpu-load", metricOffset + 1), @@ -342,7 +342,7 @@ public void testGetFromTranslogBySyntheticId() throws Exception { assertThat(asInstanceOf(Integer.class, source.get("value")), equalTo(metricOffset + doc.getItemId())); } - flushAndRefresh(datastreamName); + flushAndRefresh(dataStreamName); // Check that synthetic _id field have no postings on disk var indices = new HashSet<>(docs.values()); @@ -352,7 +352,7 @@ public void testGetFromTranslogBySyntheticId() throws Exception { assertThat("_id field should not have postings on disk", diskUsageIdField.getInvertedIndexBytes(), equalTo(0L)); } - assertHitCount(client().prepareSearch(datastreamName).setSize(0), 10L); + assertHitCount(client().prepareSearch(dataStreamName).setSize(0), 10L); } private static XContentBuilder document(Instant timestamp, String hostName, String metricField, Integer metricValue) diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/TSDBSyntheticIdFieldsProducer.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/TSDBSyntheticIdFieldsProducer.java index 8cfb917e3cdff..29540f61de1bb 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/TSDBSyntheticIdFieldsProducer.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/TSDBSyntheticIdFieldsProducer.java @@ -37,6 +37,10 @@ import static org.elasticsearch.index.codec.tsdb.TSDBSyntheticIdPostingsFormat.TIMESTAMP; import static 
org.elasticsearch.index.codec.tsdb.TSDBSyntheticIdPostingsFormat.TS_ID; +/** + * Produces synthetic _id terms that are computed at runtime from the doc values of other fields like _tsid, @timestamp and + * _ts_routing_hash. + */ public class TSDBSyntheticIdFieldsProducer extends FieldsProducer { private static final Set FIELDS_NAMES = Set.of(SYNTHETIC_ID); @@ -402,6 +406,7 @@ public BytesRef next() throws IOException { if (current == NO_MORE_DOCS) { return null; } + int docID = (current != null) ? current.docID + 1 : 0; if (maxDocs <= docID) { current = NO_MORE_DOCS; @@ -420,6 +425,7 @@ public BytesRef next() throws IOException { @Override public SeekStatus seekCeil(BytesRef id) throws IOException { + assert id != null; assert Long.BYTES + Integer.BYTES < id.length : id.length; if (id == null || id.length <= Long.BYTES + Integer.BYTES) { @@ -455,15 +461,13 @@ public SeekStatus seekCeil(BytesRef id) throws IOException { // _tsid found, extract the timestamp final long timestamp = TsidExtractingIdFieldMapper.extractTimestampFromSyntheticId(id); - Ici on doit chercher après le dernier doc. 
- // Slow scan to the first document matching the _tsid final int startDocID = docValues.slowScanToFirstDocWithTsIdOrdinalEqualTo(tsIdOrd); assert 0 <= startDocID : startDocID; int docID = startDocID; int docTsIdOrd = tsIdOrd; - long docTimestamp = -1; + long docTimestamp; // Iterate over documents to find the first one matching the timestamp for (; docID < maxDocs; docID++) { @@ -482,21 +486,8 @@ public SeekStatus seekCeil(BytesRef id) throws IOException { break; } } - - if (docID == maxDocs -1) { - current = NO_MORE_DOCS; - return SeekStatus.END; - } - - // set the terms enum on the first non-matching document - current = new SyntheticTerm( - docID, - docTsIdOrd, - docValues.lookupTsIdOrd(docTsIdOrd), - docTimestamp, - docValues.docRoutingHash(docID) - ); - return SeekStatus.NOT_FOUND; + current = NO_MORE_DOCS; + return SeekStatus.END; } @Override From 51d66a312726c4db8cc2d799d54fe658d6d65407 Mon Sep 17 00:00:00 2001 From: tlrx Date: Tue, 28 Oct 2025 16:35:56 +0100 Subject: [PATCH 04/20] fix sorting --- .../datastreams/TSDBSyntheticIdsIT.java | 29 ++++++------- .../org/elasticsearch/index/IndexMode.java | 42 +++++++++++-------- .../elasticsearch/index/IndexSortConfig.java | 13 +++++- .../tsdb/TSDBSyntheticIdFieldsProducer.java | 4 +- .../mapper/TsidExtractingIdFieldMapper.java | 8 +++- 5 files changed, 56 insertions(+), 40 deletions(-) diff --git a/modules/data-streams/src/internalClusterTest/java/org/elasticsearch/datastreams/TSDBSyntheticIdsIT.java b/modules/data-streams/src/internalClusterTest/java/org/elasticsearch/datastreams/TSDBSyntheticIdsIT.java index 1d62e2efe52b5..ca9b3c9a33ef3 100644 --- a/modules/data-streams/src/internalClusterTest/java/org/elasticsearch/datastreams/TSDBSyntheticIdsIT.java +++ b/modules/data-streams/src/internalClusterTest/java/org/elasticsearch/datastreams/TSDBSyntheticIdsIT.java @@ -31,7 +31,6 @@ import org.elasticsearch.search.builder.SearchSourceBuilder; import org.elasticsearch.test.ESIntegTestCase; import 
org.elasticsearch.test.InternalSettingsPlugin; -import org.elasticsearch.test.junit.annotations.TestLogging; import org.elasticsearch.xcontent.XContentBuilder; import org.elasticsearch.xcontent.XContentFactory; @@ -102,11 +101,10 @@ public void testInvalidIndexMode() { ); } - @TestLogging(reason = "debug", value = "org.elasticsearch.index.engine.Engine:TRACE") public void testSyntheticId() throws Exception { assumeTrue("Test should only run with feature flag", IndexSettings.TSDB_SYNTHETIC_ID_FEATURE_FLAG); final var dataStreamName = randomIdentifier(); - putDataStreamTemplate(dataStreamName, 1); + putDataStreamTemplate(dataStreamName, randomIntBetween(1, 5)); final var docs = new HashMap(); final var unit = randomFrom(ChronoUnit.SECONDS, ChronoUnit.MINUTES); @@ -204,21 +202,18 @@ enum Operation { refresh(dataStreamName); - assertCheckedResponse( - client().prepareSearch(dataStreamName).setTrackTotalHits(true).setSize(100), - searchResponse -> { - assertHitCount(searchResponse, docs.size() - deletedDocs.size()); - - // Verify that search response does not contain deleted docs - for (var searchHit : searchResponse.getHits()) { - assertThat( - "Document with id [" + searchHit.getId() + "] is deleted", - deletedDocs.contains(searchHit.getId()), - equalTo(false) - ); - } + assertCheckedResponse(client().prepareSearch(dataStreamName).setTrackTotalHits(true).setSize(100), searchResponse -> { + assertHitCount(searchResponse, docs.size() - deletedDocs.size()); + + // Verify that search response does not contain deleted docs + for (var searchHit : searchResponse.getHits()) { + assertThat( + "Document with id [" + searchHit.getId() + "] is deleted", + deletedDocs.contains(searchHit.getId()), + equalTo(false) + ); } - ); + }); // Search by synthetic _id var otherDocs = randomSubsetOf(Sets.difference(docs.keySet(), Sets.newHashSet(deletedDocs))); diff --git a/server/src/main/java/org/elasticsearch/index/IndexMode.java 
b/server/src/main/java/org/elasticsearch/index/IndexMode.java index 10e604126f934..726610efb927d 100644 --- a/server/src/main/java/org/elasticsearch/index/IndexMode.java +++ b/server/src/main/java/org/elasticsearch/index/IndexMode.java @@ -42,15 +42,13 @@ import java.io.IOException; import java.time.Instant; import java.util.Arrays; +import java.util.HashSet; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Objects; import java.util.function.BooleanSupplier; import java.util.stream.Collectors; -import java.util.stream.Stream; - -import static java.util.stream.Collectors.toSet; /** * "Mode" that controls which behaviors and settings an index supports. @@ -142,6 +140,14 @@ void validateWithOtherSettings(Map, Object> settings) { } var settingsWithIndexMode = Settings.builder().put(IndexSettings.MODE.getKey(), getName()).build(); + if (IndexSettings.TSDB_SYNTHETIC_ID_FEATURE_FLAG) { + settingsWithIndexMode = Settings.builder() + .put(IndexSettings.MODE.getKey(), getName()) + .put(IndexSettings.USE_SYNTHETIC_ID.getKey(), "true") + .build(); + } else { + settingsWithIndexMode = Settings.builder().put(IndexSettings.MODE.getKey(), getName()).build(); + } for (Setting unsupported : TIME_SERIES_UNSUPPORTED) { if (false == Objects.equals(unsupported.getDefault(settingsWithIndexMode), settings.get(unsupported))) { @@ -460,20 +466,22 @@ private static CompressedXContent createDefaultMapping(boolean includeHostName) IndexSortConfig.INDEX_SORT_MISSING_SETTING ); - static final List> VALIDATE_WITH_SETTINGS = List.copyOf( - Stream.concat( - Stream.of( - IndexMetadata.INDEX_NUMBER_OF_SHARDS_SETTING, - IndexMetadata.INDEX_ROUTING_PARTITION_SIZE_SETTING, - IndexMetadata.INDEX_ROUTING_PATH, - IndexMetadata.INDEX_DIMENSIONS, - IndexSettings.LOGSDB_ROUTE_ON_SORT_FIELDS, - IndexSettings.TIME_SERIES_START_TIME, - IndexSettings.TIME_SERIES_END_TIME - ), - TIME_SERIES_UNSUPPORTED.stream() - ).collect(toSet()) - ); + static final List> 
VALIDATE_WITH_SETTINGS; + static { + var settings = new HashSet>(); + settings.add(IndexMetadata.INDEX_NUMBER_OF_SHARDS_SETTING); + settings.add(IndexMetadata.INDEX_ROUTING_PARTITION_SIZE_SETTING); + settings.add(IndexMetadata.INDEX_ROUTING_PATH); + settings.add(IndexMetadata.INDEX_DIMENSIONS); + settings.add(IndexSettings.LOGSDB_ROUTE_ON_SORT_FIELDS); + settings.add(IndexSettings.TIME_SERIES_START_TIME); + settings.add(IndexSettings.TIME_SERIES_END_TIME); + if (IndexSettings.TSDB_SYNTHETIC_ID_FEATURE_FLAG) { + settings.add(IndexSettings.USE_SYNTHETIC_ID); + } + settings.addAll(TIME_SERIES_UNSUPPORTED); + VALIDATE_WITH_SETTINGS = List.copyOf(settings); + } private final String name; diff --git a/server/src/main/java/org/elasticsearch/index/IndexSortConfig.java b/server/src/main/java/org/elasticsearch/index/IndexSortConfig.java index ac2a6652f824d..69c171fd842c0 100644 --- a/server/src/main/java/org/elasticsearch/index/IndexSortConfig.java +++ b/server/src/main/java/org/elasticsearch/index/IndexSortConfig.java @@ -22,7 +22,6 @@ import org.elasticsearch.index.mapper.DataStreamTimestampFieldMapper; import org.elasticsearch.index.mapper.MappedFieldType; import org.elasticsearch.index.mapper.TimeSeriesIdFieldMapper; -import org.elasticsearch.index.mapper.TimeSeriesRoutingHashFieldMapper; import org.elasticsearch.search.MultiValueMode; import org.elasticsearch.search.lookup.SearchLookup; import org.elasticsearch.search.sort.SortOrder; @@ -107,12 +106,16 @@ public final class IndexSortConfig { ); public static class IndexSortConfigDefaults { - public static final FieldSortSpec[] TIME_SERIES_SORT, TIMESTAMP_SORT, HOSTNAME_TIMESTAMP_SORT, HOSTNAME_TIMESTAMP_BWC_SORT; + public static final FieldSortSpec[] TIME_SERIES_SORT, TIME_SERIES_WITH_SYNTHETIC_ID_SORT, TIMESTAMP_SORT, HOSTNAME_TIMESTAMP_SORT, + HOSTNAME_TIMESTAMP_BWC_SORT; static { FieldSortSpec timeStampSpec = new FieldSortSpec(DataStreamTimestampFieldMapper.DEFAULT_PATH); timeStampSpec.order = SortOrder.DESC; 
TIME_SERIES_SORT = new FieldSortSpec[] { new FieldSortSpec(TimeSeriesIdFieldMapper.NAME), timeStampSpec }; + TIME_SERIES_WITH_SYNTHETIC_ID_SORT = new FieldSortSpec[] { + new FieldSortSpec(TimeSeriesIdFieldMapper.NAME), + new FieldSortSpec(DataStreamTimestampFieldMapper.DEFAULT_PATH) }; TIMESTAMP_SORT = new FieldSortSpec[] { timeStampSpec }; FieldSortSpec hostnameSpec = new FieldSortSpec(IndexMode.HOST_NAME); @@ -141,6 +144,12 @@ public static FieldSortSpec[] getDefaultSortSpecs(Settings settings) { } if (IndexMode.TIME_SERIES.getName().equals(indexMode)) { + if (IndexSettings.TSDB_SYNTHETIC_ID_FEATURE_FLAG) { + var s = settings.get(IndexSettings.USE_SYNTHETIC_ID.getKey()); + if (s != null) { + return TIME_SERIES_WITH_SYNTHETIC_ID_SORT; + } + } return TIME_SERIES_SORT; } else if (IndexMode.LOGSDB.getName().equals(indexMode)) { var version = IndexMetadata.SETTING_INDEX_VERSION_CREATED.get(settings); diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/TSDBSyntheticIdFieldsProducer.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/TSDBSyntheticIdFieldsProducer.java index 29540f61de1bb..71cd99c00440f 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/TSDBSyntheticIdFieldsProducer.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/TSDBSyntheticIdFieldsProducer.java @@ -150,7 +150,7 @@ private static class DocValuesHolder { private final FieldInfo routingHashFieldInfo; private final DocValuesProducer docValuesProducer; - private SortedNumericDocValues timestampDocValues; // sorted desc. order + private SortedNumericDocValues timestampDocValues; // sorted asc. order private SortedDocValues routingHashDocValues; // sorted asc. order private SortedDocValues tsIdDocValues; // sorted asc. 
order // Keep around the latest tsId ordinal and value @@ -482,7 +482,7 @@ public SeekStatus seekCeil(BytesRef id) throws IOException { return SeekStatus.FOUND; } // Remaining docs don't match, stop here - if (tsIdOrd < docTsIdOrd || docTimestamp < timestamp) { + if (tsIdOrd < docTsIdOrd || timestamp < docTimestamp) { break; } } diff --git a/server/src/main/java/org/elasticsearch/index/mapper/TsidExtractingIdFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/TsidExtractingIdFieldMapper.java index dba74d69fa8fb..64d834ef3a7bf 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/TsidExtractingIdFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/TsidExtractingIdFieldMapper.java @@ -158,9 +158,13 @@ public static BytesRef createSyntheticIdBytesRef(BytesRef tsid, long timestamp, // when applying doc values updates in Lucene, or when routing GET or DELETE requests to the corresponding shard, or when replaying // translog operations. Since the synthetic _id is not indexed and not really stored on disk we consider it fine if it is longer // that standard ids. + // + // Also, when applying doc values updates Lucene expects _id to be sorted: it stops applying updates for a term "_id:ABC" if it + // seeks to a term "BCD" as it knows there won't be more documents matching "_id:ABC" past the term "BCD". So it is important to + // generate an _id that reflects the ordering of the terms it is synthesized from, ie _tsid and @timestamp. 
byte[] bytes = new byte[tsid.length + Long.BYTES + Integer.BYTES]; - System.arraycopy(tsid.bytes, 0, bytes, 0, tsid.length); - ByteUtils.writeLongBE(timestamp, bytes, tsid.length); + System.arraycopy(tsid.bytes, tsid.offset, bytes, 0, tsid.length); + ByteUtils.writeLongBE(timestamp, bytes, tsid.length); // Big Endian as we want to most significant byte first ByteUtils.writeIntBE(routingHash, bytes, tsid.length + Long.BYTES); return new BytesRef(bytes); } From 8cfa2fab051d5e7604cb55ef68cf9a0340a855ff Mon Sep 17 00:00:00 2001 From: tlrx Date: Tue, 28 Oct 2025 18:00:37 +0100 Subject: [PATCH 05/20] fix compiling and tests --- .../common/lucene/uid/VersionLookupTests.java | 2 +- .../common/lucene/uid/VersionsTests.java | 22 +++++++++--- .../index/mapper/IdLoaderTests.java | 35 +++++++++++-------- .../index/shard/IndexShardTests.java | 2 +- 4 files changed, 40 insertions(+), 21 deletions(-) diff --git a/server/src/test/java/org/elasticsearch/common/lucene/uid/VersionLookupTests.java b/server/src/test/java/org/elasticsearch/common/lucene/uid/VersionLookupTests.java index 2e69987f29180..59c8a2028d98a 100644 --- a/server/src/test/java/org/elasticsearch/common/lucene/uid/VersionLookupTests.java +++ b/server/src/test/java/org/elasticsearch/common/lucene/uid/VersionLookupTests.java @@ -159,7 +159,7 @@ public void testLoadTimestampRangeWithDeleteTombstone() throws Exception { Directory dir = newDirectory(); IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(Lucene.STANDARD_ANALYZER).setMergePolicy(NoMergePolicy.INSTANCE)); var randomSeqNoIndexOptions = randomFrom(SeqNoFieldMapper.SeqNoIndexOptions.values()); - writer.addDocument(ParsedDocument.deleteTombstone(randomSeqNoIndexOptions, false, "_id").docs().get(0)); + writer.addDocument(ParsedDocument.deleteTombstone(randomSeqNoIndexOptions, "_id").docs().get(0)); DirectoryReader reader = DirectoryReader.open(writer); LeafReaderContext segment = reader.leaves().get(0); PerThreadIDVersionAndSeqNoLookup lookup = new 
PerThreadIDVersionAndSeqNoLookup(segment.reader(), true); diff --git a/server/src/test/java/org/elasticsearch/common/lucene/uid/VersionsTests.java b/server/src/test/java/org/elasticsearch/common/lucene/uid/VersionsTests.java index 59c82195b7fce..141b865a8395b 100644 --- a/server/src/test/java/org/elasticsearch/common/lucene/uid/VersionsTests.java +++ b/server/src/test/java/org/elasticsearch/common/lucene/uid/VersionsTests.java @@ -200,7 +200,7 @@ public void testTimeSeriesLoadDocIdAndVersion() throws Exception { DirectoryReader directoryReader = ElasticsearchDirectoryReader.wrap(DirectoryReader.open(writer), new ShardId("foo", "_na_", 1)); String id = createTSDBId(1000L); assertThat( - VersionsAndSeqNoResolver.timeSeriesLoadDocIdAndVersion(directoryReader, new BytesRef("1"), id, randomBoolean()), + VersionsAndSeqNoResolver.timeSeriesLoadDocIdAndVersion(directoryReader, new BytesRef("1"), id, randomBoolean(), false), nullValue() ); @@ -222,11 +222,23 @@ public void testTimeSeriesLoadDocIdAndVersion() throws Exception { directoryReader = reopen(directoryReader); id = createTSDBId(randomLongBetween(1000, 10000)); - assertThat(VersionsAndSeqNoResolver.timeSeriesLoadDocIdAndVersion(directoryReader, new BytesRef("1"), id, true), notNullValue()); - assertThat(VersionsAndSeqNoResolver.timeSeriesLoadDocIdAndVersion(directoryReader, new BytesRef("2"), id, true), notNullValue()); + assertThat( + VersionsAndSeqNoResolver.timeSeriesLoadDocIdAndVersion(directoryReader, new BytesRef("1"), id, true, false), + notNullValue() + ); + assertThat( + VersionsAndSeqNoResolver.timeSeriesLoadDocIdAndVersion(directoryReader, new BytesRef("2"), id, true, false), + notNullValue() + ); id = createTSDBId(randomBoolean() ? 
randomLongBetween(0, 999) : randomLongBetween(10001, Long.MAX_VALUE)); - assertThat(VersionsAndSeqNoResolver.timeSeriesLoadDocIdAndVersion(directoryReader, new BytesRef("1"), id, true), nullValue()); - assertThat(VersionsAndSeqNoResolver.timeSeriesLoadDocIdAndVersion(directoryReader, new BytesRef("2"), id, true), nullValue()); + assertThat( + VersionsAndSeqNoResolver.timeSeriesLoadDocIdAndVersion(directoryReader, new BytesRef("1"), id, true, false), + nullValue() + ); + assertThat( + VersionsAndSeqNoResolver.timeSeriesLoadDocIdAndVersion(directoryReader, new BytesRef("2"), id, true, false), + nullValue() + ); directoryReader.close(); writer.close(); diff --git a/server/src/test/java/org/elasticsearch/index/mapper/IdLoaderTests.java b/server/src/test/java/org/elasticsearch/index/mapper/IdLoaderTests.java index d75a013eead28..bf595392108c3 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/IdLoaderTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/IdLoaderTests.java @@ -46,7 +46,8 @@ public class IdLoaderTests extends ESTestCase { private final int routingHash = randomInt(); public void testSynthesizeIdSimple() throws Exception { - var idLoader = IdLoader.createTsIdLoader(null, null, indexService.getIndexSettings().useTsdbSyntheticId()); + final boolean useSyntheticIds = randomBoolean(); + var idLoader = IdLoader.createTsIdLoader(null, null, useSyntheticIds); long startTime = DateFieldMapper.DEFAULT_DATE_TIME_FORMATTER.parseMillis("2023-01-01T00:00:00Z"); List docs = List.of( @@ -60,15 +61,16 @@ public void testSynthesizeIdSimple() throws Exception { assertThat(leafReader.numDocs(), equalTo(3)); var leaf = idLoader.leaf(null, leafReader, new int[] { 0, 1, 2 }); // NOTE: time series data is ordered by (tsid, timestamp) - assertThat(leaf.getId(0), equalTo(expectedId(docs.get(2), routingHash))); - assertThat(leaf.getId(1), equalTo(expectedId(docs.get(0), routingHash))); - assertThat(leaf.getId(2), equalTo(expectedId(docs.get(1), 
routingHash))); + assertThat(leaf.getId(0), equalTo(expectedId(docs.get(2), routingHash, useSyntheticIds))); + assertThat(leaf.getId(1), equalTo(expectedId(docs.get(0), routingHash, useSyntheticIds))); + assertThat(leaf.getId(2), equalTo(expectedId(docs.get(1), routingHash, useSyntheticIds))); }; prepareIndexReader(indexAndForceMerge(docs, routingHash), verify, false); } public void testSynthesizeIdMultipleSegments() throws Exception { - var idLoader = IdLoader.createTsIdLoader(null, null, indexService.getIndexSettings().useTsdbSyntheticId()); + final boolean useSyntheticIds = randomBoolean(); + var idLoader = IdLoader.createTsIdLoader(null, null, useSyntheticIds); long startTime = DateFieldMapper.DEFAULT_DATE_TIME_FORMATTER.parseMillis("2023-01-01T00:00:00Z"); List docs1 = List.of( @@ -110,22 +112,22 @@ public void testSynthesizeIdMultipleSegments() throws Exception { assertThat(leafReader.numDocs(), equalTo(docs1.size())); var leaf = idLoader.leaf(null, leafReader, IntStream.range(0, docs1.size()).toArray()); for (int i = 0; i < docs1.size(); i++) { - assertThat(leaf.getId(i), equalTo(expectedId(docs1.get(i), routingHash))); + assertThat(leaf.getId(i), equalTo(expectedId(docs1.get(i), routingHash, useSyntheticIds))); } } { LeafReader leafReader = indexReader.leaves().get(1).reader(); assertThat(leafReader.numDocs(), equalTo(docs2.size())); var leaf = idLoader.leaf(null, leafReader, new int[] { 0, 3 }); - assertThat(leaf.getId(0), equalTo(expectedId(docs2.get(0), routingHash))); - assertThat(leaf.getId(3), equalTo(expectedId(docs2.get(3), routingHash))); + assertThat(leaf.getId(0), equalTo(expectedId(docs2.get(0), routingHash, useSyntheticIds))); + assertThat(leaf.getId(3), equalTo(expectedId(docs2.get(3), routingHash, useSyntheticIds))); } { LeafReader leafReader = indexReader.leaves().get(2).reader(); assertThat(leafReader.numDocs(), equalTo(docs3.size())); var leaf = idLoader.leaf(null, leafReader, new int[] { 1, 2 }); - assertThat(leaf.getId(1), 
equalTo(expectedId(docs3.get(1), routingHash))); - assertThat(leaf.getId(2), equalTo(expectedId(docs3.get(2), routingHash))); + assertThat(leaf.getId(1), equalTo(expectedId(docs3.get(1), routingHash, useSyntheticIds))); + assertThat(leaf.getId(2), equalTo(expectedId(docs3.get(2), routingHash, useSyntheticIds))); } { LeafReader leafReader = indexReader.leaves().get(2).reader(); @@ -138,7 +140,8 @@ public void testSynthesizeIdMultipleSegments() throws Exception { } public void testSynthesizeIdRandom() throws Exception { - var idLoader = IdLoader.createTsIdLoader(null, null, indexService.getIndexSettings().useTsdbSyntheticId()); + final boolean useSyntheticIds = randomBoolean(); + var idLoader = IdLoader.createTsIdLoader(null, null, useSyntheticIds); long startTime = DateFieldMapper.DEFAULT_DATE_TIME_FORMATTER.parseMillis("2023-01-01T00:00:00Z"); Set expectedIDs = new HashSet<>(); @@ -161,7 +164,7 @@ public void testSynthesizeIdRandom() throws Exception { for (int j = 0; j < numberOfSamples; j++) { Doc doc = new Doc(startTime++, dimensions); randomDocs.add(doc); - expectedIDs.add(expectedId(doc, routingHash)); + expectedIDs.add(expectedId(doc, routingHash, useSyntheticIds)); } } CheckedConsumer verify = indexReader -> { @@ -240,7 +243,7 @@ private static void indexDoc(IndexWriter iw, Doc doc, int routingHash) throws IO iw.addDocument(fields); } - private static String expectedId(Doc doc, int routingHash) throws IOException { + private static String expectedId(Doc doc, int routingHash, boolean useSyntheticIds) { var routingFields = new RoutingPathFields(null); for (Dimension dimension : doc.dimensions) { if (dimension.value instanceof Number n) { @@ -249,7 +252,11 @@ private static String expectedId(Doc doc, int routingHash) throws IOException { routingFields.addString(dimension.field, dimension.value.toString()); } } - return TsidExtractingIdFieldMapper.createId(routingHash, routingFields.buildHash().toBytesRef(), doc.timestamp); + if (useSyntheticIds) { + return 
TsidExtractingIdFieldMapper.createSyntheticId(routingFields.buildHash().toBytesRef(), doc.timestamp, routingHash); + } else { + return TsidExtractingIdFieldMapper.createId(routingHash, routingFields.buildHash().toBytesRef(), doc.timestamp); + } } record Doc(long timestamp, List dimensions) {} diff --git a/server/src/test/java/org/elasticsearch/index/shard/IndexShardTests.java b/server/src/test/java/org/elasticsearch/index/shard/IndexShardTests.java index 94d3548c23793..e792a0f48553b 100644 --- a/server/src/test/java/org/elasticsearch/index/shard/IndexShardTests.java +++ b/server/src/test/java/org/elasticsearch/index/shard/IndexShardTests.java @@ -4568,7 +4568,7 @@ public void testOnCloseStats() throws IOException { public void testSupplyTombstoneDoc() throws Exception { IndexShard shard = newStartedShard(); String id = randomRealisticUnicodeOfLengthBetween(1, 10); - ParsedDocument deleteTombstone = ParsedDocument.deleteTombstone(shard.indexSettings.seqNoIndexOptions(), randomBoolean(), id); + ParsedDocument deleteTombstone = ParsedDocument.deleteTombstone(shard.indexSettings.seqNoIndexOptions(), id); assertThat(deleteTombstone.docs(), hasSize(1)); LuceneDocument deleteDoc = deleteTombstone.docs().get(0); assertThat( From d885dda33aceb4f18cbab5c0ab1c3e10682b8c0a Mon Sep 17 00:00:00 2001 From: tlrx Date: Wed, 29 Oct 2025 14:00:20 +0100 Subject: [PATCH 06/20] fix sort config --- .../main/java/org/elasticsearch/index/IndexSortConfig.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/IndexSortConfig.java b/server/src/main/java/org/elasticsearch/index/IndexSortConfig.java index 69c171fd842c0..e52ed3088e671 100644 --- a/server/src/main/java/org/elasticsearch/index/IndexSortConfig.java +++ b/server/src/main/java/org/elasticsearch/index/IndexSortConfig.java @@ -145,8 +145,8 @@ public static FieldSortSpec[] getDefaultSortSpecs(Settings settings) { if (IndexMode.TIME_SERIES.getName().equals(indexMode)) { if 
(IndexSettings.TSDB_SYNTHETIC_ID_FEATURE_FLAG) { - var s = settings.get(IndexSettings.USE_SYNTHETIC_ID.getKey()); - if (s != null) { + var useSyntheticId = settings.get(IndexSettings.USE_SYNTHETIC_ID.getKey()); + if (useSyntheticId != null && useSyntheticId.equalsIgnoreCase(Boolean.TRUE.toString())) { return TIME_SERIES_WITH_SYNTHETIC_ID_SORT; } } From b22f59c52695c4c2745456b60c3863da1b0f05f0 Mon Sep 17 00:00:00 2001 From: tlrx Date: Wed, 29 Oct 2025 14:46:42 +0100 Subject: [PATCH 07/20] fix sort config --- server/src/main/java/org/elasticsearch/index/IndexMode.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/IndexMode.java b/server/src/main/java/org/elasticsearch/index/IndexMode.java index 726610efb927d..b3df1bfd8ea5a 100644 --- a/server/src/main/java/org/elasticsearch/index/IndexMode.java +++ b/server/src/main/java/org/elasticsearch/index/IndexMode.java @@ -139,11 +139,12 @@ void validateWithOtherSettings(Map, Object> settings) { throw new IllegalArgumentException(error(IndexMetadata.INDEX_ROUTING_PARTITION_SIZE_SETTING)); } - var settingsWithIndexMode = Settings.builder().put(IndexSettings.MODE.getKey(), getName()).build(); + Settings settingsWithIndexMode; if (IndexSettings.TSDB_SYNTHETIC_ID_FEATURE_FLAG) { settingsWithIndexMode = Settings.builder() .put(IndexSettings.MODE.getKey(), getName()) - .put(IndexSettings.USE_SYNTHETIC_ID.getKey(), "true") + // Default values of some index sort settings depend of the feature flag and USE_SYNTHETIC_ID setting + .put(IndexSettings.USE_SYNTHETIC_ID.getKey(), (Boolean) settings.get(IndexSettings.USE_SYNTHETIC_ID)) .build(); } else { settingsWithIndexMode = Settings.builder().put(IndexSettings.MODE.getKey(), getName()).build(); From b50b64ccd2f21cfe54d4ad4cb944e1c9588a6148 Mon Sep 17 00:00:00 2001 From: tlrx Date: Thu, 30 Oct 2025 12:49:26 +0100 Subject: [PATCH 08/20] fix merge --- .../main/java/org/elasticsearch/index/IndexSortConfig.java | 6 ++++++ 1 
file changed, 6 insertions(+) diff --git a/server/src/main/java/org/elasticsearch/index/IndexSortConfig.java b/server/src/main/java/org/elasticsearch/index/IndexSortConfig.java index f8962c648d336..d3248c76b14db 100644 --- a/server/src/main/java/org/elasticsearch/index/IndexSortConfig.java +++ b/server/src/main/java/org/elasticsearch/index/IndexSortConfig.java @@ -147,6 +147,12 @@ public static FieldSortSpec[] getDefaultSortSpecs(Settings settings) { } if (IndexMode.TIME_SERIES.getName().equals(indexMode)) { + if (IndexSettings.TSDB_SYNTHETIC_ID_FEATURE_FLAG) { + var useSyntheticId = settings.get(IndexSettings.USE_SYNTHETIC_ID.getKey()); + if (useSyntheticId != null && useSyntheticId.equalsIgnoreCase(Boolean.TRUE.toString())) { + return TIME_SERIES_WITH_SYNTHETIC_ID_SORT; + } + } return TIME_SERIES_SORT; } else if (IndexMode.LOGSDB.getName().equals(indexMode)) { var version = IndexMetadata.SETTING_INDEX_VERSION_CREATED.get(settings); From 933e280c0e78eed612e23f389c0d745312c6ec43 Mon Sep 17 00:00:00 2001 From: tlrx Date: Thu, 30 Oct 2025 14:51:34 +0100 Subject: [PATCH 09/20] compute useTimeSeriesSyntheticId in metadata --- .../cluster/metadata/IndexMetadata.java | 39 +++++++++++++--- .../cluster/routing/IndexRouting.java | 5 +-- .../elasticsearch/index/IndexSettings.java | 44 ++++++++++++++++--- .../index/codec/CodecService.java | 2 +- .../index/engine/InternalEngine.java | 2 +- .../mapper/TsidExtractingIdFieldMapper.java | 4 +- .../search/DefaultSearchContext.java | 2 +- 7 files changed, 76 insertions(+), 22 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/metadata/IndexMetadata.java b/server/src/main/java/org/elasticsearch/cluster/metadata/IndexMetadata.java index 38d0fada7d866..6fee4b39dbe22 100644 --- a/server/src/main/java/org/elasticsearch/cluster/metadata/IndexMetadata.java +++ b/server/src/main/java/org/elasticsearch/cluster/metadata/IndexMetadata.java @@ -705,6 +705,8 @@ public Iterator> settings() { @Nullable private final 
IndexReshardingMetadata reshardingMetadata; + private final boolean useTimeSeriesSyntheticId; + private IndexMetadata( final Index index, final long version, @@ -754,7 +756,8 @@ private IndexMetadata( @Nullable final IndexMetadataStats stats, @Nullable final Double writeLoadForecast, @Nullable Long shardSizeInBytesForecast, - @Nullable IndexReshardingMetadata reshardingMetadata + @Nullable IndexReshardingMetadata reshardingMetadata, + final boolean useTimeSeriesSyntheticId ) { this.index = index; this.version = version; @@ -815,6 +818,7 @@ private IndexMetadata( this.shardSizeInBytesForecast = shardSizeInBytesForecast; assert numberOfShards * routingFactor == routingNumShards : routingNumShards + " must be a multiple of " + numberOfShards; this.reshardingMetadata = reshardingMetadata; + this.useTimeSeriesSyntheticId = useTimeSeriesSyntheticId; } IndexMetadata withMappingMetadata(MappingMetadata mapping) { @@ -870,7 +874,8 @@ IndexMetadata withMappingMetadata(MappingMetadata mapping) { this.stats, this.writeLoadForecast, this.shardSizeInBytesForecast, - this.reshardingMetadata + this.reshardingMetadata, + this.useTimeSeriesSyntheticId ); } @@ -933,7 +938,8 @@ public IndexMetadata withInSyncAllocationIds(int shardId, Set inSyncSet) this.stats, this.writeLoadForecast, this.shardSizeInBytesForecast, - this.reshardingMetadata + this.reshardingMetadata, + this.useTimeSeriesSyntheticId ); } @@ -1004,7 +1010,8 @@ public IndexMetadata withSetPrimaryTerm(int shardId, long primaryTerm) { this.stats, this.writeLoadForecast, this.shardSizeInBytesForecast, - this.reshardingMetadata + this.reshardingMetadata, + this.useTimeSeriesSyntheticId ); } @@ -1066,7 +1073,8 @@ public IndexMetadata withTimestampRanges(IndexLongFieldRange timestampRange, Ind this.stats, this.writeLoadForecast, this.shardSizeInBytesForecast, - this.reshardingMetadata + this.reshardingMetadata, + this.useTimeSeriesSyntheticId ); } @@ -1123,7 +1131,8 @@ public IndexMetadata withIncrementedVersion() { 
this.stats, this.writeLoadForecast, this.shardSizeInBytesForecast, - this.reshardingMetadata + this.reshardingMetadata, + this.useTimeSeriesSyntheticId ); } @@ -1314,6 +1323,13 @@ public Instant getTimeSeriesEnd() { return timeSeriesEnd; } + /** + * @return whether the index is a time-series index that uses synthetic ids or not. + */ + public boolean useTimeSeriesSyntheticId() { + return useTimeSeriesSyntheticId; + } + /** * Return the concrete mapping for this index or {@code null} if this index has no mappings at all. */ @@ -2497,6 +2513,14 @@ IndexMetadata build(boolean repair) { String indexModeString = settings.get(IndexSettings.MODE.getKey()); final IndexMode indexMode = indexModeString != null ? IndexMode.fromString(indexModeString.toLowerCase(Locale.ROOT)) : null; final boolean isTsdb = indexMode == IndexMode.TIME_SERIES; + boolean useTimeSeriesSyntheticId = false; + if (isTsdb && indexCreatedVersion.onOrAfter(IndexVersions.TIME_SERIES_USE_SYNTHETIC_ID)) { + var setting = settings.get(IndexSettings.USE_SYNTHETIC_ID.getKey()); + if (setting != null && setting.equalsIgnoreCase(Boolean.TRUE.toString())) { + assert IndexSettings.TSDB_SYNTHETIC_ID_FEATURE_FLAG; + useTimeSeriesSyntheticId = true; + } + } return new IndexMetadata( new Index(index, uuid), version, @@ -2546,7 +2570,8 @@ IndexMetadata build(boolean repair) { stats, indexWriteLoadForecast, shardSizeInBytesForecast, - reshardingMetadata + reshardingMetadata, + useTimeSeriesSyntheticId ); } diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/IndexRouting.java b/server/src/main/java/org/elasticsearch/cluster/routing/IndexRouting.java index 9eccb663a5362..0cc8167727978 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/IndexRouting.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/IndexRouting.java @@ -26,7 +26,6 @@ import org.elasticsearch.core.Nullable; import org.elasticsearch.features.NodeFeature; import org.elasticsearch.index.IndexMode; -import 
org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.IndexVersion; import org.elasticsearch.index.IndexVersions; import org.elasticsearch.index.mapper.TimeSeriesRoutingHashFieldMapper; @@ -336,9 +335,7 @@ public abstract static class ExtractFromSource extends IndexRouting { assert indexMode != null : "Index mode must be set for ExtractFromSource routing"; this.trackTimeSeriesRoutingHash = indexMode == IndexMode.TIME_SERIES && metadata.getCreationVersion().onOrAfter(IndexVersions.TIME_SERIES_ROUTING_HASH_IN_ID); - this.useTimeSeriesSyntheticId = trackTimeSeriesRoutingHash - && metadata.getCreationVersion().onOrAfter(IndexVersions.TIME_SERIES_USE_SYNTHETIC_ID) - && IndexSettings.USE_SYNTHETIC_ID.get(metadata.getSettings()); + this.useTimeSeriesSyntheticId = metadata.useTimeSeriesSyntheticId(); addIdWithRoutingHash = indexMode == IndexMode.LOGSDB; this.parserConfig = XContentParserConfiguration.EMPTY.withFiltering(null, Set.copyOf(includePaths), null, true); } diff --git a/server/src/main/java/org/elasticsearch/index/IndexSettings.java b/server/src/main/java/org/elasticsearch/index/IndexSettings.java index 26f747d2d2315..f7af6bf098761 100644 --- a/server/src/main/java/org/elasticsearch/index/IndexSettings.java +++ b/server/src/main/java/org/elasticsearch/index/IndexSettings.java @@ -690,7 +690,19 @@ public boolean isES87TSDBCodecEnabled() { false, new Setting.Validator<>() { @Override - public void validate(Boolean value) {} + public void validate(Boolean enabled) { + if (enabled) { + if (TSDB_SYNTHETIC_ID_FEATURE_FLAG == false) { + throw new IllegalArgumentException( + String.format( + Locale.ROOT, + "The setting [%s] is only permitted when the feature flag is enabled.", + USE_SYNTHETIC_ID.getKey() + ) + ); + } + } + } @Override public void validate(Boolean enabled, Map, Object> settings) { @@ -983,7 +995,7 @@ private void setRetentionLeaseMillis(final TimeValue retentionLease) { private final boolean recoverySourceEnabled; private final boolean 
recoverySourceSyntheticEnabled; private final boolean useDocValuesSkipper; - private final boolean tsdbSyntheticId; + private final boolean useTimeSeriesSyntheticId; /** * The maximum number of refresh listeners allows on this shard. @@ -1170,8 +1182,28 @@ public IndexSettings(final IndexMetadata indexMetadata, final Settings nodeSetti && scopedSettings.get(RECOVERY_USE_SYNTHETIC_SOURCE_SETTING); useDocValuesSkipper = DOC_VALUES_SKIPPER && scopedSettings.get(USE_DOC_VALUES_SKIPPER); seqNoIndexOptions = scopedSettings.get(SEQ_NO_INDEX_OPTIONS_SETTING); - tsdbSyntheticId = TSDB_SYNTHETIC_ID_FEATURE_FLAG && scopedSettings.get(USE_SYNTHETIC_ID); - assert tsdbSyntheticId == false || mode == IndexMode.TIME_SERIES : mode; + final var useSyntheticId = scopedSettings.get(USE_SYNTHETIC_ID); + if (indexMetadata.useTimeSeriesSyntheticId() != useSyntheticId) { + assert false; + throw new IllegalArgumentException( + String.format( + Locale.ROOT, + "The setting [%s] is set to [%s] but index metadata has a different value [%s].", + USE_SYNTHETIC_ID.getKey(), + useSyntheticId, + indexMetadata.useTimeSeriesSyntheticId() + ) + ); + } + if (useSyntheticId) { + assert TSDB_SYNTHETIC_ID_FEATURE_FLAG; + assert indexMetadata.useTimeSeriesSyntheticId(); + assert indexMetadata.getIndexMode() == IndexMode.TIME_SERIES : indexMetadata.getIndexMode(); + assert indexMetadata.getCreationVersion().onOrAfter(IndexVersions.TIME_SERIES_USE_SYNTHETIC_ID); + useTimeSeriesSyntheticId = true; + } else { + useTimeSeriesSyntheticId = false; + } if (recoverySourceSyntheticEnabled) { if (DiscoveryNode.isStateless(settings)) { throw new IllegalArgumentException("synthetic recovery source is only allowed in stateful"); @@ -1907,8 +1939,8 @@ public boolean useDocValuesSkipper() { /** * @return Whether the index is a time-series index that use synthetic ids. 
*/ - public boolean useTsdbSyntheticId() { - return tsdbSyntheticId; + public boolean useTimeSeriesSyntheticId() { + return useTimeSeriesSyntheticId; } /** diff --git a/server/src/main/java/org/elasticsearch/index/codec/CodecService.java b/server/src/main/java/org/elasticsearch/index/codec/CodecService.java index 1e2fed61578a5..5d6e377d57db9 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/CodecService.java +++ b/server/src/main/java/org/elasticsearch/index/codec/CodecService.java @@ -67,7 +67,7 @@ public CodecService(@Nullable MapperService mapperService, BigArrays bigArrays) for (String codec : Codec.availableCodecs()) { codecs.put(codec, Codec.forName(codec)); } - final boolean useTsdbSyntheticId = mapperService != null && mapperService.getIndexSettings().useTsdbSyntheticId(); + final boolean useTsdbSyntheticId = mapperService != null && mapperService.getIndexSettings().useTimeSeriesSyntheticId(); assert useTsdbSyntheticId == false || mapperService.getIndexSettings().getMode() == IndexMode.TIME_SERIES; this.codecs = codecs.entrySet().stream().collect(Collectors.toUnmodifiableMap(Map.Entry::getKey, e -> { diff --git a/server/src/main/java/org/elasticsearch/index/engine/InternalEngine.java b/server/src/main/java/org/elasticsearch/index/engine/InternalEngine.java index a97cfca88e253..d9bf37717edc5 100644 --- a/server/src/main/java/org/elasticsearch/index/engine/InternalEngine.java +++ b/server/src/main/java/org/elasticsearch/index/engine/InternalEngine.java @@ -245,7 +245,7 @@ public InternalEngine(EngineConfig engineConfig) { InternalEngine(EngineConfig engineConfig, int maxDocs, BiFunction localCheckpointTrackerSupplier) { super(engineConfig); this.maxDocs = maxDocs; - if (engineConfig.getIndexSettings().useTsdbSyntheticId()) { + if (engineConfig.getIndexSettings().useTimeSeriesSyntheticId()) { logger.info("using TSDB with synthetic id"); useTsdbSyntheticId = true; } else { diff --git 
a/server/src/main/java/org/elasticsearch/index/mapper/TsidExtractingIdFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/TsidExtractingIdFieldMapper.java index 64d834ef3a7bf..bab459269ee04 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/TsidExtractingIdFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/TsidExtractingIdFieldMapper.java @@ -67,7 +67,7 @@ public static BytesRef createField(DocumentParserContext context, RoutingHashBui || id.equals(indexRouting.createId(context.sourceToParse().getXContentType(), context.sourceToParse().source(), suffix)); } else if (context.sourceToParse().routing() != null) { int routingHash = TimeSeriesRoutingHashFieldMapper.decode(context.sourceToParse().routing()); - if (context.indexSettings().useTsdbSyntheticId()) { + if (context.indexSettings().useTimeSeriesSyntheticId()) { id = createSyntheticId(tsid, timestamp, routingHash); } else { id = createId(routingHash, tsid, timestamp); @@ -98,7 +98,7 @@ public static BytesRef createField(DocumentParserContext context, RoutingHashBui context.id(id); final Field idField; - if (context.indexSettings().useTsdbSyntheticId()) { + if (context.indexSettings().useTimeSeriesSyntheticId()) { idField = syntheticIdField(context.id()); } else { idField = standardIdField(context.id()); diff --git a/server/src/main/java/org/elasticsearch/search/DefaultSearchContext.java b/server/src/main/java/org/elasticsearch/search/DefaultSearchContext.java index bf95bc3ccf69f..0b94b3be3650f 100644 --- a/server/src/main/java/org/elasticsearch/search/DefaultSearchContext.java +++ b/server/src/main/java/org/elasticsearch/search/DefaultSearchContext.java @@ -971,7 +971,7 @@ public IdLoader newIdLoader() { } } } - return IdLoader.createTsIdLoader(indexRouting, routingPaths, indexService.getIndexSettings().useTsdbSyntheticId()); + return IdLoader.createTsIdLoader(indexRouting, routingPaths, indexService.getIndexSettings().useTimeSeriesSyntheticId()); } else { 
return IdLoader.fromLeafStoredFieldLoader(); } From 4662f945ea9bad25ca7f313052dc27ab1eace426 Mon Sep 17 00:00:00 2001 From: tlrx Date: Mon, 3 Nov 2025 10:25:46 +0100 Subject: [PATCH 10/20] remove update --- .../datastreams/TSDBSyntheticIdsIT.java | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/modules/data-streams/src/internalClusterTest/java/org/elasticsearch/datastreams/TSDBSyntheticIdsIT.java b/modules/data-streams/src/internalClusterTest/java/org/elasticsearch/datastreams/TSDBSyntheticIdsIT.java index ca9b3c9a33ef3..a2db96cbafeb8 100644 --- a/modules/data-streams/src/internalClusterTest/java/org/elasticsearch/datastreams/TSDBSyntheticIdsIT.java +++ b/modules/data-streams/src/internalClusterTest/java/org/elasticsearch/datastreams/TSDBSyntheticIdsIT.java @@ -229,20 +229,6 @@ enum Operation { ); } - // Update by synthetic _id - // - // Note: it doesn't work, is that expected? Is is blocked by IndexRouting.ExtractFromSource.updateShard - var updateDocId = randomFrom(docs.keySet()); - var updateDocIndex = docs.get(updateDocId); - var exception = expectThrows(IllegalArgumentException.class, () -> { - var doc = document(timestamp, "vm-dev01", "cpu-load", 10); // update - client().prepareUpdate(updateDocIndex, updateDocId).setDoc(doc).get(); - }); - assertThat( - exception.getMessage(), - containsString("update is not supported because the destination index [" + updateDocIndex + "] is in time_series mode") - ); - flush(dataStreamName); // Check that synthetic _id field have no postings on disk From b3428c7fcfde389b57dac4aa8147b58ec54a07c5 Mon Sep 17 00:00:00 2001 From: tlrx Date: Mon, 3 Nov 2025 10:28:52 +0100 Subject: [PATCH 11/20] startDocID >= 0 --- .../index/codec/tsdb/TSDBSyntheticIdFieldsProducer.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/TSDBSyntheticIdFieldsProducer.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/TSDBSyntheticIdFieldsProducer.java 
index 71cd99c00440f..e94a1c3e06984 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/TSDBSyntheticIdFieldsProducer.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/TSDBSyntheticIdFieldsProducer.java @@ -463,7 +463,7 @@ public SeekStatus seekCeil(BytesRef id) throws IOException { // Slow scan to the first document matching the _tsid final int startDocID = docValues.slowScanToFirstDocWithTsIdOrdinalEqualTo(tsIdOrd); - assert 0 <= startDocID : startDocID; + assert startDocID >= 0 : startDocID; int docID = startDocID; int docTsIdOrd = tsIdOrd; From 3f81d6046a48728453b7ede42e3b3271bf404667 Mon Sep 17 00:00:00 2001 From: tlrx Date: Mon, 3 Nov 2025 10:39:03 +0100 Subject: [PATCH 12/20] get from searcher --- .../datastreams/TSDBSyntheticIdsIT.java | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/modules/data-streams/src/internalClusterTest/java/org/elasticsearch/datastreams/TSDBSyntheticIdsIT.java b/modules/data-streams/src/internalClusterTest/java/org/elasticsearch/datastreams/TSDBSyntheticIdsIT.java index a2db96cbafeb8..79892d1a3dd89 100644 --- a/modules/data-streams/src/internalClusterTest/java/org/elasticsearch/datastreams/TSDBSyntheticIdsIT.java +++ b/modules/data-streams/src/internalClusterTest/java/org/elasticsearch/datastreams/TSDBSyntheticIdsIT.java @@ -325,6 +325,23 @@ public void testGetFromTranslogBySyntheticId() throws Exception { flushAndRefresh(dataStreamName); + // Get by synthetic _id + // + // Here we exercise the get-from-searcher and VersionsAndSeqNoResolver.timeSeriesLoadDocIdAndVersion paths. 
+ randomDocs = randomSubsetOf(randomIntBetween(1, results.length), results); + for (var doc : randomDocs) { + var getResponse = client().prepareGet(doc.getIndex(), doc.getId()) + .setRealtime(randomBoolean()) + .setFetchSource(true) + .execute() + .actionGet(); + assertThat(getResponse.isExists(), equalTo(true)); + assertThat(getResponse.getVersion(), equalTo(1L)); + + var source = asInstanceOf(Map.class, getResponse.getSourceAsMap().get("metric")); + assertThat(asInstanceOf(Integer.class, source.get("value")), equalTo(metricOffset + doc.getItemId())); + } + // Check that synthetic _id field have no postings on disk var indices = new HashSet<>(docs.values()); for (var index : indices) { From 15a1e4c19db69e03436763944dc85842eab3b65c Mon Sep 17 00:00:00 2001 From: tlrx Date: Mon, 3 Nov 2025 10:43:42 +0100 Subject: [PATCH 13/20] remove comment --- .../index/codec/tsdb/TSDBSyntheticIdFieldsProducer.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/TSDBSyntheticIdFieldsProducer.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/TSDBSyntheticIdFieldsProducer.java index e94a1c3e06984..f66d1e3508870 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/TSDBSyntheticIdFieldsProducer.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/TSDBSyntheticIdFieldsProducer.java @@ -287,7 +287,7 @@ private BytesRef lookupTsIdOrd(int tsIdOrdinal) throws IOException { */ private int slowScanToFirstDocWithTsIdOrdinalEqualOrGreaterThan(int tsIdOrd) throws IOException { // recreate even if doc values are already on the same ordinal, to ensure the method returns the first doc - if (tsIdDocValues == null || (cachedTsIdOrd != -1 && cachedTsIdOrd >= tsIdOrd)) { // can't use tsIdDocValues.ordValue() here?? 
+ if (tsIdDocValues == null || (cachedTsIdOrd != -1 && cachedTsIdOrd >= tsIdOrd)) { tsIdDocValues = docValuesProducer.getSorted(tsIdFieldInfo); cachedTsIdOrd = -1; cachedTsId = null; @@ -320,7 +320,7 @@ private int slowScanToFirstDocWithTsIdOrdinalEqualOrGreaterThan(int tsIdOrd) thr */ private int slowScanToFirstDocWithTsIdOrdinalEqualTo(int tsIdOrd) throws IOException { // recreate even if doc values are already on the same ordinal, to ensure the method returns the first doc - if (tsIdDocValues == null || (cachedTsIdOrd != -1 && cachedTsIdOrd >= tsIdOrd)) { // can't use tsIdDocValues.ordValue() here?? + if (tsIdDocValues == null || (cachedTsIdOrd != -1 && cachedTsIdOrd >= tsIdOrd)) { tsIdDocValues = docValuesProducer.getSorted(tsIdFieldInfo); cachedTsIdOrd = -1; cachedTsId = null; From d71316d39ec6553bdf487f13a03a6ee597b3b5fc Mon Sep 17 00:00:00 2001 From: tlrx Date: Mon, 3 Nov 2025 12:56:14 +0100 Subject: [PATCH 14/20] timestamp --- .../org/elasticsearch/index/IndexSortConfig.java | 12 +----------- .../codec/tsdb/TSDBSyntheticIdFieldsProducer.java | 4 ++-- .../index/mapper/TsidExtractingIdFieldMapper.java | 12 ++++++++---- 3 files changed, 11 insertions(+), 17 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/IndexSortConfig.java b/server/src/main/java/org/elasticsearch/index/IndexSortConfig.java index d3248c76b14db..fd445d470837c 100644 --- a/server/src/main/java/org/elasticsearch/index/IndexSortConfig.java +++ b/server/src/main/java/org/elasticsearch/index/IndexSortConfig.java @@ -107,7 +107,7 @@ public final class IndexSortConfig { ); public static class IndexSortConfigDefaults { - public static final FieldSortSpec[] TIME_SERIES_SORT, TIME_SERIES_WITH_SYNTHETIC_ID_SORT, HOSTNAME_TIMESTAMP_BWC_SORT; + public static final FieldSortSpec[] TIME_SERIES_SORT, HOSTNAME_TIMESTAMP_BWC_SORT; private static final FieldSortSpec HOSTNAME_SPEC, MESSAGE_PATTERN_SPEC, TIMESTAMP_SPEC; @@ -116,10 +116,6 @@ public static class IndexSortConfigDefaults { 
TIMESTAMP_SPEC.order = SortOrder.DESC; TIME_SERIES_SORT = new FieldSortSpec[] { new FieldSortSpec(TimeSeriesIdFieldMapper.NAME), TIMESTAMP_SPEC }; - TIME_SERIES_WITH_SYNTHETIC_ID_SORT = new FieldSortSpec[] { - new FieldSortSpec(TimeSeriesIdFieldMapper.NAME), - new FieldSortSpec(DataStreamTimestampFieldMapper.DEFAULT_PATH) }; - HOSTNAME_SPEC = new FieldSortSpec(IndexMode.HOST_NAME); HOSTNAME_SPEC.order = SortOrder.ASC; HOSTNAME_SPEC.missingValue = "_last"; @@ -147,12 +143,6 @@ public static FieldSortSpec[] getDefaultSortSpecs(Settings settings) { } if (IndexMode.TIME_SERIES.getName().equals(indexMode)) { - if (IndexSettings.TSDB_SYNTHETIC_ID_FEATURE_FLAG) { - var useSyntheticId = settings.get(IndexSettings.USE_SYNTHETIC_ID.getKey()); - if (useSyntheticId != null && useSyntheticId.equalsIgnoreCase(Boolean.TRUE.toString())) { - return TIME_SERIES_WITH_SYNTHETIC_ID_SORT; - } - } return TIME_SERIES_SORT; } else if (IndexMode.LOGSDB.getName().equals(indexMode)) { var version = IndexMetadata.SETTING_INDEX_VERSION_CREATED.get(settings); diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/TSDBSyntheticIdFieldsProducer.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/TSDBSyntheticIdFieldsProducer.java index f66d1e3508870..1431c10331d69 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/TSDBSyntheticIdFieldsProducer.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/TSDBSyntheticIdFieldsProducer.java @@ -150,7 +150,7 @@ private static class DocValuesHolder { private final FieldInfo routingHashFieldInfo; private final DocValuesProducer docValuesProducer; - private SortedNumericDocValues timestampDocValues; // sorted asc. order + private SortedNumericDocValues timestampDocValues; // sorted desc. order private SortedDocValues routingHashDocValues; // sorted asc. order private SortedDocValues tsIdDocValues; // sorted asc. 
order // Keep around the latest tsId ordinal and value @@ -482,7 +482,7 @@ public SeekStatus seekCeil(BytesRef id) throws IOException { return SeekStatus.FOUND; } // Remaining docs don't match, stop here - if (tsIdOrd < docTsIdOrd || timestamp < docTimestamp) { + if (tsIdOrd < docTsIdOrd || docTimestamp < timestamp) { break; } } diff --git a/server/src/main/java/org/elasticsearch/index/mapper/TsidExtractingIdFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/TsidExtractingIdFieldMapper.java index bab459269ee04..f5cbfb5cd71cb 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/TsidExtractingIdFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/TsidExtractingIdFieldMapper.java @@ -153,7 +153,7 @@ public static String createId( } public static BytesRef createSyntheticIdBytesRef(BytesRef tsid, long timestamp, int routingHash) { - // A synthetic _id is the concatenation of [_tsid (non-fixed length) + timestamp (8 bytes) + routing hash (4 bytes)]. + // A synthetic _id has the format: [_tsid (non-fixed length) + (Long.MAX_VALUE - timestamp) (8 bytes) + routing hash (4 bytes)]. // We dont' use hashing here because we need to be able to extract the concatenated values from the _id in various places, like // when applying doc values updates in Lucene, or when routing GET or DELETE requests to the corresponding shard, or when replaying // translog operations. Since the synthetic _id is not indexed and not really stored on disk we consider it fine if it is longer @@ -161,10 +161,11 @@ public static BytesRef createSyntheticIdBytesRef(BytesRef tsid, long timestamp, // // Also, when applying doc values updates Lucene expects _id to be sorted: it stops applying updates for a term "_id:ABC" if it // seeks to a term "BCD" as it knows there won't be more documents matching "_id:ABC" past the term "BCD". 
So it is important to - // generate an _id that reflects the ordering of the terms it is synthesized from, ie _tsid and @timestamp. + // generate an _id as a byte array whose lexicographical order reflects the order of the documents in the segment. For this reason, + // the timestamp is stored in the synthetic _id as (Long.MAX_VALUE - timestamp). byte[] bytes = new byte[tsid.length + Long.BYTES + Integer.BYTES]; System.arraycopy(tsid.bytes, tsid.offset, bytes, 0, tsid.length); - ByteUtils.writeLongBE(timestamp, bytes, tsid.length); // Big Endian as we want to most significant byte first + ByteUtils.writeLongBE(Long.MAX_VALUE - timestamp, bytes, tsid.length); // Big Endian as we want to most significant byte first ByteUtils.writeIntBE(routingHash, bytes, tsid.length + Long.BYTES); return new BytesRef(bytes); } @@ -185,7 +186,10 @@ public static BytesRef extractTimeSeriesIdFromSyntheticId(BytesRef id) { public static long extractTimestampFromSyntheticId(BytesRef id) { assert id.length > Long.BYTES + Integer.BYTES; // See #createSyntheticId - return ByteUtils.readLongBE(id.bytes, id.offset + id.length - Long.BYTES - Integer.BYTES); + long delta = ByteUtils.readLongBE(id.bytes, id.offset + id.length - Long.BYTES - Integer.BYTES); + long timestamp = Long.MAX_VALUE - delta; + assert timestamp >= 0 : delta; + return timestamp; } public static int extractRoutingHashFromSyntheticId(BytesRef id) { From 96eb36a70a132a75f054a3209e52be5a55901720 Mon Sep 17 00:00:00 2001 From: Tanguy Leroux Date: Mon, 3 Nov 2025 12:58:55 +0100 Subject: [PATCH 15/20] Update docs/changelog/137274.yaml --- docs/changelog/137274.yaml | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 docs/changelog/137274.yaml diff --git a/docs/changelog/137274.yaml b/docs/changelog/137274.yaml new file mode 100644 index 0000000000000..c26c0940f4a51 --- /dev/null +++ b/docs/changelog/137274.yaml @@ -0,0 +1,5 @@ +pr: 137274 +summary: Use a new synthetic `_id` format for time-series datastreams +area: TSDB 
+type: enhancement +issues: [] From 136a267e626d9c3b6176bedde6007de3ad1bce09 Mon Sep 17 00:00:00 2001 From: tlrx Date: Mon, 3 Nov 2025 13:56:15 +0100 Subject: [PATCH 16/20] ensure no postings --- .../datastreams/TSDBSyntheticIdsIT.java | 2 + .../codec/tsdb/TSDBSyntheticIdCodec.java | 80 ++++++++++++++++--- 2 files changed, 71 insertions(+), 11 deletions(-) diff --git a/modules/data-streams/src/internalClusterTest/java/org/elasticsearch/datastreams/TSDBSyntheticIdsIT.java b/modules/data-streams/src/internalClusterTest/java/org/elasticsearch/datastreams/TSDBSyntheticIdsIT.java index 79892d1a3dd89..1de17ba893916 100644 --- a/modules/data-streams/src/internalClusterTest/java/org/elasticsearch/datastreams/TSDBSyntheticIdsIT.java +++ b/modules/data-streams/src/internalClusterTest/java/org/elasticsearch/datastreams/TSDBSyntheticIdsIT.java @@ -25,6 +25,7 @@ import org.elasticsearch.common.util.set.Sets; import org.elasticsearch.index.IndexMode; import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.engine.EngineConfig; import org.elasticsearch.index.mapper.IdFieldMapper; import org.elasticsearch.index.query.TermQueryBuilder; import org.elasticsearch.plugins.Plugin; @@ -388,6 +389,7 @@ private static void putDataStreamTemplate(String indexPattern, int shards) throw final var settings = indexSettings(shards, 0).put(IndexSettings.MODE.getKey(), IndexMode.TIME_SERIES.getName()) .put(IndexSettings.BLOOM_FILTER_ID_FIELD_ENABLED_SETTING.getKey(), false) .put(IndexSettings.INDEX_REFRESH_INTERVAL_SETTING.getKey(), -1) + .put(EngineConfig.USE_COMPOUND_FILE, false) .put(IndexSettings.USE_SYNTHETIC_ID.getKey(), true); final var mappings = """ diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/TSDBSyntheticIdCodec.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/TSDBSyntheticIdCodec.java index 4d885fbc88e1a..f44741c0e6c68 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/TSDBSyntheticIdCodec.java +++ 
b/server/src/main/java/org/elasticsearch/index/codec/tsdb/TSDBSyntheticIdCodec.java @@ -11,12 +11,19 @@ import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.FieldInfosFormat; +import org.apache.lucene.codecs.FieldsConsumer; +import org.apache.lucene.codecs.FieldsProducer; import org.apache.lucene.codecs.FilterCodec; +import org.apache.lucene.codecs.NormsProducer; +import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.Fields; import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; import org.elasticsearch.index.mapper.SyntheticIdField; @@ -36,21 +43,25 @@ * of terms and postings on the field (now called a "synthetic _id" field) as if it was backed by an in inverted index. *

*

- * In order to do this, it enforces synthetic _id fields to be indexed with the {@link IndexOptions#NONE} option, hence preventing the - * building of a term dictionary with postings lists. The codec also changes this {@link IndexOptions#NONE} option back to - * {@link IndexOptions#DOCS} when reading the {@link FieldInfos} during the opening of a new segment core reader. This allows to use a - * Lucene term dictionary on top of a synthetic _id field that does not have corresponding postings files on disk. Finally, the codec - * injects additional {@link FieldInfos} attributes so that Lucene's {@link PerFieldPostingsFormat} correctly instantiates a - * {@link TSDBSyntheticIdPostingsFormat} to access the term and postings of the synthetic _id field. + * In order to do this, it wraps the default postings format with an implementation that throws an {@link IllegalArgumentException} if + * a Lucene field with the name {@code _id} produces terms (ie, has postings) during indexing. It also overwrites the {@link FieldInfos} + * to ensure that the {@code _id} field information has the {@link IndexOptions#NONE} option when written to disk. It also changes this + * {@link IndexOptions#NONE} option back to {@link IndexOptions#DOCS} when reading the {@link FieldInfos} during the opening of a new + * segment core reader. This allows to use a Lucene term dictionary on top of a synthetic _id field that does not have corresponding + * postings files on disk. Finally, the codec injects additional {@link FieldInfos} attributes so that Lucene's + * {@link PerFieldPostingsFormat} correctly instantiates a {@link TSDBSyntheticIdPostingsFormat} to access the term and postings of the + * synthetic _id field. *

*/ public class TSDBSyntheticIdCodec extends FilterCodec { - private final TSDBSyntheticIdFieldInfosFormat fieldInfosFormat; + private final RewriteFieldInfosFormat fieldInfosFormat; + private final EnsureNoPostingsFormat postingsFormat; public TSDBSyntheticIdCodec(String name, Codec delegate) { super(name, delegate); - this.fieldInfosFormat = new TSDBSyntheticIdFieldInfosFormat(delegate.fieldInfosFormat()); + this.fieldInfosFormat = new RewriteFieldInfosFormat(delegate.fieldInfosFormat()); + this.postingsFormat = new EnsureNoPostingsFormat(delegate.postingsFormat()); } @Override @@ -58,14 +69,19 @@ public final FieldInfosFormat fieldInfosFormat() { return fieldInfosFormat; } + @Override + public PostingsFormat postingsFormat() { + return postingsFormat; + } + /** - * {@link FieldInfosFormat} that ensures the _id field is synthetic + * {@link FieldInfosFormat} that overwrites the {@link FieldInfos}. */ - private static class TSDBSyntheticIdFieldInfosFormat extends FieldInfosFormat { + private static class RewriteFieldInfosFormat extends FieldInfosFormat { private final FieldInfosFormat delegate; - private TSDBSyntheticIdFieldInfosFormat(FieldInfosFormat delegate) { + private RewriteFieldInfosFormat(FieldInfosFormat delegate) { this.delegate = delegate; } @@ -206,4 +222,46 @@ public FieldInfos read(Directory directory, SegmentInfo segmentInfo, String segm return new FieldInfos(infos); } } + + /** + * {@link PostingsFormat} that throws an {@link IllegalArgumentException} if a Lucene field with the name {@code _id} has postings + * produces during indexing. 
+ */ + private static class EnsureNoPostingsFormat extends PostingsFormat { + + private final PostingsFormat delegate; + + private EnsureNoPostingsFormat(PostingsFormat delegate) { + super(delegate.getName()); + this.delegate = delegate; + } + + @Override + public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { + final var consumer = delegate.fieldsConsumer(state); + return new FieldsConsumer() { + @Override + public void write(Fields fields, NormsProducer norms) throws IOException { + for (var field : fields) { + if (SYNTHETIC_ID.equalsIgnoreCase(field)) { + var message = "Field [" + SYNTHETIC_ID + "] has terms produced during indexing"; + assert false : message; + throw new IllegalArgumentException(message); + } + } + consumer.write(fields, norms); + } + + @Override + public void close() throws IOException { + consumer.close(); + } + }; + } + + @Override + public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { + return delegate.fieldsProducer(state); + } + } } From 3b22c46861606b393b4b9c10e67f0b6b048ba517 Mon Sep 17 00:00:00 2001 From: tlrx Date: Mon, 3 Nov 2025 15:50:13 +0100 Subject: [PATCH 17/20] remove sort --- .../org/elasticsearch/index/IndexMode.java | 45 ++++++++----------- 1 file changed, 18 insertions(+), 27 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/index/IndexMode.java b/server/src/main/java/org/elasticsearch/index/IndexMode.java index b3df1bfd8ea5a..10e604126f934 100644 --- a/server/src/main/java/org/elasticsearch/index/IndexMode.java +++ b/server/src/main/java/org/elasticsearch/index/IndexMode.java @@ -42,13 +42,15 @@ import java.io.IOException; import java.time.Instant; import java.util.Arrays; -import java.util.HashSet; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Objects; import java.util.function.BooleanSupplier; import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static 
java.util.stream.Collectors.toSet; /** * "Mode" that controls which behaviors and settings an index supports. @@ -139,16 +141,7 @@ void validateWithOtherSettings(Map, Object> settings) { throw new IllegalArgumentException(error(IndexMetadata.INDEX_ROUTING_PARTITION_SIZE_SETTING)); } - Settings settingsWithIndexMode; - if (IndexSettings.TSDB_SYNTHETIC_ID_FEATURE_FLAG) { - settingsWithIndexMode = Settings.builder() - .put(IndexSettings.MODE.getKey(), getName()) - // Default values of some index sort settings depend of the feature flag and USE_SYNTHETIC_ID setting - .put(IndexSettings.USE_SYNTHETIC_ID.getKey(), (Boolean) settings.get(IndexSettings.USE_SYNTHETIC_ID)) - .build(); - } else { - settingsWithIndexMode = Settings.builder().put(IndexSettings.MODE.getKey(), getName()).build(); - } + var settingsWithIndexMode = Settings.builder().put(IndexSettings.MODE.getKey(), getName()).build(); for (Setting unsupported : TIME_SERIES_UNSUPPORTED) { if (false == Objects.equals(unsupported.getDefault(settingsWithIndexMode), settings.get(unsupported))) { @@ -467,22 +460,20 @@ private static CompressedXContent createDefaultMapping(boolean includeHostName) IndexSortConfig.INDEX_SORT_MISSING_SETTING ); - static final List> VALIDATE_WITH_SETTINGS; - static { - var settings = new HashSet>(); - settings.add(IndexMetadata.INDEX_NUMBER_OF_SHARDS_SETTING); - settings.add(IndexMetadata.INDEX_ROUTING_PARTITION_SIZE_SETTING); - settings.add(IndexMetadata.INDEX_ROUTING_PATH); - settings.add(IndexMetadata.INDEX_DIMENSIONS); - settings.add(IndexSettings.LOGSDB_ROUTE_ON_SORT_FIELDS); - settings.add(IndexSettings.TIME_SERIES_START_TIME); - settings.add(IndexSettings.TIME_SERIES_END_TIME); - if (IndexSettings.TSDB_SYNTHETIC_ID_FEATURE_FLAG) { - settings.add(IndexSettings.USE_SYNTHETIC_ID); - } - settings.addAll(TIME_SERIES_UNSUPPORTED); - VALIDATE_WITH_SETTINGS = List.copyOf(settings); - } + static final List> VALIDATE_WITH_SETTINGS = List.copyOf( + Stream.concat( + Stream.of( + 
IndexMetadata.INDEX_NUMBER_OF_SHARDS_SETTING, + IndexMetadata.INDEX_ROUTING_PARTITION_SIZE_SETTING, + IndexMetadata.INDEX_ROUTING_PATH, + IndexMetadata.INDEX_DIMENSIONS, + IndexSettings.LOGSDB_ROUTE_ON_SORT_FIELDS, + IndexSettings.TIME_SERIES_START_TIME, + IndexSettings.TIME_SERIES_END_TIME + ), + TIME_SERIES_UNSUPPORTED.stream() + ).collect(toSet()) + ); private final String name; From 546e23bc0c31c3730e3af393f9f62886161b0429 Mon Sep 17 00:00:00 2001 From: tlrx Date: Tue, 4 Nov 2025 10:33:53 +0100 Subject: [PATCH 18/20] remove compound --- .../java/org/elasticsearch/datastreams/TSDBSyntheticIdsIT.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/modules/data-streams/src/internalClusterTest/java/org/elasticsearch/datastreams/TSDBSyntheticIdsIT.java b/modules/data-streams/src/internalClusterTest/java/org/elasticsearch/datastreams/TSDBSyntheticIdsIT.java index 1de17ba893916..79892d1a3dd89 100644 --- a/modules/data-streams/src/internalClusterTest/java/org/elasticsearch/datastreams/TSDBSyntheticIdsIT.java +++ b/modules/data-streams/src/internalClusterTest/java/org/elasticsearch/datastreams/TSDBSyntheticIdsIT.java @@ -25,7 +25,6 @@ import org.elasticsearch.common.util.set.Sets; import org.elasticsearch.index.IndexMode; import org.elasticsearch.index.IndexSettings; -import org.elasticsearch.index.engine.EngineConfig; import org.elasticsearch.index.mapper.IdFieldMapper; import org.elasticsearch.index.query.TermQueryBuilder; import org.elasticsearch.plugins.Plugin; @@ -389,7 +388,6 @@ private static void putDataStreamTemplate(String indexPattern, int shards) throw final var settings = indexSettings(shards, 0).put(IndexSettings.MODE.getKey(), IndexMode.TIME_SERIES.getName()) .put(IndexSettings.BLOOM_FILTER_ID_FIELD_ENABLED_SETTING.getKey(), false) .put(IndexSettings.INDEX_REFRESH_INTERVAL_SETTING.getKey(), -1) - .put(EngineConfig.USE_COMPOUND_FILE, false) .put(IndexSettings.USE_SYNTHETIC_ID.getKey(), true); final var mappings = """ From 
3655dc314a036320367f639e237140de7dcd5b0b Mon Sep 17 00:00:00 2001 From: tlrx Date: Wed, 5 Nov 2025 10:35:40 +0100 Subject: [PATCH 19/20] feedback --- .../datastreams/TSDBSyntheticIdsIT.java | 29 +++++++++++++++++++ .../codec/tsdb/TSDBSyntheticIdCodec.java | 2 +- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/modules/data-streams/src/internalClusterTest/java/org/elasticsearch/datastreams/TSDBSyntheticIdsIT.java b/modules/data-streams/src/internalClusterTest/java/org/elasticsearch/datastreams/TSDBSyntheticIdsIT.java index 79892d1a3dd89..654051b9e13f5 100644 --- a/modules/data-streams/src/internalClusterTest/java/org/elasticsearch/datastreams/TSDBSyntheticIdsIT.java +++ b/modules/data-streams/src/internalClusterTest/java/org/elasticsearch/datastreams/TSDBSyntheticIdsIT.java @@ -109,6 +109,7 @@ public void testSyntheticId() throws Exception { final var docs = new HashMap(); final var unit = randomFrom(ChronoUnit.SECONDS, ChronoUnit.MINUTES); final var timestamp = Instant.now(); + logger.info("timestamp is " + timestamp); // Index 10 docs in datastream // @@ -200,6 +201,34 @@ enum Operation { assertThat(deleteResponse.getVersion(), equalTo(2L)); } + // Index more random docs + if (randomBoolean()) { + int nbDocs = randomIntBetween(1, 100); + final var arrayOfDocs = new XContentBuilder[nbDocs]; + + var t = timestamp.plus(4, unit); // t + 4s, no overlap with previous docs + while (nbDocs > 0) { + var hosts = randomSubsetOf(List.of("vm-dev01", "vm-dev02", "vm-dev03")); + for (var host : hosts) { + if (--nbDocs < 0) { + break; + } + arrayOfDocs[nbDocs] = document(t, host, "cpu-load", randomInt(10)); + } + // always use seconds, otherwise the doc might fall outside of the timestamps window of the datastream + t = t.plus(1, ChronoUnit.SECONDS); + } + + results = createDocuments(dataStreamName, arrayOfDocs); + + // Verify that documents are created + for (var result : results) { + assertThat(result.getResponse().getResult(), 
equalTo(DocWriteResponse.Result.CREATED)); + assertThat(result.getVersion(), equalTo(1L)); + docs.put(result.getId(), result.getIndex()); + } + } + refresh(dataStreamName); assertCheckedResponse(client().prepareSearch(dataStreamName).setTrackTotalHits(true).setSize(100), searchResponse -> { diff --git a/server/src/main/java/org/elasticsearch/index/codec/tsdb/TSDBSyntheticIdCodec.java b/server/src/main/java/org/elasticsearch/index/codec/tsdb/TSDBSyntheticIdCodec.java index f44741c0e6c68..aa6936cb65df9 100644 --- a/server/src/main/java/org/elasticsearch/index/codec/tsdb/TSDBSyntheticIdCodec.java +++ b/server/src/main/java/org/elasticsearch/index/codec/tsdb/TSDBSyntheticIdCodec.java @@ -225,7 +225,7 @@ public FieldInfos read(Directory directory, SegmentInfo segmentInfo, String segm /** * {@link PostingsFormat} that throws an {@link IllegalArgumentException} if a Lucene field with the name {@code _id} has postings - * produces during indexing. + * produced during indexing. */ private static class EnsureNoPostingsFormat extends PostingsFormat { From 608ff674cd870bd5574c62682139de356f081672 Mon Sep 17 00:00:00 2001 From: tlrx Date: Wed, 5 Nov 2025 13:23:12 +0100 Subject: [PATCH 20/20] fix setting registration --- .../org/elasticsearch/cluster/metadata/IndexMetadata.java | 4 +++- .../src/main/java/org/elasticsearch/index/IndexSettings.java | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/metadata/IndexMetadata.java b/server/src/main/java/org/elasticsearch/cluster/metadata/IndexMetadata.java index 6fee4b39dbe22..094626018d449 100644 --- a/server/src/main/java/org/elasticsearch/cluster/metadata/IndexMetadata.java +++ b/server/src/main/java/org/elasticsearch/cluster/metadata/IndexMetadata.java @@ -2514,7 +2514,9 @@ IndexMetadata build(boolean repair) { final IndexMode indexMode = indexModeString != null ? 
IndexMode.fromString(indexModeString.toLowerCase(Locale.ROOT)) : null; final boolean isTsdb = indexMode == IndexMode.TIME_SERIES; boolean useTimeSeriesSyntheticId = false; - if (isTsdb && indexCreatedVersion.onOrAfter(IndexVersions.TIME_SERIES_USE_SYNTHETIC_ID)) { + if (isTsdb + && IndexSettings.TSDB_SYNTHETIC_ID_FEATURE_FLAG + && indexCreatedVersion.onOrAfter(IndexVersions.TIME_SERIES_USE_SYNTHETIC_ID)) { var setting = settings.get(IndexSettings.USE_SYNTHETIC_ID.getKey()); if (setting != null && setting.equalsIgnoreCase(Boolean.TRUE.toString())) { assert IndexSettings.TSDB_SYNTHETIC_ID_FEATURE_FLAG; diff --git a/server/src/main/java/org/elasticsearch/index/IndexSettings.java b/server/src/main/java/org/elasticsearch/index/IndexSettings.java index f7af6bf098761..81b4bbab69756 100644 --- a/server/src/main/java/org/elasticsearch/index/IndexSettings.java +++ b/server/src/main/java/org/elasticsearch/index/IndexSettings.java @@ -1182,7 +1182,7 @@ public IndexSettings(final IndexMetadata indexMetadata, final Settings nodeSetti && scopedSettings.get(RECOVERY_USE_SYNTHETIC_SOURCE_SETTING); useDocValuesSkipper = DOC_VALUES_SKIPPER && scopedSettings.get(USE_DOC_VALUES_SKIPPER); seqNoIndexOptions = scopedSettings.get(SEQ_NO_INDEX_OPTIONS_SETTING); - final var useSyntheticId = scopedSettings.get(USE_SYNTHETIC_ID); + final var useSyntheticId = IndexSettings.TSDB_SYNTHETIC_ID_FEATURE_FLAG && scopedSettings.get(USE_SYNTHETIC_ID); if (indexMetadata.useTimeSeriesSyntheticId() != useSyntheticId) { assert false; throw new IllegalArgumentException(