diff --git a/docs/changelog/138029.yaml b/docs/changelog/138029.yaml new file mode 100644 index 0000000000000..977bc60d06daf --- /dev/null +++ b/docs/changelog/138029.yaml @@ -0,0 +1,5 @@ +pr: 138029 +summary: Fuse MV_MIN and MV_MAX and document process +area: ES|QL +type: feature +issues: [] diff --git a/muted-tests.yml b/muted-tests.yml index 235b1f60de5cd..16d503633bc78 100644 --- a/muted-tests.yml +++ b/muted-tests.yml @@ -438,6 +438,9 @@ tests: - class: org.elasticsearch.xpack.esql.heap_attack.HeapAttackLookupJoinIT method: testLookupExplosionBigString issue: https://github.com/elastic/elasticsearch/issues/138510 +- class: org.elasticsearch.xpack.esql.qa.single_node.GenerativeForkIT + method: test {csv-spec:inlinestats.MvMinMvExpand} + issue: https://github.com/elastic/elasticsearch/issues/137679 - class: org.elasticsearch.xpack.esql.optimizer.rules.physical.local.SubstituteRoundToTests method: testSubqueryWithCountStarAndDateTrunc {default} issue: https://github.com/elastic/elasticsearch/issues/138601 diff --git a/server/src/main/java/org/elasticsearch/index/mapper/BlockLoader.java b/server/src/main/java/org/elasticsearch/index/mapper/BlockLoader.java index 8e5d247aaccb0..341d842a9f7b2 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/BlockLoader.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/BlockLoader.java @@ -17,6 +17,8 @@ import org.elasticsearch.core.Nullable; import org.elasticsearch.core.Releasable; import org.elasticsearch.index.mapper.blockloader.docvalues.BlockDocValuesReader; +import org.elasticsearch.index.mapper.blockloader.docvalues.fn.MvMaxLongsFromDocValuesBlockLoader; +import org.elasticsearch.index.mapper.blockloader.docvalues.fn.MvMinLongsFromDocValuesBlockLoader; import org.elasticsearch.search.fetch.StoredFieldsSpec; import org.elasticsearch.search.lookup.Source; @@ -25,8 +27,139 @@ import java.util.Map; /** - * Interface for loading data in a block shape. Instances of this class - * must be immutable and thread safe. + * Loads values from a chunk of lucene documents into a "Block" for the compute engine. + *

+ * Think of a Block as an array of values for a sequence of lucene documents. That's + * almost true! For the purposes of implementing {@link BlockLoader}, it's close enough. + * The compute engine operates on arrays because the good folks that build CPUs have + * spent the past 40 years making them really really good at running tight loops over + * arrays of data. So we play along with the CPU and make arrays. + *

+ * <h2>How to implement</h2>

+ * There are a lot of interesting choices hiding in here to make getting those arrays + * out of lucene work well: + *

+ * <h2>How many to implement</h2>

+ * Generally reads are faster from {@code doc_values}, slower from {@code stored} fields, + * and even slower from {@code _source}. If we get to choose, we pick {@code doc_values}. + * But we work with what's on disk, and that's a product of the field type and what the user + * has configured. Picking the optimal choice given what's on disk is the responsibility of each + * field's {@link MappedFieldType#blockLoader} method. The more configurable the field's + * storage strategies, the more {@link BlockLoader}s you have to implement to integrate it + * with ESQL. It can get to be a lot. Sorry. + *

+ *

+ * For a field to be fully supported by ESQL it has to be loadable no matter how it was + * configured to be stored. It's possible to turn off storage entirely by turning off + * {@code doc_values} and {@code _source} and {@code stored} fields. In that case, it's + * acceptable to return {@link ConstantNullsReader}. The user turned the field off; the best + * we can do is {@code null}. + *

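To make the storage-preference decision concrete, here's a toy sketch with hypothetical names and simplified types; the real method is {@code MappedFieldType#blockLoader} and returns real {@code BlockLoader} implementations, as the mapper hunks later in this diff show:

```java
// Toy sketch: prefer doc_values, then stored fields, then _source, and fall back
// to an "all nulls" loader when the user disabled storage entirely.
final class PickLoaderSketch {
    interface ToyBlockLoader {}

    record DocValuesLoader() implements ToyBlockLoader {}     // fastest
    record StoredFieldsLoader() implements ToyBlockLoader {}  // slower: decompress stored fields
    record SourceLoader() implements ToyBlockLoader {}        // slowest: re-parse _source
    record ConstantNullsLoader() implements ToyBlockLoader {} // storage turned off entirely

    static ToyBlockLoader blockLoader(boolean hasDocValues, boolean isStored, boolean hasSource) {
        if (hasDocValues) {
            return new DocValuesLoader();
        }
        if (isStored) {
            return new StoredFieldsLoader();
        }
        if (hasSource) {
            return new SourceLoader();
        }
        return new ConstantNullsLoader();
    }
}
```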
+ *

+ * We also sometimes want to "push" the execution of some ESQL functions into the block loader + * itself. Usually we do this when it's a ton faster. See the docs for {@code BlockLoaderExpression} + * for why and how we do this. + *

+ *

+ * For example, {@code long} fields implement these block loaders: + *

+ * <ul>
+ *     <li>a plain {@code doc_values} loader</li>
+ *     <li>{@link MvMinLongsFromDocValuesBlockLoader} and {@link MvMaxLongsFromDocValuesBlockLoader},
+ *         which fuse {@code MV_MIN} and {@code MV_MAX} into the {@code doc_values} load</li>
+ *     <li>loaders that read from {@code _source}, including fallback synthetic {@code _source}</li>
+ * </ul>

+ * NOTE: We can't read {@code long}s from {@code stored} fields, which is a bug, but maybe + * not a terrible one because it's very uncommon to configure {@code long} to be {@code stored} + * but to disable {@code _source} and {@code doc_values}. Nothing's perfect. Especially code. + *

+ * <h2>Why is there an {@link AllReader}?</h2>

+ * When we described how to read from {@code doc_values} we said we prefer + * to use {@link ColumnAtATimeReader}. But some callers don't support reading column-at-a-time + * and need to read row-by-row. So we also need an implementation of {@link RowStrideReader} + * that reads from {@code doc_values}. Usually it's most convenient to implement both of those + * in the same {@code class}. {@link AllReader} is an interface for those sorts of classes, and + * you'll see it in the {@code doc_values} code frequently. + *

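Here's a minimal, self-contained sketch of that shape, using simplified stand-in interfaces; the real ones hang off {@code BlockLoader}, take a {@code LeafReaderContext}, and build {@code Block}s:

```java
// Both reading styles share the same cursor and decoding logic, so one class
// conveniently implements both; that's all AllReader is.
import java.util.ArrayList;
import java.util.List;

interface ToyColumnAtATimeReader { List<Long> readBlock(int firstDoc, int count); }
interface ToyRowStrideReader { void readRow(int doc, List<Long> target); }
interface ToyAllReader extends ToyColumnAtATimeReader, ToyRowStrideReader {}

final class LongsFromDocValuesSketch implements ToyAllReader {
    private final long[] docValues; // stand-in for a lucene doc-values cursor

    LongsFromDocValuesSketch(long[] docValues) {
        this.docValues = docValues;
    }

    @Override
    public List<Long> readBlock(int firstDoc, int count) {
        // Column-at-a-time: the tight loop the compute engine prefers.
        List<Long> block = new ArrayList<>(count);
        for (int doc = firstDoc; doc < firstDoc + count; doc++) {
            block.add(docValues[doc]);
        }
        return block;
    }

    @Override
    public void readRow(int doc, List<Long> target) {
        // Row stride: same decoding, one document at a time.
        target.add(docValues[doc]);
    }
}
```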
+ * <h2>Why is there a {@link #rowStrideStoredFieldSpec}?</h2>

+ * When decompressing {@code stored} fields lucene can skip stored fields that aren't used. + * They still have to be decompressed, but they aren't turned into java objects, which saves + * a fair bit of work. If you don't need any stored fields, return + * {@link StoredFieldsSpec#NO_REQUIREMENTS}. Otherwise, return what you need. + *

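A toy version of what this method reports (the real type is {@code StoredFieldsSpec}; the shape below is an illustration):

```java
// doc_values-backed loaders need nothing from stored fields, while a _source-based
// loader must say so up front, letting lucene skip turning unused stored fields
// into java objects.
import java.util.Set;

record ToyStoredFieldsSpec(boolean needsSource, Set<String> neededStoredFields) {
    static final ToyStoredFieldsSpec NO_REQUIREMENTS = new ToyStoredFieldsSpec(false, Set.of());

    static ToyStoredFieldsSpec forDocValuesLoader() {
        return NO_REQUIREMENTS; // skip stored-field decoding entirely
    }

    static ToyStoredFieldsSpec forSourceLoader() {
        return new ToyStoredFieldsSpec(true, Set.of()); // needs _source, no other stored fields
    }
}
```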
+ * <h2>Thread safety</h2>

+ * Instances of this class must be immutable and thread safe. Instances of + * {@link ColumnAtATimeReader} and {@link RowStrideReader} are mutable and can only + * be accessed by one thread at a time, but may be passed between threads. + * See implementations of {@link Reader#canReuse} for how that's handled. "Normal" java objects + * don't need to do anything special to be kicked from thread to thread - the transfer itself + * establishes a {@code happens-before} relationship that makes everything you need visible. + * But lucene's readers aren't "normal" java objects and sometimes need to be rebuilt if we + * shift threads. + *

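A toy sketch of the {@code canReuse} pattern this describes (simplified; assume the real readers track roughly this kind of state):

```java
// Lucene-backed readers hold thread-confined state and only iterate forward, so
// consumers rebuild the reader after switching threads or when they need to
// revisit earlier documents.
final class ToyReusableReader {
    private final Thread creationThread = Thread.currentThread();
    private int lastDocRead = -1;

    boolean canReuse(int startingDocId) {
        // Same thread and moving forward: safe to keep using this reader.
        return creationThread == Thread.currentThread() && startingDocId > lastDocRead;
    }

    void read(int docId) {
        lastDocRead = docId;
    }
}
```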
*/ public interface BlockLoader { /** @@ -115,10 +248,26 @@ interface StoredFields { Map> storedFields() throws IOException; } + /** + * Build a column-at-a-time reader. May return {@code null} + * if the underlying storage needs to be loaded row-by-row. Callers should try + * this first, only falling back to {@link #rowStrideReader} if this returns + * {@code null} or if they can't load column-at-a-time themselves. + */ + @Nullable ColumnAtATimeReader columnAtATimeReader(LeafReaderContext context) throws IOException; + /** + * Build a row-by-row reader. Must never return {@code null}, + * evan if the underlying storage prefers to be loaded column-at-a-time. Some + * callers simply can't load column-at-a-time so all implementations must support + * this method. + */ RowStrideReader rowStrideReader(LeafReaderContext context) throws IOException; + /** + * What {@code stored} fields are needed by this reader. + */ StoredFieldsSpec rowStrideStoredFieldSpec(); /** @@ -540,8 +689,13 @@ Block buildExponentialHistogramBlockDirect( } /** - * Marker interface for block results. The compute engine has a fleshed - * out implementation. + * A columnar representation of homogenous data. It has a position (row) count, and + * various data retrieval methods for accessing the underlying data that is stored at a given + * position. In other words, a fancy wrapper over an array. + *

+ * This is just a marker interface for these results. The compute engine + * has fleshed out implementations. + *

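Concretely, a toy sketch of that shape (the compute engine's real Block also tracks multivalued positions, nulls, and memory accounting):

```java
// "A fancy wrapper over an array": a position (row) count plus per-position access.
final class ToyLongBlock {
    private final long[] values;

    ToyLongBlock(long[] values) {
        this.values = values;
    }

    int getPositionCount() {
        return values.length; // rows
    }

    long getLong(int position) {
        return values[position]; // value at a row
    }
}
```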
*/ interface Block extends Releasable {} diff --git a/server/src/main/java/org/elasticsearch/index/mapper/IpFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/IpFieldMapper.java index 5f5994b18acba..5d9d5d7ce7582 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/IpFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/IpFieldMapper.java @@ -33,7 +33,10 @@ import org.elasticsearch.index.fielddata.FieldDataContext; import org.elasticsearch.index.fielddata.IndexFieldData; import org.elasticsearch.index.fielddata.plain.SortedSetOrdinalsIndexFieldData; +import org.elasticsearch.index.mapper.blockloader.BlockLoaderFunctionConfig; import org.elasticsearch.index.mapper.blockloader.docvalues.BytesRefsFromOrdsBlockLoader; +import org.elasticsearch.index.mapper.blockloader.docvalues.fn.MvMaxBytesRefsFromOrdsBlockLoader; +import org.elasticsearch.index.mapper.blockloader.docvalues.fn.MvMinBytesRefsFromOrdsBlockLoader; import org.elasticsearch.index.query.SearchExecutionContext; import org.elasticsearch.script.IpFieldScript; import org.elasticsearch.script.Script; @@ -457,7 +460,15 @@ public static Query rangeQuery( @Override public BlockLoader blockLoader(BlockLoaderContext blContext) { if (hasDocValues() && (blContext.fieldExtractPreference() != FieldExtractPreference.STORED || isSyntheticSource)) { - return new BytesRefsFromOrdsBlockLoader(name()); + BlockLoaderFunctionConfig cfg = blContext.blockLoaderFunctionConfig(); + if (cfg == null) { + return new BytesRefsFromOrdsBlockLoader(name()); + } + return switch (cfg.function()) { + case MV_MAX -> new MvMaxBytesRefsFromOrdsBlockLoader(name()); + case MV_MIN -> new MvMinBytesRefsFromOrdsBlockLoader(name()); + default -> throw new UnsupportedOperationException("unknown fusion config [" + cfg.function() + "]"); + }; } if (isStored()) { @@ -475,6 +486,17 @@ public BlockLoader blockLoader(BlockLoaderContext blContext) { return new BlockSourceReader.IpsBlockLoader(sourceValueFetcher(blContext), lookup); } + @Override + public boolean supportsBlockLoaderConfig(BlockLoaderFunctionConfig config, FieldExtractPreference preference) { + if (hasDocValues() && (preference != FieldExtractPreference.STORED || isSyntheticSource)) { + return switch (config.function()) { + case MV_MAX, MV_MIN -> true; + default -> false; + }; + } + return true; + } + private BlockLoader blockLoaderFromFallbackSyntheticSource(BlockLoaderContext blContext) { var reader = new IpFallbackSyntheticSourceReader(nullValue); return new FallbackSyntheticSourceBlockLoader( diff --git a/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java index 31fd1e404d108..c84c308afdf11 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java @@ -61,6 +61,8 @@ import org.elasticsearch.index.fielddata.plain.SortedSetOrdinalsIndexFieldData; import org.elasticsearch.index.mapper.blockloader.BlockLoaderFunctionConfig; import org.elasticsearch.index.mapper.blockloader.docvalues.BytesRefsFromOrdsBlockLoader; +import org.elasticsearch.index.mapper.blockloader.docvalues.fn.MvMaxBytesRefsFromOrdsBlockLoader; +import org.elasticsearch.index.mapper.blockloader.docvalues.fn.MvMinBytesRefsFromOrdsBlockLoader; import org.elasticsearch.index.mapper.blockloader.docvalues.fn.Utf8CodePointsFromOrdsBlockLoader; import 
org.elasticsearch.index.query.AutomatonQueryWithDescription; import org.elasticsearch.index.query.SearchExecutionContext; @@ -732,10 +734,13 @@ public BlockLoader blockLoader(BlockLoaderContext blContext) { if (cfg == null) { return new BytesRefsFromOrdsBlockLoader(name()); } - if (cfg.function() == BlockLoaderFunctionConfig.Function.LENGTH) { - return new Utf8CodePointsFromOrdsBlockLoader(((BlockLoaderFunctionConfig.JustWarnings) cfg).warnings(), name()); - } - throw new UnsupportedOperationException("unknown fusion config [" + cfg.function() + "]"); + return switch (cfg.function()) { + case LENGTH -> new Utf8CodePointsFromOrdsBlockLoader(((BlockLoaderFunctionConfig.JustWarnings) cfg).warnings(), name()); + case MV_MAX -> new MvMaxBytesRefsFromOrdsBlockLoader(name()); + case MV_MIN -> new MvMinBytesRefsFromOrdsBlockLoader(name()); + default -> throw new UnsupportedOperationException("unknown fusion config [" + cfg.function() + "]"); + }; + } if (blContext.blockLoaderFunctionConfig() != null) { throw new UnsupportedOperationException("function fusing only supported for doc values"); @@ -765,7 +770,10 @@ public Builder builder(BlockFactory factory, int expectedCount) { @Override public boolean supportsBlockLoaderConfig(BlockLoaderFunctionConfig config, FieldExtractPreference preference) { if (hasDocValues() && (preference != FieldExtractPreference.STORED || isSyntheticSourceEnabled())) { - return config.function() == BlockLoaderFunctionConfig.Function.LENGTH; + return switch (config.function()) { + case LENGTH, MV_MAX, MV_MIN -> true; + default -> false; + }; } return false; } diff --git a/server/src/main/java/org/elasticsearch/index/mapper/NumberFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/NumberFieldMapper.java index 65d3390309338..57c4a8db113df 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/NumberFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/NumberFieldMapper.java @@ -45,9 +45,16 @@ import org.elasticsearch.index.fielddata.plain.SortedDoublesIndexFieldData; import org.elasticsearch.index.fielddata.plain.SortedNumericIndexFieldData; import org.elasticsearch.index.mapper.TimeSeriesParams.MetricType; +import org.elasticsearch.index.mapper.blockloader.BlockLoaderFunctionConfig; import org.elasticsearch.index.mapper.blockloader.docvalues.DoublesBlockLoader; import org.elasticsearch.index.mapper.blockloader.docvalues.IntsBlockLoader; import org.elasticsearch.index.mapper.blockloader.docvalues.LongsBlockLoader; +import org.elasticsearch.index.mapper.blockloader.docvalues.fn.MvMaxDoublesFromDocValuesBlockLoader; +import org.elasticsearch.index.mapper.blockloader.docvalues.fn.MvMaxIntsFromDocValuesBlockLoader; +import org.elasticsearch.index.mapper.blockloader.docvalues.fn.MvMaxLongsFromDocValuesBlockLoader; +import org.elasticsearch.index.mapper.blockloader.docvalues.fn.MvMinDoublesFromDocValuesBlockLoader; +import org.elasticsearch.index.mapper.blockloader.docvalues.fn.MvMinIntsFromDocValuesBlockLoader; +import org.elasticsearch.index.mapper.blockloader.docvalues.fn.MvMinLongsFromDocValuesBlockLoader; import org.elasticsearch.index.query.SearchExecutionContext; import org.elasticsearch.script.DoubleFieldScript; import org.elasticsearch.script.LongFieldScript; @@ -487,6 +494,16 @@ BlockLoader blockLoaderFromFallbackSyntheticSource( ) { return floatingPointBlockLoaderFromFallbackSyntheticSource(this, fieldName, nullValue, coerce, blContext); } + + @Override + BlockLoader blockLoaderFromDocValuesMvMin(String fieldName) { + 
return new MvMinDoublesFromDocValuesBlockLoader(fieldName, l -> HalfFloatPoint.sortableShortToHalfFloat((short) l)); + } + + @Override + BlockLoader blockLoaderFromDocValuesMvMax(String fieldName) { + return new MvMaxDoublesFromDocValuesBlockLoader(fieldName, l -> HalfFloatPoint.sortableShortToHalfFloat((short) l)); + } }, FLOAT("float", NumericType.FLOAT) { @Override @@ -685,6 +702,16 @@ BlockLoader blockLoaderFromFallbackSyntheticSource( ) { return floatingPointBlockLoaderFromFallbackSyntheticSource(this, fieldName, nullValue, coerce, blContext); } + + @Override + BlockLoader blockLoaderFromDocValuesMvMin(String fieldName) { + return new MvMinDoublesFromDocValuesBlockLoader(fieldName, l -> NumericUtils.sortableIntToFloat((int) l)); + } + + @Override + BlockLoader blockLoaderFromDocValuesMvMax(String fieldName) { + return new MvMaxDoublesFromDocValuesBlockLoader(fieldName, l -> NumericUtils.sortableIntToFloat((int) l)); + } }, DOUBLE("double", NumericType.DOUBLE) { @Override @@ -849,6 +876,16 @@ BlockLoader blockLoaderFromFallbackSyntheticSource( ) { return floatingPointBlockLoaderFromFallbackSyntheticSource(this, fieldName, nullValue, coerce, blContext); } + + @Override + BlockLoader blockLoaderFromDocValuesMvMin(String fieldName) { + return new MvMinDoublesFromDocValuesBlockLoader(fieldName, NumericUtils::sortableLongToDouble); + } + + @Override + BlockLoader blockLoaderFromDocValuesMvMax(String fieldName) { + return new MvMaxDoublesFromDocValuesBlockLoader(fieldName, NumericUtils::sortableLongToDouble); + } }, BYTE("byte", NumericType.BYTE) { @Override @@ -978,6 +1015,16 @@ BlockLoader blockLoaderFromFallbackSyntheticSource( return integerBlockLoaderFromFallbackSyntheticSource(this, fieldName, nullValue, coerce, blContext); } + @Override + BlockLoader blockLoaderFromDocValuesMvMin(String fieldName) { + return new MvMinIntsFromDocValuesBlockLoader(fieldName); + } + + @Override + BlockLoader blockLoaderFromDocValuesMvMax(String fieldName) { + return new MvMaxIntsFromDocValuesBlockLoader(fieldName); + } + private boolean isOutOfRange(Object value) { double doubleValue = objectToDouble(value); return doubleValue < Byte.MIN_VALUE || doubleValue > Byte.MAX_VALUE; @@ -1106,6 +1153,16 @@ BlockLoader blockLoaderFromFallbackSyntheticSource( return integerBlockLoaderFromFallbackSyntheticSource(this, fieldName, nullValue, coerce, blContext); } + @Override + BlockLoader blockLoaderFromDocValuesMvMin(String fieldName) { + return new MvMinIntsFromDocValuesBlockLoader(fieldName); + } + + @Override + BlockLoader blockLoaderFromDocValuesMvMax(String fieldName) { + return new MvMaxIntsFromDocValuesBlockLoader(fieldName); + } + private boolean isOutOfRange(Object value) { double doubleValue = objectToDouble(value); return doubleValue < Short.MIN_VALUE || doubleValue > Short.MAX_VALUE; @@ -1311,6 +1368,16 @@ BlockLoader blockLoaderFromFallbackSyntheticSource( ) { return integerBlockLoaderFromFallbackSyntheticSource(this, fieldName, nullValue, coerce, blContext); } + + @Override + BlockLoader blockLoaderFromDocValuesMvMin(String fieldName) { + return new MvMinIntsFromDocValuesBlockLoader(fieldName); + } + + @Override + BlockLoader blockLoaderFromDocValuesMvMax(String fieldName) { + return new MvMaxIntsFromDocValuesBlockLoader(fieldName); + } }, LONG("long", NumericType.LONG) { @Override @@ -1497,6 +1564,16 @@ public Builder builder(BlockFactory factory, int expectedCount) { }; } + @Override + BlockLoader blockLoaderFromDocValuesMvMin(String fieldName) { + return new 
MvMinLongsFromDocValuesBlockLoader(fieldName); + } + + @Override + BlockLoader blockLoaderFromDocValuesMvMax(String fieldName) { + return new MvMaxLongsFromDocValuesBlockLoader(fieldName); + } + private boolean isOutOfRange(Object value) { if (value instanceof Long) { return false; @@ -1766,6 +1843,10 @@ abstract BlockLoader blockLoaderFromFallbackSyntheticSource( MappedFieldType.BlockLoaderContext blContext ); + abstract BlockLoader blockLoaderFromDocValuesMvMin(String fieldName); + + abstract BlockLoader blockLoaderFromDocValuesMvMax(String fieldName); + // All values that fit into integer are returned as integers private static BlockLoader integerBlockLoaderFromFallbackSyntheticSource( NumberType type, @@ -2021,7 +2102,15 @@ public Function pointReaderIfPossible() { @Override public BlockLoader blockLoader(BlockLoaderContext blContext) { if (hasDocValues() && (blContext.fieldExtractPreference() != FieldExtractPreference.STORED || isSyntheticSource)) { - return type.blockLoaderFromDocValues(name()); + BlockLoaderFunctionConfig cfg = blContext.blockLoaderFunctionConfig(); + if (cfg == null) { + return type.blockLoaderFromDocValues(name()); + } + return switch (cfg.function()) { + case MV_MAX -> type.blockLoaderFromDocValuesMvMax(name()); + case MV_MIN -> type.blockLoaderFromDocValuesMvMin(name()); + default -> throw new UnsupportedOperationException("unknown fusion config [" + cfg.function() + "]"); + }; } // Multi fields don't have fallback synthetic source. @@ -2036,6 +2125,17 @@ public BlockLoader blockLoader(BlockLoaderContext blContext) { return type.blockLoaderFromSource(sourceValueFetcher(blContext.sourcePaths(name()), blContext.indexSettings()), lookup); } + @Override + public boolean supportsBlockLoaderConfig(BlockLoaderFunctionConfig config, FieldExtractPreference preference) { + if (hasDocValues() && (preference != FieldExtractPreference.STORED || isSyntheticSource)) { + return switch (config.function()) { + case MV_MAX, MV_MIN -> true; + default -> false; + }; + } + return false; + } + @Override public IndexFieldData.Builder fielddataBuilder(FieldDataContext fieldDataContext) { FielddataOperation operation = fieldDataContext.fielddataOperation(); diff --git a/server/src/main/java/org/elasticsearch/index/mapper/blockloader/BlockLoaderFunctionConfig.java b/server/src/main/java/org/elasticsearch/index/mapper/blockloader/BlockLoaderFunctionConfig.java index fe58451988779..4ac18ab0674ce 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/blockloader/BlockLoaderFunctionConfig.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/blockloader/BlockLoaderFunctionConfig.java @@ -26,6 +26,8 @@ public interface BlockLoaderFunctionConfig { */ Function function(); + record JustFunction(Function function) implements BlockLoaderFunctionConfig {} + record JustWarnings(Function function, Warnings warnings) implements BlockLoaderFunctionConfig { // Consider just the function, as warnings will have Source that differ for different invocations of the same function @@ -43,6 +45,8 @@ public int hashCode() { } enum Function { + MV_MAX, + MV_MIN, LENGTH, V_COSINE, V_DOT_PRODUCT, diff --git a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/AggregatorMode.java b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/AggregatorMode.java index 706dcd02ed1ce..67e8e2819f87d 100644 --- a/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/AggregatorMode.java +++ 
b/x-pack/plugin/esql/compute/src/main/java/org/elasticsearch/compute/aggregation/AggregatorMode.java @@ -7,6 +7,32 @@ package org.elasticsearch.compute.aggregation; +import org.elasticsearch.compute.data.Block; +import org.elasticsearch.compute.operator.topn.TopNOperator; + +/** + * "Modes" for running an aggregate function. + *

+ * Aggregations running on a single "stream" of {@link Block}s should run in + * {@link #SINGLE} mode. This works for aggs that come after a + * {@link TopNOperator} or another agg. + *

+ *

+ * But all other aggregations run distributed. On many threads on each data node + * we run in {@link #INITIAL} mode to consume raw data and output just enough to + * finish the job later. All threads on a node dump their data into the same agg + * running in {@link #INTERMEDIATE} mode to perform the "node reduction". Then, on the + * coordinating node, the outputs of the "node reduction" go into the agg in + * {@link #FINAL} mode. + *

+ *

+ * Put another way, all data must flow through aggregations in one of these two sequences: + * {@link #INITIAL} then {@link #INTERMEDIATE} then {@link #FINAL}, or just {@link #SINGLE}. + *

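To make the lifecycle concrete, here's a toy average (plain Java, not the engine's API) walking the distributed sequence:

```java
// INITIAL turns raw values into partial {sum, count} states, INTERMEDIATE merges
// partials during the "node reduction", and FINAL turns the merged state into the
// answer on the coordinating node. SINGLE would do all of this in one operator.
import java.util.List;

record SumAndCount(long sum, long count) {
    static SumAndCount initial(List<Long> rawValues) {
        long sum = 0;
        for (long v : rawValues) {
            sum += v;
        }
        return new SumAndCount(sum, rawValues.size());
    }

    static SumAndCount intermediate(List<SumAndCount> partials) {
        long sum = 0;
        long count = 0;
        for (SumAndCount partial : partials) {
            sum += partial.sum();
            count += partial.count();
        }
        return new SumAndCount(sum, count);
    }

    double finish() {
        return (double) sum / count;
    }
}
```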
+ * + */ public enum AggregatorMode { /** diff --git a/x-pack/plugin/esql/qa/server/single-node/src/javaRestTest/java/org/elasticsearch/xpack/esql/qa/single_node/PushExpressionToLoadIT.java b/x-pack/plugin/esql/qa/server/single-node/src/javaRestTest/java/org/elasticsearch/xpack/esql/qa/single_node/PushExpressionToLoadIT.java index 05e22cbcaa124..39a90f42c5785 100644 --- a/x-pack/plugin/esql/qa/server/single-node/src/javaRestTest/java/org/elasticsearch/xpack/esql/qa/single_node/PushExpressionToLoadIT.java +++ b/x-pack/plugin/esql/qa/server/single-node/src/javaRestTest/java/org/elasticsearch/xpack/esql/qa/single_node/PushExpressionToLoadIT.java @@ -45,6 +45,7 @@ import static org.elasticsearch.xpack.esql.qa.single_node.RestEsqlIT.commonProfile; import static org.elasticsearch.xpack.esql.qa.single_node.RestEsqlIT.fixTypesOnProfile; import static org.hamcrest.Matchers.any; +import static org.hamcrest.Matchers.closeTo; import static org.hamcrest.Matchers.instanceOf; import static org.hamcrest.Matchers.startsWith; @@ -83,6 +84,11 @@ public void testLengthToKeyword() throws IOException { ); } + /** + * We don't support fusing {@code LENGTH} into loading {@code wildcard} fields because + * we haven't written support for fusing functions to loading from its source format. + * We haven't done that because {@code wildcard} fields aren't super common. + */ public void testLengthNotPushedToWildcard() throws IOException { String value = "v".repeat(between(0, 256)); test( @@ -94,6 +100,12 @@ public void testLengthNotPushedToWildcard() throws IOException { ); } + /** + * We don't support fusing {@code LENGTH} into loading {@code text} fields because + * we haven't written support for fusing functions to loading from {@code _source}. + * Usually folks that want to go superfast will use doc values. But those aren't + * even available for {@code text} fields. + */ public void testLengthNotPushedToText() throws IOException { String value = "v".repeat(between(0, 256)); test( @@ -107,6 +119,222 @@ public void testLengthNotPushedToText() throws IOException { ); } + public void testMvMinToKeyword() throws IOException { + String min = "a".repeat(between(1, 256)); + String max = "b".repeat(between(1, 256)); + test( + justType("keyword"), + b -> b.startArray("test").value(min).value(max).endArray(), + "| EVAL test = MV_MIN(test)", + matchesList().item(min), + matchesMap().entry("test:column_at_a_time:MvMinBytesRefsFromOrds.SortedSet", 1) + ); + } + + public void testMvMinToIp() throws IOException { + String min = "192.168.0." + between(0, 255); + String max = "192.168.3." 
+ between(0, 255); + test( + justType("ip"), + b -> b.startArray("test").value(min).value(max).endArray(), + "| EVAL test = MV_MIN(test)", + matchesList().item(min), + matchesMap().entry("test:column_at_a_time:MvMinBytesRefsFromOrds.SortedSet", 1) + ); + } + + public void testMvMinToHalfFloat() throws IOException { + double min = randomDouble(); + double max = 1 + randomDouble(); + test( + justType("half_float"), + b -> b.startArray("test").value(min).value(max).endArray(), + "| EVAL test = MV_MIN(test)", + matchesList().item(closeTo(min, .1)), + matchesMap().entry("test:column_at_a_time:MvMinDoublesFromDocValues.Sorted", 1) + ); + } + + public void testMvMinToFloat() throws IOException { + double min = randomDouble(); + double max = 1 + randomDouble(); + test( + justType("float"), + b -> b.startArray("test").value(min).value(max).endArray(), + "| EVAL test = MV_MIN(test)", + matchesList().item(closeTo(min, .1)), + matchesMap().entry("test:column_at_a_time:MvMinDoublesFromDocValues.Sorted", 1) + ); + } + + public void testMvMinToDouble() throws IOException { + double min = randomDouble(); + double max = 1 + randomDouble(); + test( + justType("double"), + b -> b.startArray("test").value(min).value(max).endArray(), + "| EVAL test = MV_MIN(test)", + matchesList().item(min), + matchesMap().entry("test:column_at_a_time:MvMinDoublesFromDocValues.Sorted", 1) + ); + } + + public void testMvMinToByte() throws IOException { + int min = between(Byte.MIN_VALUE, Byte.MAX_VALUE - 10); + int max = between(min + 1, Byte.MAX_VALUE); + test( + justType("byte"), + b -> b.startArray("test").value(min).value(max).endArray(), + "| EVAL test = MV_MIN(test)", + matchesList().item(min), + matchesMap().entry("test:column_at_a_time:MvMinIntsFromDocValues.Sorted", 1) + ); + } + + public void testMvMinToShort() throws IOException { + int min = between(Short.MIN_VALUE, Short.MAX_VALUE - 10); + int max = between(min + 1, Short.MAX_VALUE); + test( + justType("short"), + b -> b.startArray("test").value(min).value(max).endArray(), + "| EVAL test = MV_MIN(test)", + matchesList().item(min), + matchesMap().entry("test:column_at_a_time:MvMinIntsFromDocValues.Sorted", 1) + ); + } + + public void testMvMinToInt() throws IOException { + int min = between(Integer.MIN_VALUE, Integer.MAX_VALUE - 10); + int max = between(min + 1, Integer.MAX_VALUE); + test( + justType("integer"), + b -> b.startArray("test").value(min).value(max).endArray(), + "| EVAL test = MV_MIN(test)", + matchesList().item(min), + matchesMap().entry("test:column_at_a_time:MvMinIntsFromDocValues.Sorted", 1) + ); + } + + public void testMvMinToLong() throws IOException { + long min = randomLongBetween(Long.MIN_VALUE, Long.MAX_VALUE - 10); + long max = randomLongBetween(min + 1, Long.MAX_VALUE); + test( + justType("long"), + b -> b.startArray("test").value(min).value(max).endArray(), + "| EVAL test = MV_MIN(test)", + matchesList().item(min), + matchesMap().entry("test:column_at_a_time:MvMinLongsFromDocValues.Sorted", 1) + ); + } + + public void testMvMaxToKeyword() throws IOException { + String min = "a".repeat(between(1, 256)); + String max = "b".repeat(between(1, 256)); + test( + justType("keyword"), + b -> b.startArray("test").value(min).value(max).endArray(), + "| EVAL test = MV_MAX(test)", + matchesList().item(max), + matchesMap().entry("test:column_at_a_time:MvMaxBytesRefsFromOrds.SortedSet", 1) + ); + } + + public void testMvMaxToIp() throws IOException { + String min = "192.168.0." + between(0, 255); + String max = "192.168.3." 
+ between(0, 255); + test( + justType("ip"), + b -> b.startArray("test").value(min).value(max).endArray(), + "| EVAL test = MV_MAX(test)", + matchesList().item(max), + matchesMap().entry("test:column_at_a_time:MvMaxBytesRefsFromOrds.SortedSet", 1) + ); + } + + public void testMvMaxToByte() throws IOException { + int min = between(Byte.MIN_VALUE, Byte.MAX_VALUE - 10); + int max = between(min + 1, Byte.MAX_VALUE); + test( + justType("byte"), + b -> b.startArray("test").value(min).value(max).endArray(), + "| EVAL test = MV_MAX(test)", + matchesList().item(max), + matchesMap().entry("test:column_at_a_time:MvMaxIntsFromDocValues.Sorted", 1) + ); + } + + public void testMvMaxToShort() throws IOException { + int min = between(Short.MIN_VALUE, Short.MAX_VALUE - 10); + int max = between(min + 1, Short.MAX_VALUE); + test( + justType("short"), + b -> b.startArray("test").value(min).value(max).endArray(), + "| EVAL test = MV_MAX(test)", + matchesList().item(max), + matchesMap().entry("test:column_at_a_time:MvMaxIntsFromDocValues.Sorted", 1) + ); + } + + public void testMvMaxToInt() throws IOException { + int min = between(Integer.MIN_VALUE, Integer.MAX_VALUE - 10); + int max = between(min + 1, Integer.MAX_VALUE); + test( + justType("integer"), + b -> b.startArray("test").value(min).value(max).endArray(), + "| EVAL test = MV_MAX(test)", + matchesList().item(max), + matchesMap().entry("test:column_at_a_time:MvMaxIntsFromDocValues.Sorted", 1) + ); + } + + public void testMvMaxToLong() throws IOException { + long min = randomLongBetween(Long.MIN_VALUE, Long.MAX_VALUE - 10); + long max = randomLongBetween(min + 1, Long.MAX_VALUE); + test( + justType("long"), + b -> b.startArray("test").value(min).value(max).endArray(), + "| EVAL test = MV_MAX(test)", + matchesList().item(max), + matchesMap().entry("test:column_at_a_time:MvMaxLongsFromDocValues.Sorted", 1) + ); + } + + public void testMvMaxToHalfFloat() throws IOException { + double min = randomDouble(); + double max = 1 + randomDouble(); + test( + justType("half_float"), + b -> b.startArray("test").value(min).value(max).endArray(), + "| EVAL test = MV_MAX(test)", + matchesList().item(closeTo(max, .1)), + matchesMap().entry("test:column_at_a_time:MvMaxDoublesFromDocValues.Sorted", 1) + ); + } + + public void testMvMaxToFloat() throws IOException { + double min = randomDouble(); + double max = 1 + randomDouble(); + test( + justType("float"), + b -> b.startArray("test").value(min).value(max).endArray(), + "| EVAL test = MV_MAX(test)", + matchesList().item(closeTo(max, .1)), + matchesMap().entry("test:column_at_a_time:MvMaxDoublesFromDocValues.Sorted", 1) + ); + } + + public void testMvMaxToDouble() throws IOException { + double min = randomDouble(); + double max = 1 + randomDouble(); + test( + justType("double"), + b -> b.startArray("test").value(min).value(max).endArray(), + "| EVAL test = MV_MAX(test)", + matchesList().item(max), + matchesMap().entry("test:column_at_a_time:MvMaxDoublesFromDocValues.Sorted", 1) + ); + } + public void testVCosine() throws IOException { test( justType("dense_vector"), diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/floats.csv-spec b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/floats.csv-spec index 1128cbc6f4106..b10fcff51fa84 100644 --- a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/floats.csv-spec +++ b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/floats.csv-spec @@ -692,3 +692,142 @@ null null null ; + +rowMvMin + ROW d=[1.1, 2.2]::DOUBLE +| EVAL d=MV_MIN(d) +; + +d:double +1.1 
+; + +rowMvMax + ROW d=[1.1, 2.2]::DOUBLE +| EVAL d=MV_MAX(d) +; + +d:double +2.2 +; + +groupMv + FROM employees +| STATS MIN(emp_no) BY salary_change +| SORT salary_change DESC +| LIMIT 5 +; + +MIN(emp_no):integer | salary_change:double +10009 | null +10040 | 14.74 +10003 | 14.68 +10023 | 14.63 +10065 | 14.44 +; + +groupMvMin + FROM employees +| STATS MIN(emp_no) BY salary_change=MV_MIN(salary_change) +| SORT salary_change DESC +| LIMIT 5 +; + +MIN(emp_no):integer | salary_change:double +10009 | null +10086 | 13.61 +10003 | 12.82 +10015 | 12.4 +10050 | 8.7 +; + +groupMvMax + FROM employees +| STATS MIN(emp_no) BY salary_change=MV_MAX(salary_change) +| SORT salary_change DESC +| LIMIT 5 +; + +MIN(emp_no):integer | salary_change:double +10009 | null +10040 | 14.74 +10003 | 14.68 +10023 | 14.63 +10065 | 14.44 +; + +valuesMvMinDouble + FROM employees +| WHERE emp_no <= 10009 +| EVAL first_letter = SUBSTRING(first_name, 0, 1) +| STATS MV_MIN(VALUES(salary_change)) BY first_letter +| SORT first_letter ASC +; + +MV_MIN(VALUES(salary_change)):double | first_letter:keyword +-3.9 | A +-7.23 | B +-0.35 | C +1.19 | G +-2.14 | K +12.82 | P +-2.92 | S +-7.06 | T +; + +valuesMvMaxDouble + FROM employees +| WHERE emp_no <= 10009 +| EVAL first_letter = SUBSTRING(first_name, 0, 1) +| STATS MV_MAX(VALUES(salary_change)) BY first_letter +| SORT first_letter ASC +; + +MV_MAX(VALUES(salary_change)):double | first_letter:keyword +-3.9 | A +11.17 | B +13.48 | C +1.19 | G +13.07 | K +14.68 | P +12.68 | S +1.99 | T +; + +valuesMvMinFloat + FROM employees +| WHERE emp_no <= 10009 +| EVAL first_letter = SUBSTRING(first_name, 0, 1) +| STATS MV_MIN(VALUES(height.float)) BY first_letter +| SORT first_letter ASC +; + +MV_MIN(VALUES(height.float)):double | first_letter:keyword +1.559999942779541 | A +2.0799999237060547 | B +1.7799999713897705 | C +2.0299999713897705 | G +2.049999952316284 | K +1.8300000429153442 | P +1.850000023841858 | S +1.7000000476837158 | T +; + +valuesMvMaxFloat + FROM employees +| WHERE emp_no <= 10009 +| EVAL first_letter = SUBSTRING(first_name, 0, 1) +| STATS MV_MAX(VALUES(height.float)) BY first_letter +| SORT first_letter ASC +; + +MV_MAX(VALUES(height.float)):double | first_letter:keyword +1.559999942779541 | A +2.0799999237060547 | B +1.7799999713897705 | C +2.0299999713897705 | G +2.049999952316284 | K +1.8300000429153442 | P +2.0999999046325684 | S +1.7000000476837158 | T +; diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/ints.csv-spec b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/ints.csv-spec index f4b6d41a7a027..8a229f2f841b3 100644 --- a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/ints.csv-spec +++ b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/ints.csv-spec @@ -1003,3 +1003,282 @@ emp_no:integer |salary_change.int:integer 10079 | 7 10086 | 13 ; + +rowMvMinLong + ROW l=[1, 2]::LONG +| EVAL l=MV_MIN(l) +; + +l:long +1 +; + +rowMvMaxLong + ROW l=[1, 2]::LONG +| EVAL l=MV_MAX(l) +; + +l:long +2 +; + +rowMvMinInt + ROW i=[1, 2]::INT +| EVAL i=MV_MIN(i) +; + +i:integer +1 +; + +rowMvMaxInt + ROW i=[1, 2]::INT +| EVAL i=MV_MAX(i) +; + +i:integer +2 +; + +groupMvLong + FROM employees +| STATS MIN(emp_no) BY salary_change=salary_change.long +| SORT salary_change DESC +| LIMIT 5 +; + +MIN(emp_no):integer | salary_change:long +10009 | null +10003 | 14 +10004 | 13 +10003 | 12 +10002 | 11 +; + +groupMvMinLong + FROM employees +| STATS MIN(emp_no) BY salary_change=MV_MIN(salary_change.long) +| SORT salary_change DESC +| LIMIT 5 +; + 
+MIN(emp_no):integer | salary_change:long +10009 | null +10086 | 13 +10003 | 12 +10044 | 8 +10079 | 7 +; + +groupMvMaxLong + FROM employees +| STATS MIN(emp_no) BY salary_change=MV_MAX(salary_change.long) +| SORT salary_change DESC +| LIMIT 5 +; + +MIN(emp_no):integer | salary_change:long +10009 | null +10003 | 14 +10004 | 13 +10008 | 12 +10002 | 11 +; + +groupMvInt + FROM employees +| STATS MIN(emp_no) BY salary_change=salary_change.int +| SORT salary_change DESC +| LIMIT 5 +; + +MIN(emp_no):integer | salary_change:integer +10009 | null +10003 | 14 +10004 | 13 +10003 | 12 +10002 | 11 +; + +groupMvMinInt + FROM employees +| STATS MIN(emp_no) BY salary_change=MV_MIN(salary_change.int) +| SORT salary_change DESC +| LIMIT 5 +; + +MIN(emp_no):integer | salary_change:integer +10009 | null +10086 | 13 +10003 | 12 +10044 | 8 +10079 | 7 +; + +groupMvMaxInt + FROM employees +| STATS MIN(emp_no) BY salary_change=MV_MAX(salary_change.int) +| SORT salary_change DESC +| LIMIT 5 +; + +MIN(emp_no):integer | salary_change:integer +10009 | null +10003 | 14 +10004 | 13 +10008 | 12 +10002 | 11 +; + +valuesMvMinLong + FROM employees +| WHERE emp_no <= 10009 +| EVAL first_letter = SUBSTRING(first_name, 0, 1) +| STATS MV_MIN(VALUES(languages.long)) BY first_letter +| SORT first_letter ASC +; + +MV_MIN(VALUES(languages.long)):long | first_letter:keyword +3 | A +5 | B +5 | C +2 | G +1 | K +4 | P +1 | S +4 | T +; + +valuesMvMaxLong + FROM employees +| WHERE emp_no <= 10009 +| EVAL first_letter = SUBSTRING(first_name, 0, 1) +| STATS MV_MAX(VALUES(languages.long)) BY first_letter +| SORT first_letter ASC +; + +MV_MAX(VALUES(languages.long)):long | first_letter:keyword +3 | A +5 | B +5 | C +2 | G +1 | K +4 | P +2 | S +4 | T +; + +valuesMvMinInt + FROM employees +| WHERE emp_no <= 10009 +| EVAL first_letter = SUBSTRING(first_name, 0, 1) +| STATS MV_MIN(VALUES(languages)) BY first_letter +| SORT first_letter ASC +; + +MV_MIN(VALUES(languages)):integer | first_letter:keyword +3 | A +5 | B +5 | C +2 | G +1 | K +4 | P +1 | S +4 | T +; + +valuesMvMaxInt + FROM employees +| WHERE emp_no <= 10009 +| EVAL first_letter = SUBSTRING(first_name, 0, 1) +| STATS MV_MAX(VALUES(languages)) BY first_letter +| SORT first_letter ASC +; + +MV_MAX(VALUES(languages)):integer | first_letter:keyword +3 | A +5 | B +5 | C +2 | G +1 | K +4 | P +2 | S +4 | T +; + +valuesMvMinShort + FROM employees +| WHERE emp_no <= 10009 +| EVAL first_letter = SUBSTRING(first_name, 0, 1) +| STATS MV_MIN(VALUES(languages.short)) BY first_letter +| SORT first_letter ASC +; + +MV_MIN(VALUES(languages.short)):integer | first_letter:keyword +3 | A +5 | B +5 | C +2 | G +1 | K +4 | P +1 | S +4 | T +; + +valuesMvMaxShort + FROM employees +| WHERE emp_no <= 10009 +| EVAL first_letter = SUBSTRING(first_name, 0, 1) +| STATS MV_MAX(VALUES(languages.short)) BY first_letter +| SORT first_letter ASC +; + +MV_MAX(VALUES(languages.short)):integer | first_letter:keyword +3 | A +5 | B +5 | C +2 | G +1 | K +4 | P +2 | S +4 | T +; + + +valuesMvMinByte + FROM employees +| WHERE emp_no <= 10009 +| EVAL first_letter = SUBSTRING(first_name, 0, 1) +| STATS MV_MIN(VALUES(languages.byte)) BY first_letter +| SORT first_letter ASC +; + +MV_MIN(VALUES(languages.byte)):integer | first_letter:keyword +3 | A +5 | B +5 | C +2 | G +1 | K +4 | P +1 | S +4 | T +; + +valuesMvMaxByte + FROM employees +| WHERE emp_no <= 10009 +| EVAL first_letter = SUBSTRING(first_name, 0, 1) +| STATS MV_MAX(VALUES(languages.byte)) BY first_letter +| SORT first_letter ASC +; + +MV_MAX(VALUES(languages.byte)):integer | 
first_letter:keyword +3 | A +5 | B +5 | C +2 | G +1 | K +4 | P +2 | S +4 | T +; diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/ip.csv-spec b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/ip.csv-spec index 6f83b54606e05..615c90f2e2bb1 100644 --- a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/ip.csv-spec +++ b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/ip.csv-spec @@ -572,6 +572,39 @@ required_capability: agg_values fe80::cae2:65ff:fece:feb9 | gamma ; +valuesGroupedMvMin +required_capability: agg_values + + FROM hosts +| EVAL host=SUBSTRING(host, 0, 1) +| STATS ip0=MV_MIN(VALUES(ip0)) BY host +| SORT host +; + + ip0:ip | host:keyword + ::1 | a + 127.0.0.1 | b +fe80::cae2:65ff:fece:feb9 | e +fe80::cae2:65ff:fece:feb9 | g +; + +valuesGroupedMvMax +required_capability: agg_values + + FROM hosts +| EVAL host=SUBSTRING(host, 0, 1) +| STATS ip0=MV_MAX(VALUES(ip0)) BY host +| SORT host +; + + ip0:ip | host:keyword + 127.0.0.1 | a + 127.0.0.1 | b +fe82::cae2:65ff:fece:fec0 | e +fe80::cae2:65ff:fece:feb9 | g + +; + implictCastingEqual required_capability: string_literal_auto_casting_extended from hosts | where mv_first(ip0) == "127.0.0.1" | keep host, ip0 | sort host; @@ -823,3 +856,67 @@ warning:Line 2:20: java.lang.IllegalArgumentException: 'invalid_network' is not ip0:ip |ip1:ip |direction:keyword 127.0.0.1 |8.8.8.8 |null ; + +mvMinRow + ROW ip=["192.168.0.1", "10.10.0.1"]::IP +| EVAL ip=MV_MIN(ip) +; + + ip:ip +10.10.0.1 +; + +mvMaxRow + ROW ip=["192.168.0.1", "10.10.0.1"]::IP +| EVAL ip=MV_MAX(ip) +; + + ip:ip +192.168.0.1 +; + +groupMv + FROM hosts +| STATS COUNT(*) BY ip0=ip0 +| SORT ip0 ASC +; + +COUNT(*):long | ip0:ip +1 | ::1 +4 | 127.0.0.1 +3 | fe80::cae2:65ff:fece:feb9 +1 | fe80::cae2:65ff:fece:fec0 +1 | fe80::cae2:65ff:fece:fec1 +1 | fe81::cae2:65ff:fece:feb9 +1 | fe82::cae2:65ff:fece:fec0 +1 | null +; + +groupMvMin + FROM hosts +| STATS COUNT(*) BY ip0=MV_MIN(ip0) +| SORT ip0 ASC +; + +COUNT(*):long | ip0:ip +1 | ::1 +4 | 127.0.0.1 +3 | fe80::cae2:65ff:fece:feb9 +1 | fe81::cae2:65ff:fece:feb9 +1 | null +; + +groupMvMax + FROM hosts +| STATS COUNT(*) BY ip0=MV_MAX(ip0) +| SORT ip0 ASC +; + +COUNT(*):long | ip0:ip +1 | ::1 +4 | 127.0.0.1 +2 | fe80::cae2:65ff:fece:feb9 +1 | fe80::cae2:65ff:fece:fec1 +1 | fe82::cae2:65ff:fece:fec0 +1 | null +; diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/string.csv-spec b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/string.csv-spec index b0bd91373e002..9c7765584d9c9 100644 --- a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/string.csv-spec +++ b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/string.csv-spec @@ -672,6 +672,34 @@ min(salary):integer | max(salary):integer | job_positions:keyword 25324 | 58715 | Head Human Resources ; +groupByMvMin + FROM employees +| STATS MIN(salary), MAX(salary) BY job_positions=MV_MIN(job_positions) +| SORT job_positions +| LIMIT 5; + +MIN(salary):integer | MAX(salary):integer | job_positions:keyword +25976 | 74970 | Accountant +28941 | 69904 | Architect +29175 | 50249 | Business Analyst +25945 | 74999 | Data Scientist +25324 | 50064 | Head Human Resources +; + +groupByMvMax + FROM employees +| STATS MIN(salary), MAX(salary) BY job_positions=MV_MAX(job_positions) +| SORT job_positions +| LIMIT 5; + +MIN(salary):integer | MAX(salary):integer | job_positions:keyword +47411 | 47411 | Accountant +28941 | 28941 | Architect +39110 | 48942 | Head Human Resources +26436 | 50128 | Internship +25976 | 64675 | Junior Developer +; + 
convertFromString from employees | sort emp_no | eval positions = to_string(job_positions) | keep emp_no, positions, job_positions | limit 5; diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/aggregate/AggregateFunction.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/aggregate/AggregateFunction.java index 097bbaae99aa7..7e04054d95ef0 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/aggregate/AggregateFunction.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/aggregate/AggregateFunction.java @@ -41,6 +41,23 @@ * - Aggregate functions can have an optional filter and window, which default to {@code Literal.TRUE} and {@code NO_WINDOW}. * - The aggregation function should be composed as: source, field, filter, window, parameters. * Extra parameters should go to the parameters after the filter and window. + *

+ * These function appear only in special places in the language that expect to take many inputs + * and produce one output per group key: + *

+ * + *

+ * They always process many input rows to produce their values. If they are built + * without a {@code BY} they produce a single value as output. If they are built + * with a {@code BY} they produce one value per group key as output. + *

+ *

+ * See {@link org.elasticsearch.compute.aggregation.AggregatorMode} for important + * information about their execution lifecycle. + *

*/ public abstract class AggregateFunction extends Function implements PostAnalysisPlanVerificationAware { public static final Literal NO_WINDOW = Literal.timeDuration(Source.EMPTY, Duration.ZERO); diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/blockloader/BlockLoaderExpression.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/blockloader/BlockLoaderExpression.java index 9325c3fec9032..53d5039da5e1e 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/blockloader/BlockLoaderExpression.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/blockloader/BlockLoaderExpression.java @@ -9,21 +9,27 @@ import org.elasticsearch.compute.data.Block; import org.elasticsearch.core.Nullable; +import org.elasticsearch.index.mapper.BlockLoader; +import org.elasticsearch.index.mapper.MappedFieldType; import org.elasticsearch.index.mapper.blockloader.BlockLoaderFunctionConfig; +import org.elasticsearch.index.mapper.blockloader.docvalues.fn.MvMaxBytesRefsFromOrdsBlockLoader; +import org.elasticsearch.index.mapper.blockloader.docvalues.fn.MvMaxLongsFromDocValuesBlockLoader; +import org.elasticsearch.index.mapper.blockloader.docvalues.fn.Utf8CodePointsFromOrdsBlockLoader; import org.elasticsearch.xpack.esql.core.expression.Expression; import org.elasticsearch.xpack.esql.core.expression.FieldAttribute; +import org.elasticsearch.xpack.esql.expression.function.scalar.EsqlScalarFunction; import org.elasticsearch.xpack.esql.stats.SearchStats; /** * {@link Expression} that can be "pushed" into value loading. Most of the time * we load values into {@link Block}s and then run the expressions on them, but * sometimes it's worth short-circuiting this process and running the expression - * in the tight loop we use for loading: + * in the tight loop we use for loading values. * + *

+ * See the docs for {@link EsqlScalarFunction} for how this optimization fits in with + * all the other optimizations we've implemented. + *

+ * <h2>How to implement</h2>
+ * <ol>
+ *     <li>Implement some block loaders</li>
+ *     <li>Unit test the block loaders</li>
+ *     <li>Plug the {@link BlockLoader} into the {@link MappedFieldType#blockLoader field mapper}</li>
+ *     <li>Implement this interface</li>
+ *     <li>Add to {@code PushExpressionToLoadIT}</li>
+ *     <li>Maybe add to {@code csv-spec} tests</li>
+ *     <li>Get some performance numbers and open a PR</li>
+ * </ol>

+ * <h3>Implement some block loaders</h3>

+ * Implement a {@link BlockLoader} for each fused code path. There's + * going to be a {@linkplain BlockLoader} per + * {@code function x field type x storage mechanism} combination. Examples: + *

+ * <ul>
+ *     <li>{@link Utf8CodePointsFromOrdsBlockLoader} is for {@code LENGTH x keyword x doc_values}.</li>
+ *     <li>{@link MvMaxLongsFromDocValuesBlockLoader} is for {@code MV_MAX x long x doc_values}.</li>
+ *     <li>{@link MvMaxBytesRefsFromOrdsBlockLoader} is for {@code MV_MAX x (keyword|ip) x doc_values}.</li>
+ * </ul>

+ * If you wanted to push all loads for a function applied + * to a field type, you'd need to optimize all paths, which could include: + *

+ * <ul>
+ *     <li>{@code doc_values}</li>
+ *     <li>{@code stored}</li>
+ *     <li>{@code _source}</li>
+ *     <li>Funky synthetic {@code _source} cases</li>
+ *     <li>Using the search index</li>
+ * </ul>

+ * Unless you have a good reason to do otherwise, it's generally fine to start with + * {@code doc_values}. And it might be fine to only implement this fusion + * for {@code doc_values}. Usually, loading {@code stored} fields + * and loading from {@code _source} is so slow that this optimization won't buy you + * much speed proportionally. But this is only a rule of thumb. + * The first extraction push down we implemented violates the rule! It was directly + * to the search index for vector fields. + *

+ *

+ * Note: The {@link Object#toString}s are important in these classes. We expose them + * over the {@code profile} API and use them for tests later on. + *

+ * <h3>Unit test the block loaders</h3>

+ * Build a randomized unit test that + *

+ * <ol>
+ *     <li>loads random data</li>
+ *     <li>loads using both your new {@link BlockLoader} and the non-fused loader</li>
+ *     <li>compares the results</li>
+ * </ol>

+ * See the test for {@link Utf8CodePointsFromOrdsBlockLoader} for an example. These tests + * are usually quite parameterized to make sure we cover many index configurations and + * value shapes. + *

+ * These unit tests cover a ton of different configurations quickly, and we + * know that we're using the loader. + *

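Here's a sketch of that comparison in plain Java instead of the real test framework, using {@code MV_MIN} over longs as the example (the sorted-first-value stand-in assumes doc_values ordering):

```java
// Random multivalued rows; a stand-in for the fused MV_MIN loader (doc_values come
// back sorted, so min == first) checked against a reference computed the slow way.
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Random;

final class FusedLoaderComparisonSketch {
    public static void main(String[] args) {
        Random random = new Random();
        for (int row = 0; row < 100; row++) {
            List<Long> values = new ArrayList<>();
            int valueCount = 1 + random.nextInt(5);
            for (int v = 0; v < valueCount; v++) {
                values.add(random.nextLong());
            }
            Collections.sort(values); // doc_values store each document's values sorted

            long fused = values.get(0); // what the fused loader would emit
            long reference = values.stream().mapToLong(Long::longValue).min().orElseThrow();
            if (fused != reference) {   // the real tests use assertThat(...)
                throw new AssertionError("fused=" + fused + " reference=" + reference);
            }
        }
    }
}
```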
+ * <h3>Plug the {@link BlockLoader} into the {@link MappedFieldType#blockLoader field mapper}</h3>

+ * You must implement:
+ * <ul>
+ *     <li>{@link MappedFieldType#blockLoader}, returning your fused {@link BlockLoader} when the
+ *         {@code BlockLoaderFunctionConfig} asks for your function</li>
+ *     <li>{@link MappedFieldType#supportsBlockLoaderConfig}, declaring which functions can be fused
+ *         given the field's configuration</li>
+ * </ul>

+ * <h3>Implement this interface</h3>

+ * Implement {@link BlockLoaderExpression}. Generally it's enough to check whether the function + * is being applied to a {@link FieldAttribute} and do something like: + *

+ * <pre>{@code
+ *         if (field instanceof FieldAttribute f && f.dataType() == DataType.KEYWORD) {
+ *             return new PushedBlockLoaderExpression(f, BlockLoaderFunctionConfig.Function.WHATEVER);
+ *         }
+ *         return null;
+ * }</pre>

+ * The rules system will check {@link MappedFieldType#supportsBlockLoaderConfig} for you. + * See the docs for {@link #tryPushToFieldLoading} for more on how to implement it. + *

+ * <h3>Add to {@code PushExpressionToLoadIT}</h3>

+ * Add a case or two to {@code PushExpressionToLoadIT} to prove that we've plugged + * everything in properly. These tests make sure that we're really loading the data + * using your new {@linkplain BlockLoader}. This is where your nice + * {@link Object#toString}s come into play. That's the key into the profile map that + * shows that your new {@linkplain BlockLoader} is plugged in. + *

+ * <h3>Maybe add to {@code csv-spec} tests</h3>

+ * Look for your function in the csv-spec tests and make sure there are cases that + * contain your function processing each data type you are pushing. For each type, + * make sure the function processes the results of: + *

+ * <ul>
+ *     <li>a {@code ROW} literal</li>
+ *     <li>a field loaded from the index with {@code FROM}</li>
+ *     <li>another function, like {@code VALUES}</li>
+ * </ul>

+ * It's fairly likely we already have tests for all these cases. + * They are part of our standard practice for adding functions, but there are a lot + * of them, and we may have forgotten some. And, without the pushdown you are + * implementing, they are mostly there for healthy paranoia around rules and + * a hedge against mistakes implementing optimizations in the future. Like the + * optimization you are implementing now! + *

+ *

+ * Anyway, once there are plenty of these tests you should run them via the ESQL + * unit tests and via the single-node integration tests. These tests don't prove + * that your new {@linkplain BlockLoader}s are plugged in. You have + * {@code PushExpressionToLoadIT} for that. Instead, they prove that, when your + * new {@linkplain BlockLoader} is plugged in, it produces + * correct output. So, just like your unit test, but integrated with the entire + * rest of the world. + *

+ * <h3>Get some performance numbers and open a PR</h3>

+ * Now that you can be pretty sure everything is plugged in and working you can + * get some performance numbers. It's generally good to start with a quick and + * dirty script. + * These should show you a performance improvement, and you can use the + * {@code profile} API as a final proof that everything is plugged in. Once that + * looks right you should generally be ok to open a PR. Attach the results of + * your bash script to prove that it's faster. + *

+ *

+ * Next, look for a rally track + * that should improve with your PR. If you find one, and it's + * in the nightlies already, then you have a choice: + *

+ * <ul>
+ *     <li>run the track yourself and compare against a recent baseline, or</li>
+ *     <li>merge and watch the nightly results</li>
+ * </ul>

+ * If the quick and dirty perf testing looked good you are probably safe waiting on + * the nightlies. You should look for them in + * benchmarks.elastic.co. + *

+ *

+ * If there isn't already a rally operation then you should add one like this + * PR. How you add one of these, how you get it into the nightlies, and whether it should + * be in the nightlies at all are outside the scope of this document. + *

*/ public interface BlockLoaderExpression { /** @@ -66,5 +250,9 @@ public interface BlockLoaderExpression { * @param field the field whose load we're fusing into * @param config the expression's configuration */ - record PushedBlockLoaderExpression(FieldAttribute field, BlockLoaderFunctionConfig config) {} + record PushedBlockLoaderExpression(FieldAttribute field, BlockLoaderFunctionConfig config) { + public PushedBlockLoaderExpression(FieldAttribute field, BlockLoaderFunctionConfig.Function function) { + this(field, new BlockLoaderFunctionConfig.JustFunction(function)); + } + } } diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/EsqlScalarFunction.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/EsqlScalarFunction.java index 85d15f82f458a..fd76147a899cd 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/EsqlScalarFunction.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/EsqlScalarFunction.java @@ -7,23 +7,306 @@ package org.elasticsearch.xpack.esql.expression.function.scalar; +import org.elasticsearch.common.time.DateFormatter; +import org.elasticsearch.compute.ann.ConvertEvaluator; +import org.elasticsearch.compute.ann.Evaluator; +import org.elasticsearch.compute.ann.MvEvaluator; +import org.elasticsearch.compute.data.Block; +import org.elasticsearch.compute.data.Page; +import org.elasticsearch.compute.data.Vector; +import org.elasticsearch.compute.lucene.LuceneCountOperator; +import org.elasticsearch.compute.operator.topn.TopNOperator; +import org.elasticsearch.index.mapper.BlockLoader; import org.elasticsearch.xpack.esql.core.expression.Expression; import org.elasticsearch.xpack.esql.core.expression.FoldContext; import org.elasticsearch.xpack.esql.core.expression.function.scalar.ScalarFunction; import org.elasticsearch.xpack.esql.core.tree.Source; import org.elasticsearch.xpack.esql.evaluator.mapper.EvaluatorMapper; +import org.elasticsearch.xpack.esql.expression.function.blockloader.BlockLoaderExpression; +import org.elasticsearch.xpack.esql.optimizer.rules.physical.local.PushTopNToSource; import java.util.List; /** - * A {@code ScalarFunction} is a {@code Function} that takes values from some - * operation and converts each to another value. An example would be - * {@code ABS()}, which takes one value at a time, applies a function to the - * value (abs) and returns a new value. + * A {@code ScalarFunction} is a {@code Function} that makes one output value per + * input row. It operates on a whole {@link Page} of inputs at a time, building + * a {@link Block} of results. + *

+ * You see them everywhere in the language: building new columns in {@code EVAL}, inside + * conditions in {@code WHERE}, and nested in the arguments of other functions. + *

+ * Let's work the example of {@code CONCAT("foo ", message)}. It's called with a Page + * of inputs and resolves both of its parameters, yielding a constant block containing + * "foo " and a Block of strings containing {@code message}. It can expect to receive + * thousands of {@code message} values in that block. Then it builds and returns a block + * containing {@code "foo "} prepended to each {@code message} value: + *

+ * <pre>{@code
+ *   foo | message | result
+ *   --- | ------- | ----------
+ *   foo | bar     | foo bar
+ *   foo | longer  | foo longer
+ *   ... a thousand rows ...
+ *   foo | baz     | foo baz
+ * }</pre>

+ * It does this once per input Page. + *

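In toy Java (simplified types, not the compute engine's API), the page-at-a-time shape looks like this:

```java
// The evaluator gets whole columns and emits a whole result column per call,
// keeping the per-row work inside one tight loop.
import java.util.ArrayList;
import java.util.List;

final class ConcatEvaluatorSketch {
    static List<String> eval(String constantPrefix, List<String> messageBlock) {
        List<String> result = new ArrayList<>(messageBlock.size());
        for (String message : messageBlock) {
            result.add(constantPrefix + message); // thousands of rows per call
        }
        return result;
    }

    public static void main(String[] args) {
        // Prints [foo bar, foo longer, foo baz]
        System.out.println(eval("foo ", List.of("bar", "longer", "baz")));
    }
}
```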
*

* We have a guide for writing these in the javadoc for * {@link org.elasticsearch.xpack.esql.expression.function.scalar}. *

+ * <h2>Optimizations</h2>

+ * Scalars are a huge part of the language, and we have a ton of + * different classes of optimizations for them that exist on a performance spectrum: + *

+ * <pre>{@code
+ *  Better         Load Less and
+ * than O(rows)     Run Faster               Run Faster                 Page-at-a-time     Tuple-at-a-time
+ *     |----------------|-------------------------|------------------------------|-------------------|
+ *     ^  ^  ^     ^    ^      ^                  ^           ^    ^   ^     ^   ^      ^            ^
+ *    CF LT ET    FP   BL     MBL                SE          NO  SIMD RR    VD EVAL    EVE         CASE
+ * }</pre>

+ * <h3>{@code CF}: Constant Folding</h3>

+ * <pre>{@code
+ *   | EVAL a = CONCAT("some ", "words")
+ * }</pre>

+ * The fastest way to run a scalar, now and forever, is to run it at compile time. Turn it + * into a constant and propagate it throughout the query. This is called "constant folding" + * and all scalars, when their arguments are constants, are "folded" to a constant. + *
+ *
+ *<h3>{@code LT}: Lucene's TopN</h3>
+ *<pre>{@code
+ *     FROM index METADATA _score
+ *   | WHERE title:"cat"
+ *   | SORT _score DESC
+ *   | LIMIT 10
+ * }</pre>
+ *<pre>{@code
+ *     FROM index
+ *   | EVAL distance = ST_DISTANCE(point, "POINT(12.5683 55.6761)")
+ *   | SORT distance ASC
+ *   | LIMIT 10
+ * }</pre>
+ *<p>
+ * Fundamentally, Lucene is a tuple-at-a-time engine that flows the min-competitive
+ * sort key back into the index iteration process, allowing it to skip huge swaths of
+ * documents. It has quite a few optimizations that soften the blow of being
+ * tuple-at-a-time, so these days "push to a Lucene TopN" is the fastest way you are
+ * going to run a scalar function. For that to work, the function has to be a
+ * {@code SORT} key, all the filters have to be pushable to Lucene, and Lucene has to
+ * know how to run the function natively. See {@link PushTopNToSource}.
+ *</p>
+ *
+ *<h3>{@code ET}: Engine TopN (HYPOTHETICAL)</h3>
+ *<pre>{@code
+ *     FROM index METADATA _score
+ *   | WHERE title:"cat"
+ *   | WHERE a < j + LENGTH(candy) // <--- anything un-pushable
+ *   | SORT _score DESC
+ *   | LIMIT 10
+ * }</pre>
+ *<p>
+ * If ESQL's {@link TopNOperator} exposed the min-competitive information (see above) and
+ * we fed it back into the Lucene query operators, then we too could do better than
+ * {@code O(matching_rows)} for queries sorting on the results of a scalar. This is like
+ * {@code LT} but without as many limitations. Lucene has a 20-year head start on us in
+ * optimizing TopN, so we should continue to use them when we can. See issue.
+ *</p>
+ *
+ *<h3>{@code BL}: Push to {@link BlockLoader}</h3>
+ *<pre>{@code
+ *     FROM index
+ *   | EVAL s = V_COSINE(dense_vector, [0, 1, 2])
+ *   | SORT s DESC
+ *   | LIMIT 10
+ * }</pre>
+ *<pre>{@code
+ *     FROM index
+ *   | STATS SUM(LENGTH(message)) // Length is pushed to the BlockLoader
+ * }</pre>
+ *<p>
+ * Some functions can take advantage of the on-disk structures to run very fast and
+ * should be "fused" into field loading using {@link BlockLoaderExpression}. Functions
+ * like {@code V_COSINE} can use the vector search index to compute the result. Functions
+ * like {@code MV_MIN} can use the {@code doc_values} encoding to save a ton of work.
+ * Functions like the upcoming {@code ST_SIMPLIFY} benefit by saving huge numbers of
+ * allocations even if they can't link into the {@code doc_values} format. We do this by
+ * building a {@link BlockLoader} for each
+ * {@code FUNCTION x FIELD_TYPE x storage mechanism} combination so we can get as much
+ * speed as possible.
+ *</p>
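+ *<p>
+ * The {@code doc_values} trick deserves a sketch: Lucene hands back multivalued numeric
+ * {@code doc_values} in ascending order, so a fused {@code MV_MIN} only has to read the
+ * first value for each document. This is a simplified illustration of the idea, not the
+ * actual loader code:
+ *</p>
+ *<pre>{@code
+ *   SortedNumericDocValues dv = DocValues.getSortedNumeric(leafReader, "field");
+ *   if (dv.advanceExact(docId)) {
+ *       long min = dv.nextValue();           // values are sorted: first == MV_MIN
+ *       long max = min;
+ *       for (int i = 1; i < dv.docValueCount(); i++) {
+ *           max = dv.nextValue();            // and last == MV_MAX
+ *       }
+ *   }
+ * }</pre>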
+ *
+ *<h3>{@code MBL}: Push to a "mother ship" {@link BlockLoader} (HYPOTHETICAL)</h3>
+ *<pre>{@code
+ *     FROM index
+ *   | STATS SUM(LENGTH(message)), // All of these are pushed to a single BlockLoader
+ *           SUM(SUBSTRING(message, 0, 4)),
+ *        BY trail = SUBSTRING(message, 10, 3)
+ * }</pre>
+ *<p>
+ * Pushing functions to a {@link BlockLoader} can involve building a ton of distinct
+ * {@link BlockLoader}s, which means a ton of code and testing and, well, work. It's
+ * worth it if you are applying a single function to a field and every single cycle
+ * counts. But cases like the one above cry out for a more OO-style solution where you
+ * build a "mother ship" {@linkplain BlockLoader} that operates on, say,
+ * {@code FIELD_TYPE x storage mechanism} and then runs a list of {@code FUNCTION}
+ * operations. In some cases this is a bad idea, which is why we haven't built it yet.
+ * But in plenty of cases it's fine. And, sometimes, we should be fine skipping the
+ * special-purpose block loader in favor of the mother ship. We'd spend a few more
+ * cycles on each load, but the maintenance advantage is likely worth it for some
+ * functions.
+ *</p>
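+ *<p>
+ * A rough sketch of the idea, with entirely made-up names:
+ *</p>
+ *<pre>{@code
+ *   // One loader decodes the field once per document, then applies each
+ *   // fused function to the decoded values.
+ *   long[] values = readField(docId);
+ *   for (FusedFunction fn : fusedFunctions) {
+ *       fn.apply(values);
+ *   }
+ * }</pre>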
+ *
+ *<h3>{@code EVAL}: Page-at-a-time evaluation</h3>
+ *<p>
+ * ESQL evaluates whole pages at once, generally walking a couple of arrays in parallel
+ * and building a result array. This makes it very obvious which bits are the "hot path":
+ * they are the loops that walk those arrays. We put the "slower" stuff outside those
+ * loops.
+ *</p>
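+ *<p>
+ * A simplified sketch of the shape; real generated evaluators also track memory and
+ * handle {@code null}s and multivalued entries:
+ *</p>
+ *<pre>{@code
+ *   // Hot path: one tight loop over the arrays backing the page.
+ *   long[] result = new long[positionCount];
+ *   for (int p = 0; p < positionCount; p++) {
+ *       result[p] = Math.abs(values[p]);
+ *   }
+ * }</pre>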
+ *
+ *<h3>{@code VD}: Vector Dispatch</h3>
+ *<p>
+ * In Elasticsearch it's normal for fields to sometimes be {@code null} or multivalued.
+ * There are no constraints on the schema preventing this and, in a search engine,
+ * modeling things as multivalued fields is pretty normal. We rarely know at query
+ * planning time that a field can only be single-valued.
+ *</p>
+ *<p>
+ * It's much faster to run a scalar when we know that all of its inputs are single-valued
+ * and non-null. So every scalar function that uses the code generation keyed by the
+ * {@link Evaluator}, {@link ConvertEvaluator}, and {@link MvEvaluator} annotations builds
+ * two paths:
+ *</p>
+ *<ul>
+ *     <li>a fast path that runs over {@link Vector}s, whose entries are guaranteed to
+ *         be single-valued and non-null</li>
+ *     <li>a slower path that runs over {@link Block}s and handles {@code null}s and
+ *         multivalued entries</li>
+ *</ul>
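+ *<p>
+ * The dispatch itself looks roughly like this (simplified from the generated code):
+ *</p>
+ *<pre>{@code
+ *   LongVector vector = block.asVector();
+ *   return vector != null        // null means some entry is null or multivalued
+ *       ? evalVector(vector)     // fast path
+ *       : evalBlock(block);      // slow path
+ * }</pre>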
+ *
+ *<h3>{@code NO}: Native Ordinal Evaluation</h3>
+ *<pre>{@code
+ *     FROM index
+ *   | STATS MAX(foo) BY TO_UPPER(verb)
+ * }</pre>
+ *<p>
+ * {@code keyword} and {@code ip} fields load their {@code byte[]}-shaped values as a
+ * lookup table, called "ordinals" because that's the word Lucene uses for it. Some of
+ * our functions, like {@code TO_UPPER}, process the lookup table itself instead of
+ * processing each position. This is especially important when grouping on the field,
+ * because the hashing done by the aggregation code also operates on the lookup table.
+ *</p>
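+ *<p>
+ * A sketch of the idea with made-up names; the real code works on the compute engine's
+ * ordinal blocks:
+ *</p>
+ *<pre>{@code
+ *   // 10,000 rows but only, say, 7 distinct verbs: 7 calls instead of 10,000.
+ *   BytesRef[] dict = ordinals.dictionary();
+ *   BytesRef[] upper = new BytesRef[dict.length];
+ *   for (int i = 0; i < dict.length; i++) {
+ *       upper[i] = toUpper(dict[i]);
+ *   }
+ *   // The per-row ordinal ids are reused unchanged.
+ * }</pre>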
+ *
+ *<h3>{@code SE}: Sorted Execution</h3>
+ *<pre>{@code
+ *     FROM index
+ *   | STATS SUM(MV_DEDUPE(file_size))
+ * }</pre>
+ *<p>
+ * Some functions can operate on multivalued fields much faster if their inputs are
+ * sorted. And inputs loaded from {@code doc_values} are sorted by default, sometimes
+ * even sorted AND deduplicated. We store this information on each block in
+ * {@link Block.MvOrdering}.
+ *</p>
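+ *<p>
+ * For example, a sketch of deduplication over a sorted run, with made-up names; it only
+ * ever has to compare neighbors:
+ *</p>
+ *<pre>{@code
+ *   long prev = values[first];
+ *   emit(prev);
+ *   for (int i = first + 1; i < end; i++) {
+ *       if (values[i] != prev) {
+ *           prev = values[i];
+ *           emit(prev);
+ *       }
+ *   }
+ * }</pre>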

+ *

+ * NOTE: Functions that can take advantage of this sorting also tend to be NOOPs for + * single-valued inputs. So they benefit hugely from "Vector Dispatch". + *
+ *
+ *<h3>{@code SIMD}: Single Instruction Multiple Data instructions</h3>
+ *<pre>{@code
+ *     FROM index
+ *   | STATS MAX(lhs + rhs)
+ * }</pre>
+ *<p>
+ * Through a combination of "Page-at-a-time evaluation" and "Vector Dispatch" we often
+ * end up with at least one path that can be turned into a sequence of SIMD instructions.
+ * These are about as fast as you can go while still being {@code O(matching_rows)}.
+ * A lot of scalars don't lend themselves perfectly to SIMD, but we make sure those
+ * that do can take that route.
+ *</p>
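+ *<p>
+ * The fast path from "Vector Dispatch" above is exactly the shape the JIT can
+ * auto-vectorize; a simplified sketch:
+ *</p>
+ *<pre>{@code
+ *   // No nulls, no multivalues, no branches: the JIT can emit SIMD for this.
+ *   for (int p = 0; p < positionCount; p++) {
+ *       result[p] = lhs[p] + rhs[p];
+ *   }
+ * }</pre>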
+ *
+ *<h3>{@code RR}: Range Rewrite</h3>
+ *<pre>{@code
+ *     FROM index
+ *   | STATS COUNT(*) BY DATE_TRUNC(1 DAY, @timestamp)
+ * }</pre>
+ *<p>
+ * Functions like {@code DATE_TRUNC} can be quite slow, especially when they are using a
+ * time zone. They can be much faster if they know the range of dates they are operating
+ * on. And we do know that on the data node! We use that information to rewrite the
+ * possibly-slow {@code DATE_TRUNC} to the always-fast {@code ROUND_TO}, which rounds
+ * down to fixed rounding points.
+ *</p>
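+ *<p>
+ * If, say, the shard only contains two days of data, the query above is conceptually
+ * rewritten to something like:
+ *</p>
+ *<pre>{@code
+ *     FROM index
+ *   | STATS COUNT(*) BY ROUND_TO(@timestamp, "2025-05-01", "2025-05-02")
+ * }</pre>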
+ *<p>
+ * At the moment this is only done for {@code DATE_TRUNC}, which is a very common
+ * function, but it's technically possible for anything that could benefit from knowing
+ * the range up front.
+ *</p>
+ *
+ *<h3>{@code FP}: Filter Pushdown</h3>
+ *<pre>{@code
+ *     FROM index
+ *   | STATS COUNT(*) BY DATE_TRUNC(1 DAY, @timestamp)
+ * }</pre>
+ *<p>
+ * If the "Range Rewrite" optimization works, we can sometimes further push the resulting
+ * {@code ROUND_TO} into a sequence of filters. If you are just counting documents, this
+ * can use the {@link LuceneCountOperator}, which can count the number of matching
+ * documents directly from the cache, technically doing better than {@code O(num_hits)},
+ * but only in ideal circumstances. If we can't push the count it's still very, very
+ * fast. See PR.
+ *</p>
+ *
+ *<h3>{@code EVE}: Expensive Variable Evaluator</h3>
+ *<pre>{@code
+ *     FROM index
+ *   | EVAL ts = DATE_PARSE(SUBSTRING(message, 1, 10), date_format_from_the_index)
+ * }</pre>
+ *<p>
+ * Functions like {@code DATE_PARSE} need to build something "expensive" per input row,
+ * like a {@link DateFormatter}. But, often, the expensive thing is constant. In the
+ * example above the date format comes from the index, but that's quite contrived. These
+ * functions generally run in the form:
+ *</p>
+ *<pre>{@code
+ *     FROM index
+ *   | EVAL ts = DATE_PARSE(SUBSTRING(message, 1, 10), "ISO8601")
+ * }</pre>
+ *<p>
+ * These generally have special-case evaluators that don't construct the format for each
+ * row. The others are "expensive variable evaluators" and we avoid them when we can.
+ *</p>
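+ *<p>
+ * A simplified sketch of the constant-format special case; names other than
+ * {@link DateFormatter} are illustrative:
+ *</p>
+ *<pre>{@code
+ *   // Build the "expensive" thing once, outside the hot loop.
+ *   DateFormatter formatter = DateFormatter.forPattern("iso8601");
+ *   for (int p = 0; p < positionCount; p++) {
+ *       result[p] = formatter.parseMillis(strings[p]);
+ *   }
+ * }</pre>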
+ *
+ *<h3>{@code CASE}: {@code CASE} is evaluated row-by-row</h3>
+ *<pre>{@code
+ *     FROM index
+ *   | EVAL f = CASE(d > 0, n / d, 0)
+ * }</pre>
+ *<pre>{@code
+ *     FROM index
+ *   | EVAL f = COALESCE(d, 1 / j)
+ * }</pre>
+ *<p>
+ * {@code CASE} and {@code COALESCE} short-circuit. In the top example above, that means
+ * we don't run {@code n / d} unless {@code d > 0}, which prevents us from emitting
+ * warnings for dividing by 0. In the second example, we don't run {@code 1 / j} unless
+ * {@code d} is null. In the worst case, we manage this by running row-by-row, which is
+ * super slow, especially because the engine was designed for page-at-a-time execution.
+ *</p>
+ *<p>
+ * In the best case {@code COALESCE} can see that an input is either all-null or
+ * all-non-null. Then it never falls back to row-by-row evaluation and is quite fast.
+ *</p>
+ *<p>
+ * {@code CASE} has a similar optimization: for each incoming {@link Page}, if the
+ * condition evaluates to a constant, it executes the corresponding "arm" page-at-a-time.
+ * Also! If the "arms" are "fast" and can't throw warnings, {@code CASE} can execute
+ * "eagerly", evaluating all three arguments and just plucking values back and forth.
+ * The "eager" {@code CASE} evaluator is effectively the same as any other
+ * page-at-a-time evaluator.
+ *</p>
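+ *<p>
+ * A simplified sketch of the "eager" strategy, with made-up names:
+ *</p>
+ *<pre>{@code
+ *   // Evaluate everything page-at-a-time, then pick per row. Only safe when
+ *   // neither arm can throw or warn.
+ *   boolean[] cond = evalCondition(page);
+ *   double[] ifTrue = evalTrueArm(page);
+ *   double[] ifFalse = evalFalseArm(page);
+ *   for (int p = 0; p < positionCount; p++) {
+ *       result[p] = cond[p] ? ifTrue[p] : ifFalse[p];
+ *   }
+ * }</pre>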
 */
 public abstract class EsqlScalarFunction extends ScalarFunction implements EvaluatorMapper {
     protected EsqlScalarFunction(Source source) {
diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/multivalue/MvMax.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/multivalue/MvMax.java
index e25d1662f9cae..98b79ca438ad0 100644
--- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/multivalue/MvMax.java
+++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/multivalue/MvMax.java
@@ -13,14 +13,18 @@
 import org.elasticsearch.compute.ann.MvEvaluator;
 import org.elasticsearch.compute.operator.EvalOperator;
 import org.elasticsearch.compute.operator.EvalOperator.ExpressionEvaluator;
+import org.elasticsearch.index.mapper.blockloader.BlockLoaderFunctionConfig;
 import org.elasticsearch.xpack.esql.EsqlIllegalArgumentException;
 import org.elasticsearch.xpack.esql.core.expression.Expression;
+import org.elasticsearch.xpack.esql.core.expression.FieldAttribute;
 import org.elasticsearch.xpack.esql.core.tree.NodeInfo;
 import org.elasticsearch.xpack.esql.core.tree.Source;
 import org.elasticsearch.xpack.esql.expression.function.Example;
 import org.elasticsearch.xpack.esql.expression.function.FunctionInfo;
 import org.elasticsearch.xpack.esql.expression.function.Param;
+import org.elasticsearch.xpack.esql.expression.function.blockloader.BlockLoaderExpression;
 import org.elasticsearch.xpack.esql.planner.PlannerUtils;
+import org.elasticsearch.xpack.esql.stats.SearchStats;
 
 import java.io.IOException;
 import java.util.List;
@@ -31,7 +35,7 @@
 /**
  * Reduce a multivalued field to a single valued field containing the maximum value.
  */
-public class MvMax extends AbstractMultivalueFunction {
+public class MvMax extends AbstractMultivalueFunction implements BlockLoaderExpression {
     public static final NamedWriteableRegistry.Entry ENTRY = new NamedWriteableRegistry.Entry(Expression.class, "MvMax", MvMax::new);
 
     @FunctionInfo(
@@ -129,4 +133,12 @@ static long process(long current, long v) {
     static int ascendingIndex(int count) {
         return count - 1;
     }
+
+    @Override
+    public PushedBlockLoaderExpression tryPushToFieldLoading(SearchStats stats) {
+        if (field instanceof FieldAttribute f) {
+            return new PushedBlockLoaderExpression(f, BlockLoaderFunctionConfig.Function.MV_MAX);
+        }
+        return null;
+    }
 }
diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/multivalue/MvMin.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/multivalue/MvMin.java
index 75590d5d8b43a..791ab7c4b2301 100644
--- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/multivalue/MvMin.java
+++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/multivalue/MvMin.java
@@ -13,14 +13,18 @@
 import org.elasticsearch.compute.ann.MvEvaluator;
 import org.elasticsearch.compute.operator.EvalOperator;
 import org.elasticsearch.compute.operator.EvalOperator.ExpressionEvaluator;
+import org.elasticsearch.index.mapper.blockloader.BlockLoaderFunctionConfig;
 import org.elasticsearch.xpack.esql.EsqlIllegalArgumentException;
 import org.elasticsearch.xpack.esql.core.expression.Expression;
+import org.elasticsearch.xpack.esql.core.expression.FieldAttribute;
 import org.elasticsearch.xpack.esql.core.tree.NodeInfo;
 import org.elasticsearch.xpack.esql.core.tree.Source;
 import org.elasticsearch.xpack.esql.expression.function.Example;
 import org.elasticsearch.xpack.esql.expression.function.FunctionInfo;
 import org.elasticsearch.xpack.esql.expression.function.Param;
+import org.elasticsearch.xpack.esql.expression.function.blockloader.BlockLoaderExpression;
 import org.elasticsearch.xpack.esql.planner.PlannerUtils;
+import org.elasticsearch.xpack.esql.stats.SearchStats;
 
 import java.io.IOException;
 import java.util.List;
@@ -31,7 +35,7 @@
 /**
  * Reduce a multivalued field to a single valued field containing the minimum value.
  */
-public class MvMin extends AbstractMultivalueFunction {
+public class MvMin extends AbstractMultivalueFunction implements BlockLoaderExpression {
     public static final NamedWriteableRegistry.Entry ENTRY = new NamedWriteableRegistry.Entry(Expression.class, "MvMin", MvMin::new);
 
     @FunctionInfo(
@@ -129,4 +133,12 @@ static long process(long current, long v) {
     static int ascendingIndex(int count) {
         return 0;
     }
+
+    @Override
+    public PushedBlockLoaderExpression tryPushToFieldLoading(SearchStats stats) {
+        if (field instanceof FieldAttribute f) {
+            return new PushedBlockLoaderExpression(f, BlockLoaderFunctionConfig.Function.MV_MIN);
+        }
+        return null;
+    }
 }
diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/package-info.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/package-info.java
new file mode 100644
index 0000000000000..2ce6017991bbf
--- /dev/null
+++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+/**
+ * {@link org.elasticsearch.xpack.esql.core.expression.Expression Expressions} process values
+ * to make more values. There are two kinds:
+ * <ul>
+ *     <li>scalar functions, which make one value per input row</li>
+ *     <li>aggregate functions, which make one value for a whole group of rows</li>
+ * </ul>
+ */
+package org.elasticsearch.xpack.esql.expression;
diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/stats/SearchContextStats.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/stats/SearchContextStats.java
index a6343ecbf7d67..d5ac62273bff0 100644
--- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/stats/SearchContextStats.java
+++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/stats/SearchContextStats.java
@@ -165,6 +165,9 @@ public boolean supportsLoaderConfig(
         BlockLoaderFunctionConfig config,
         MappedFieldType.FieldExtractPreference preference
     ) {
+        if (config == null) {
+            throw new UnsupportedOperationException("config must be provided");
+        }
         for (SearchExecutionContext context : contexts) {
             MappedFieldType ft = context.getFieldType(name.string());
             if (ft == null) {
diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/LocalLogicalPlanOptimizerTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/LocalLogicalPlanOptimizerTests.java
index 4b7196a34db8d..0804ba1718b53 100644
--- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/LocalLogicalPlanOptimizerTests.java
+++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/optimizer/LocalLogicalPlanOptimizerTests.java
@@ -1749,6 +1749,14 @@ public void testLengthPushdownZoo() {
         var relation = as(filter.child(), EsRelation.class);
         assertThat(relation.output(), hasItem(lastNamePushDownAttr));
         assertThat(relation.output(), hasItem(firstNamePushDownAttr));
+        assertThat(relation.output().stream().filter(a -> {
+            if (a instanceof FieldAttribute fa) {
+                if (fa.field() instanceof FunctionEsField fef) {
+                    return fef.functionConfig().function() == BlockLoaderFunctionConfig.Function.LENGTH;
+                }
+            }
+            return false;
+        }).toList(), hasSize(2));
     }
 
     public void testLengthInStatsTwice() {