From 98739d72e0c73d6a0e081727638cebee8e737d2f Mon Sep 17 00:00:00 2001 From: Kathleen DeRusso Date: Tue, 2 Sep 2025 16:54:45 -0400 Subject: [PATCH 01/29] Add new function to chunk strings --- x-pack/plugin/esql/build.gradle | 2 + x-pack/plugin/esql/compute/build.gradle | 2 + .../compute/src/main/java/module-info.java | 3 +- .../src/main/resources/chunk.csv-spec | 43 ++++ .../scalar/string/ChunkStringEvaluator.java | 209 +++++++++++++++++ .../xpack/esql/action/EsqlCapabilities.java | 7 +- .../esql/expression/ExpressionWritables.java | 2 + .../function/EsqlFunctionRegistry.java | 4 +- .../function/scalar/string/Chunk.java | 213 ++++++++++++++++++ 9 files changed, 482 insertions(+), 3 deletions(-) create mode 100644 x-pack/plugin/esql/qa/testFixtures/src/main/resources/chunk.csv-spec create mode 100644 x-pack/plugin/esql/src/main/generated/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkStringEvaluator.java create mode 100644 x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java diff --git a/x-pack/plugin/esql/build.gradle b/x-pack/plugin/esql/build.gradle index 00a247f854e50..21369b1813c5b 100644 --- a/x-pack/plugin/esql/build.gradle +++ b/x-pack/plugin/esql/build.gradle @@ -45,6 +45,7 @@ dependencies { api "org.apache.lucene:lucene-spatial3d:${versions.lucene}" api project(":libs:h3") implementation project('arrow') + implementation project(xpackModule('inference')) // Also contains a dummy processor to allow compilation with unused annotations. 
annotationProcessor project('compute:gen') @@ -58,6 +59,7 @@ dependencies { testImplementation project(path: xpackModule('spatial')) testImplementation project(path: xpackModule('kql')) testImplementation project(path: xpackModule('mapper-unsigned-long')) + testImplementation project(path: xpackModule('inference')) testImplementation project(path: ':modules:reindex') testImplementation project(path: ':modules:parent-join') diff --git a/x-pack/plugin/esql/compute/build.gradle b/x-pack/plugin/esql/compute/build.gradle index 8acb7697b9f15..d281d64be7853 100644 --- a/x-pack/plugin/esql/compute/build.gradle +++ b/x-pack/plugin/esql/compute/build.gradle @@ -16,6 +16,7 @@ dependencies { compileOnly project(xpackModule('ml')) annotationProcessor project('gen') implementation 'com.carrotsearch:hppc:0.8.1' + implementation project(xpackModule('inference')) testImplementation(project(':modules:analysis-common')) testImplementation(project(':test:framework')) @@ -30,6 +31,7 @@ dependencies { } testImplementation(project(xpackModule('core'))) testImplementation(project(xpackModule('ml'))) + testImplementation(project(xpackModule('inference'))) } def projectDirectory = project.layout.projectDirectory diff --git a/x-pack/plugin/esql/compute/src/main/java/module-info.java b/x-pack/plugin/esql/compute/src/main/java/module-info.java index f21ed72d7eb21..6a2d2af9b7e6f 100644 --- a/x-pack/plugin/esql/compute/src/main/java/module-info.java +++ b/x-pack/plugin/esql/compute/src/main/java/module-info.java @@ -20,7 +20,8 @@ requires org.elasticsearch.tdigest; requires org.elasticsearch.geo; requires org.elasticsearch.xcore; - requires hppc; + requires hppc; + requires org.elasticsearch.inference; exports org.elasticsearch.compute; exports org.elasticsearch.compute.aggregation; diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/chunk.csv-spec b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/chunk.csv-spec new file mode 100644 index 0000000000000..d5cf5064e8e5a --- 
/dev/null +++ b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/chunk.csv-spec @@ -0,0 +1,43 @@ +############################################### +# Tests for Chunk function +# + +chunkExample +required_capability: chunk_function + +// tag::chunk-with-field[] +FROM books +| EVAL chunks = CHUNK(description, 1, 20) +// end::chunk-with-field[] +| KEEP book_no, title, chunks +| SORT book_no +| LIMIT 5 +; + +// tag::chunk-with-field-result[] +book_no:keyword | title:text | chunks:keyword +1211 | The brothers Karamazov | null +1463 | Realms of Tolkien: Images of Middle-earth | null +1502 | Selected Passages from Correspondence with Friends | null +1937 | The Best Short Stories of Dostoevsky (Modern Library) | null +1985 | Brothers Karamazov | null +// end::chunk-with-field-result[] +; + + +chunkTextWithMatch +required_capability: chunk_function + +FROM books +| WHERE MATCH(title, "Return") +| EVAL chunks = CHUNK(description, 1, 20) +| KEEP book_no, title, chunks; +ignoreOrder:true + +book_no:keyword | title:text | chunks:keyword +2714 | Return of the King Being the Third Part of The Lord of the Rings | null +7350 | Return of the Shadow | null +; + + + diff --git a/x-pack/plugin/esql/src/main/generated/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkStringEvaluator.java b/x-pack/plugin/esql/src/main/generated/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkStringEvaluator.java new file mode 100644 index 0000000000000..f4d91991f19ca --- /dev/null +++ b/x-pack/plugin/esql/src/main/generated/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkStringEvaluator.java @@ -0,0 +1,209 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License +// 2.0; you may not use this file except in compliance with the Elastic License +// 2.0. 
+package org.elasticsearch.xpack.esql.expression.function.scalar.string; + +import java.lang.IllegalArgumentException; +import java.lang.Override; +import java.lang.String; +import java.util.function.Function; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.RamUsageEstimator; +import org.elasticsearch.compute.data.Block; +import org.elasticsearch.compute.data.BytesRefBlock; +import org.elasticsearch.compute.data.BytesRefVector; +import org.elasticsearch.compute.data.IntBlock; +import org.elasticsearch.compute.data.IntVector; +import org.elasticsearch.compute.data.Page; +import org.elasticsearch.compute.operator.BreakingBytesRefBuilder; +import org.elasticsearch.compute.operator.DriverContext; +import org.elasticsearch.compute.operator.EvalOperator; +import org.elasticsearch.compute.operator.Warnings; +import org.elasticsearch.core.Releasables; +import org.elasticsearch.xpack.esql.core.tree.Source; + +/** + * {@link EvalOperator.ExpressionEvaluator} implementation for {@link Chunk}. + * This class is generated. Edit {@code EvaluatorImplementer} instead. 
+ */ +public final class ChunkStringEvaluator implements EvalOperator.ExpressionEvaluator { + private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(ChunkStringEvaluator.class); + + private final Source source; + + private final BreakingBytesRefBuilder scratch; + + private final EvalOperator.ExpressionEvaluator str; + + private final EvalOperator.ExpressionEvaluator numChunks; + + private final EvalOperator.ExpressionEvaluator chunkSize; + + private final DriverContext driverContext; + + private Warnings warnings; + + public ChunkStringEvaluator(Source source, BreakingBytesRefBuilder scratch, + EvalOperator.ExpressionEvaluator str, EvalOperator.ExpressionEvaluator numChunks, + EvalOperator.ExpressionEvaluator chunkSize, DriverContext driverContext) { + this.source = source; + this.scratch = scratch; + this.str = str; + this.numChunks = numChunks; + this.chunkSize = chunkSize; + this.driverContext = driverContext; + } + + @Override + public Block eval(Page page) { + try (BytesRefBlock strBlock = (BytesRefBlock) str.eval(page)) { + try (IntBlock numChunksBlock = (IntBlock) numChunks.eval(page)) { + try (IntBlock chunkSizeBlock = (IntBlock) chunkSize.eval(page)) { + BytesRefVector strVector = strBlock.asVector(); + if (strVector == null) { + return eval(page.getPositionCount(), strBlock, numChunksBlock, chunkSizeBlock); + } + IntVector numChunksVector = numChunksBlock.asVector(); + if (numChunksVector == null) { + return eval(page.getPositionCount(), strBlock, numChunksBlock, chunkSizeBlock); + } + IntVector chunkSizeVector = chunkSizeBlock.asVector(); + if (chunkSizeVector == null) { + return eval(page.getPositionCount(), strBlock, numChunksBlock, chunkSizeBlock); + } + return eval(page.getPositionCount(), strVector, numChunksVector, chunkSizeVector); + } + } + } + } + + @Override + public long baseRamBytesUsed() { + long baseRamBytesUsed = BASE_RAM_BYTES_USED; + baseRamBytesUsed += str.baseRamBytesUsed(); + baseRamBytesUsed += 
numChunks.baseRamBytesUsed(); + baseRamBytesUsed += chunkSize.baseRamBytesUsed(); + return baseRamBytesUsed; + } + + public BytesRefBlock eval(int positionCount, BytesRefBlock strBlock, IntBlock numChunksBlock, + IntBlock chunkSizeBlock) { + try(BytesRefBlock.Builder result = driverContext.blockFactory().newBytesRefBlockBuilder(positionCount)) { + BytesRef strScratch = new BytesRef(); + position: for (int p = 0; p < positionCount; p++) { + if (strBlock.isNull(p)) { + result.appendNull(); + continue position; + } + if (strBlock.getValueCount(p) != 1) { + if (strBlock.getValueCount(p) > 1) { + warnings().registerException(new IllegalArgumentException("single-value function encountered multi-value")); + } + result.appendNull(); + continue position; + } + if (numChunksBlock.isNull(p)) { + result.appendNull(); + continue position; + } + if (numChunksBlock.getValueCount(p) != 1) { + if (numChunksBlock.getValueCount(p) > 1) { + warnings().registerException(new IllegalArgumentException("single-value function encountered multi-value")); + } + result.appendNull(); + continue position; + } + if (chunkSizeBlock.isNull(p)) { + result.appendNull(); + continue position; + } + if (chunkSizeBlock.getValueCount(p) != 1) { + if (chunkSizeBlock.getValueCount(p) > 1) { + warnings().registerException(new IllegalArgumentException("single-value function encountered multi-value")); + } + result.appendNull(); + continue position; + } + try { + result.appendBytesRef(Chunk.process(this.scratch, strBlock.getBytesRef(strBlock.getFirstValueIndex(p), strScratch), numChunksBlock.getInt(numChunksBlock.getFirstValueIndex(p)), chunkSizeBlock.getInt(chunkSizeBlock.getFirstValueIndex(p)))); + } catch (IllegalArgumentException e) { + warnings().registerException(e); + result.appendNull(); + } + } + return result.build(); + } + } + + public BytesRefBlock eval(int positionCount, BytesRefVector strVector, IntVector numChunksVector, + IntVector chunkSizeVector) { + try(BytesRefBlock.Builder result = 
driverContext.blockFactory().newBytesRefBlockBuilder(positionCount)) { + BytesRef strScratch = new BytesRef(); + position: for (int p = 0; p < positionCount; p++) { + try { + result.appendBytesRef(Chunk.process(this.scratch, strVector.getBytesRef(p, strScratch), numChunksVector.getInt(p), chunkSizeVector.getInt(p))); + } catch (IllegalArgumentException e) { + warnings().registerException(e); + result.appendNull(); + } + } + return result.build(); + } + } + + @Override + public String toString() { + return "ChunkStringEvaluator[" + "str=" + str + ", numChunks=" + numChunks + ", chunkSize=" + chunkSize + "]"; + } + + @Override + public void close() { + Releasables.closeExpectNoException(scratch, str, numChunks, chunkSize); + } + + private Warnings warnings() { + if (warnings == null) { + this.warnings = Warnings.createWarnings( + driverContext.warningsMode(), + source.source().getLineNumber(), + source.source().getColumnNumber(), + source.text() + ); + } + return warnings; + } + + static class Factory implements EvalOperator.ExpressionEvaluator.Factory { + private final Source source; + + private final Function scratch; + + private final EvalOperator.ExpressionEvaluator.Factory str; + + private final EvalOperator.ExpressionEvaluator.Factory numChunks; + + private final EvalOperator.ExpressionEvaluator.Factory chunkSize; + + public Factory(Source source, Function scratch, + EvalOperator.ExpressionEvaluator.Factory str, + EvalOperator.ExpressionEvaluator.Factory numChunks, + EvalOperator.ExpressionEvaluator.Factory chunkSize) { + this.source = source; + this.scratch = scratch; + this.str = str; + this.numChunks = numChunks; + this.chunkSize = chunkSize; + } + + @Override + public ChunkStringEvaluator get(DriverContext context) { + return new ChunkStringEvaluator(source, scratch.apply(context), str.get(context), numChunks.get(context), chunkSize.get(context), context); + } + + @Override + public String toString() { + return "ChunkStringEvaluator[" + "str=" + str + ", 
numChunks=" + numChunks + ", chunkSize=" + chunkSize + "]"; + } + } +} diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java index 478fb5af2676e..55fba1911f2b0 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java @@ -1421,7 +1421,12 @@ public enum Cap { /** * URL decoding function. */ - URL_DECODE(Build.current().isSnapshot()); + URL_DECODE(Build.current().isSnapshot()), + + /** + * Chunk function. + */ + CHUNK_FUNCTION(Build.current().isSnapshot()); private final boolean enabled; diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/ExpressionWritables.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/ExpressionWritables.java index 20de89a53780d..4eb99234407f3 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/ExpressionWritables.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/ExpressionWritables.java @@ -74,6 +74,7 @@ import org.elasticsearch.xpack.esql.expression.function.scalar.spatial.StYMax; import org.elasticsearch.xpack.esql.expression.function.scalar.spatial.StYMin; import org.elasticsearch.xpack.esql.expression.function.scalar.string.ByteLength; +import org.elasticsearch.xpack.esql.expression.function.scalar.string.Chunk; import org.elasticsearch.xpack.esql.expression.function.scalar.string.LTrim; import org.elasticsearch.xpack.esql.expression.function.scalar.string.Length; import org.elasticsearch.xpack.esql.expression.function.scalar.string.RTrim; @@ -227,6 +228,7 @@ public static List unaryScalars() { entries.add(Delay.ENTRY); entries.add(UrlEncode.ENTRY); entries.add(UrlDecode.ENTRY); + entries.add(Chunk.ENTRY); // mv functions 
entries.addAll(MvFunctionWritables.getNamedWriteables()); return entries; diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/EsqlFunctionRegistry.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/EsqlFunctionRegistry.java index 9b794d9b9b7b5..bf8acc7d18e98 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/EsqlFunctionRegistry.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/EsqlFunctionRegistry.java @@ -162,6 +162,7 @@ import org.elasticsearch.xpack.esql.expression.function.scalar.spatial.StYMin; import org.elasticsearch.xpack.esql.expression.function.scalar.string.BitLength; import org.elasticsearch.xpack.esql.expression.function.scalar.string.ByteLength; +import org.elasticsearch.xpack.esql.expression.function.scalar.string.Chunk; import org.elasticsearch.xpack.esql.expression.function.scalar.string.Concat; import org.elasticsearch.xpack.esql.expression.function.scalar.string.Contains; import org.elasticsearch.xpack.esql.expression.function.scalar.string.EndsWith; @@ -519,7 +520,8 @@ private static FunctionDefinition[][] snapshotFunctions() { def(Magnitude.class, Magnitude::new, "v_magnitude"), def(Hamming.class, Hamming::new, "v_hamming"), def(UrlEncode.class, UrlEncode::new, "url_encode"), - def(UrlDecode.class, UrlDecode::new, "url_decode") } }; + def(UrlDecode.class, UrlDecode::new, "url_decode"), + def(Chunk.class, tri(Chunk::new), "chunk") } }; } public EsqlFunctionRegistry snapshotRegistry() { diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java new file mode 100644 index 0000000000000..825f9fd1385b4 --- /dev/null +++ 
b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java @@ -0,0 +1,213 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.esql.expression.function.scalar.string; + +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.common.io.stream.NamedWriteableRegistry; +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.compute.ann.Evaluator; +import org.elasticsearch.compute.ann.Fixed; +import org.elasticsearch.compute.operator.BreakingBytesRefBuilder; +import org.elasticsearch.compute.operator.EvalOperator; +import org.elasticsearch.inference.ChunkingSettings; +import org.elasticsearch.xpack.esql.core.expression.Expression; +import org.elasticsearch.xpack.esql.core.tree.NodeInfo; +import org.elasticsearch.xpack.esql.core.tree.Source; +import org.elasticsearch.xpack.esql.core.type.DataType; +import org.elasticsearch.xpack.esql.expression.function.Example; +import org.elasticsearch.xpack.esql.expression.function.FunctionInfo; +import org.elasticsearch.xpack.esql.expression.function.TwoOptionalArguments; +import org.elasticsearch.xpack.esql.expression.function.Param; +import org.elasticsearch.xpack.esql.expression.function.scalar.EsqlScalarFunction; +import org.elasticsearch.xpack.esql.io.stream.PlanStreamInput; +import org.elasticsearch.xpack.inference.chunking.Chunker; +import org.elasticsearch.xpack.inference.chunking.ChunkerBuilder; +import org.elasticsearch.xpack.inference.chunking.SentenceBoundaryChunkingSettings; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; + +import static 
org.elasticsearch.compute.ann.Fixed.Scope.THREAD_LOCAL; +import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.ParamOrdinal.FIRST; +import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.isString; + +public class Chunk extends EsqlScalarFunction implements TwoOptionalArguments { + + public static final NamedWriteableRegistry.Entry ENTRY = new NamedWriteableRegistry.Entry(Expression.class, "Chunk", Chunk::new); + + private static final int DEFAULT_NUM_CHUNKS = 1; + private static final int DEFAULT_CHUNK_SIZE = 300; + + private final Expression field, numChunks, chunkSize; + + @FunctionInfo( + returnType = "keyword", + preview = true, + description = """ + Chunks the contents of a field.""", + examples = { @Example(file = "chunk", tag = "chunk-with-field", applies_to = "stack: preview 9.2.0") } + ) + public Chunk( + Source source, + @Param(name = "field", type = { "keyword", "text" }, description = "The input to chunk.") Expression field, + @Param( + optional = true, + name = "num_chunks", + type = { "integer" }, + description = "The number of chunks to return. Defaults to " + DEFAULT_NUM_CHUNKS + ) Expression numChunks, + @Param( + optional = true, + name = "chunk_size", + type = { "integer" }, + description = "The size of sentence-based chunks to use. 
Defaults to " + DEFAULT_CHUNK_SIZE + ) Expression chunkSize + ) { + super(source, fields(field, numChunks, chunkSize)); + this.field = field; + this.numChunks = numChunks; + this.chunkSize = chunkSize; + } + + public Chunk(StreamInput in) throws IOException { + this( + Source.readFrom((PlanStreamInput) in), + in.readNamedWriteable(Expression.class), + in.readOptionalNamedWriteable(Expression.class), + in.readOptionalNamedWriteable(Expression.class) + ); + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + source().writeTo(out); + out.writeNamedWriteable(field); + out.writeOptionalNamedWriteable(numChunks); + out.writeOptionalNamedWriteable(chunkSize); + } + + @Override + public String getWriteableName() { + return ENTRY.name; + } + + @Override + public DataType dataType() { + return field.dataType().noText(); + } + + @Override + protected TypeResolution resolveType() { + if (childrenResolved() == false) { + return new TypeResolution("Unresolved children"); + } + + return isString(field(), sourceText(), FIRST); + } + + @Override + public boolean foldable() { + return field().foldable() && (numChunks() == null || numChunks().foldable()) && (chunkSize() == null || chunkSize().foldable()); + } + + @Override + public Expression replaceChildren(List newChildren) { + return new Chunk( + source(), + newChildren.get(0), // field + numChunks == null ? null : newChildren.get(1), + chunkSize == null ? 
null : newChildren.get(2) + ); + } + + @Override + protected NodeInfo info() { + return NodeInfo.create(this, Chunk::new, field, numChunks, chunkSize); + } + + Expression field() { + return field; + } + + Expression numChunks() { + return numChunks; + } + + Expression chunkSize() { + return chunkSize; + } + + @Evaluator(extraName = "String", warnExceptions = IllegalArgumentException.class) + static BytesRef process( + @Fixed(includeInToString = false, scope = THREAD_LOCAL) BreakingBytesRefBuilder scratch, + BytesRef str, + int numChunks, + int chunkSize + ) { + String content = str.utf8ToString(); + + ChunkingSettings settings = new SentenceBoundaryChunkingSettings(chunkSize, 0); + Chunker chunker = ChunkerBuilder.fromChunkingStrategy(settings.getChunkingStrategy()); + + List chunks = chunker.chunk(content, settings) + .stream() + .map(offset -> content.substring(offset.start(), offset.end())) + .limit(numChunks) + .toList(); + + int totalBytes = chunks.stream().mapToInt(chunk -> chunk.getBytes().length).sum(); + scratch.grow(totalBytes); + scratch.clear(); + + for (String chunk : chunks) { + scratch.append(new BytesRef(chunk)); + } + + return scratch.bytesRefView(); + } + + @Override + public boolean equals(Object o) { + if (o == null || getClass() != o.getClass()) return false; + Chunk chunk = (Chunk) o; + return Objects.equals(field(), chunk.field()) + && Objects.equals(numChunks(), chunk.numChunks()) + && Objects.equals(chunkSize(), chunk.chunkSize()); + } + + @Override + public int hashCode() { + return Objects.hash(field(), numChunks(), chunkSize()); + } + + private static List fields(Expression field, Expression numChunks, Expression chunkSize) { + List list = new ArrayList<>(4); + list.add(field); + if (numChunks != null) { + list.add(numChunks); + } + if (chunkSize != null) { + list.add(chunkSize); + } + return list; + } + + @Override + public EvalOperator.ExpressionEvaluator.Factory toEvaluator(ToEvaluator toEvaluator) { + return new 
ChunkStringEvaluator.Factory( + source(), + context -> new BreakingBytesRefBuilder(context.breaker(), "chunk"), + toEvaluator.apply(field), + toEvaluator.apply(numChunks), + toEvaluator.apply(chunkSize) + ); + } +} From 6ae1cdcddef69f45401e7e2bde649c8103096fde Mon Sep 17 00:00:00 2001 From: Kathleen DeRusso Date: Mon, 8 Sep 2025 13:47:13 -0400 Subject: [PATCH 02/29] Refactor CHUNK function to support multiple values --- ...uator.java => ChunkBytesRefEvaluator.java} | 35 +++++-------- .../function/EsqlFunctionRegistry.java | 15 ++++-- .../function/scalar/string/Chunk.java | 49 +++++++++---------- 3 files changed, 48 insertions(+), 51 deletions(-) rename x-pack/plugin/esql/src/main/generated/org/elasticsearch/xpack/esql/expression/function/scalar/string/{ChunkStringEvaluator.java => ChunkBytesRefEvaluator.java} (79%) diff --git a/x-pack/plugin/esql/src/main/generated/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkStringEvaluator.java b/x-pack/plugin/esql/src/main/generated/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkBytesRefEvaluator.java similarity index 79% rename from x-pack/plugin/esql/src/main/generated/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkStringEvaluator.java rename to x-pack/plugin/esql/src/main/generated/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkBytesRefEvaluator.java index f4d91991f19ca..ce499b18f1c5d 100644 --- a/x-pack/plugin/esql/src/main/generated/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkStringEvaluator.java +++ b/x-pack/plugin/esql/src/main/generated/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkBytesRefEvaluator.java @@ -7,7 +7,6 @@ import java.lang.IllegalArgumentException; import java.lang.Override; import java.lang.String; -import java.util.function.Function; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.RamUsageEstimator; import org.elasticsearch.compute.data.Block; @@ -16,7 
+15,6 @@ import org.elasticsearch.compute.data.IntBlock; import org.elasticsearch.compute.data.IntVector; import org.elasticsearch.compute.data.Page; -import org.elasticsearch.compute.operator.BreakingBytesRefBuilder; import org.elasticsearch.compute.operator.DriverContext; import org.elasticsearch.compute.operator.EvalOperator; import org.elasticsearch.compute.operator.Warnings; @@ -27,13 +25,11 @@ * {@link EvalOperator.ExpressionEvaluator} implementation for {@link Chunk}. * This class is generated. Edit {@code EvaluatorImplementer} instead. */ -public final class ChunkStringEvaluator implements EvalOperator.ExpressionEvaluator { - private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(ChunkStringEvaluator.class); +public final class ChunkBytesRefEvaluator implements EvalOperator.ExpressionEvaluator { + private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(ChunkBytesRefEvaluator.class); private final Source source; - private final BreakingBytesRefBuilder scratch; - private final EvalOperator.ExpressionEvaluator str; private final EvalOperator.ExpressionEvaluator numChunks; @@ -44,11 +40,10 @@ public final class ChunkStringEvaluator implements EvalOperator.ExpressionEvalua private Warnings warnings; - public ChunkStringEvaluator(Source source, BreakingBytesRefBuilder scratch, - EvalOperator.ExpressionEvaluator str, EvalOperator.ExpressionEvaluator numChunks, - EvalOperator.ExpressionEvaluator chunkSize, DriverContext driverContext) { + public ChunkBytesRefEvaluator(Source source, EvalOperator.ExpressionEvaluator str, + EvalOperator.ExpressionEvaluator numChunks, EvalOperator.ExpressionEvaluator chunkSize, + DriverContext driverContext) { this.source = source; - this.scratch = scratch; this.str = str; this.numChunks = numChunks; this.chunkSize = chunkSize; @@ -126,7 +121,7 @@ public BytesRefBlock eval(int positionCount, BytesRefBlock strBlock, IntBlock nu continue position; } try { - 
result.appendBytesRef(Chunk.process(this.scratch, strBlock.getBytesRef(strBlock.getFirstValueIndex(p), strScratch), numChunksBlock.getInt(numChunksBlock.getFirstValueIndex(p)), chunkSizeBlock.getInt(chunkSizeBlock.getFirstValueIndex(p)))); + Chunk.process(result, strBlock.getBytesRef(strBlock.getFirstValueIndex(p), strScratch), numChunksBlock.getInt(numChunksBlock.getFirstValueIndex(p)), chunkSizeBlock.getInt(chunkSizeBlock.getFirstValueIndex(p))); } catch (IllegalArgumentException e) { warnings().registerException(e); result.appendNull(); @@ -142,7 +137,7 @@ public BytesRefBlock eval(int positionCount, BytesRefVector strVector, IntVector BytesRef strScratch = new BytesRef(); position: for (int p = 0; p < positionCount; p++) { try { - result.appendBytesRef(Chunk.process(this.scratch, strVector.getBytesRef(p, strScratch), numChunksVector.getInt(p), chunkSizeVector.getInt(p))); + Chunk.process(result, strVector.getBytesRef(p, strScratch), numChunksVector.getInt(p), chunkSizeVector.getInt(p)); } catch (IllegalArgumentException e) { warnings().registerException(e); result.appendNull(); @@ -154,12 +149,12 @@ public BytesRefBlock eval(int positionCount, BytesRefVector strVector, IntVector @Override public String toString() { - return "ChunkStringEvaluator[" + "str=" + str + ", numChunks=" + numChunks + ", chunkSize=" + chunkSize + "]"; + return "ChunkBytesRefEvaluator[" + "str=" + str + ", numChunks=" + numChunks + ", chunkSize=" + chunkSize + "]"; } @Override public void close() { - Releasables.closeExpectNoException(scratch, str, numChunks, chunkSize); + Releasables.closeExpectNoException(str, numChunks, chunkSize); } private Warnings warnings() { @@ -177,33 +172,29 @@ private Warnings warnings() { static class Factory implements EvalOperator.ExpressionEvaluator.Factory { private final Source source; - private final Function scratch; - private final EvalOperator.ExpressionEvaluator.Factory str; private final EvalOperator.ExpressionEvaluator.Factory numChunks; private 
final EvalOperator.ExpressionEvaluator.Factory chunkSize; - public Factory(Source source, Function scratch, - EvalOperator.ExpressionEvaluator.Factory str, + public Factory(Source source, EvalOperator.ExpressionEvaluator.Factory str, EvalOperator.ExpressionEvaluator.Factory numChunks, EvalOperator.ExpressionEvaluator.Factory chunkSize) { this.source = source; - this.scratch = scratch; this.str = str; this.numChunks = numChunks; this.chunkSize = chunkSize; } @Override - public ChunkStringEvaluator get(DriverContext context) { - return new ChunkStringEvaluator(source, scratch.apply(context), str.get(context), numChunks.get(context), chunkSize.get(context), context); + public ChunkBytesRefEvaluator get(DriverContext context) { + return new ChunkBytesRefEvaluator(source, str.get(context), numChunks.get(context), chunkSize.get(context), context); } @Override public String toString() { - return "ChunkStringEvaluator[" + "str=" + str + ", numChunks=" + numChunks + ", chunkSize=" + chunkSize + "]"; + return "ChunkBytesRefEvaluator[" + "str=" + str + ", numChunks=" + numChunks + ", chunkSize=" + chunkSize + "]"; } } } diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/EsqlFunctionRegistry.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/EsqlFunctionRegistry.java index bf8acc7d18e98..ae3df59236b1c 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/EsqlFunctionRegistry.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/EsqlFunctionRegistry.java @@ -1005,13 +1005,22 @@ public interface BinaryBuilder { @SuppressWarnings("overloads") // These are ambiguous if you aren't using ctor references but we always do protected static FunctionDefinition def(Class function, TernaryBuilder ctorRef, String... 
names) { FunctionBuilder builder = (source, children, cfg) -> { + boolean hasMinimumOne = TwoOptionalArguments.class.isAssignableFrom(function); boolean hasMinimumTwo = OptionalArgument.class.isAssignableFrom(function); - if (hasMinimumTwo && (children.size() > 3 || children.size() < 2)) { + if (hasMinimumOne && (children.size() > 3 || children.isEmpty())) { + throw new QlIllegalArgumentException("expects minimum one, maximum three arguments"); + } else if (hasMinimumTwo && (children.size() > 3 || children.size() < 2)) { throw new QlIllegalArgumentException("expects two or three arguments"); - } else if (hasMinimumTwo == false && children.size() != 3) { + } else if (hasMinimumOne == false && hasMinimumTwo == false && children.size() != 3) { throw new QlIllegalArgumentException("expects exactly three arguments"); } - return ctorRef.build(source, children.get(0), children.get(1), children.size() == 3 ? children.get(2) : null); + + return ctorRef.build( + source, + children.get(0), + children.size() > 1 ? children.get(1) : null, + children.size() == 3 ? 
children.get(2) : null + ); }; return def(function, builder, names); } diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java index 825f9fd1385b4..094dfcf0e7fa4 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java @@ -12,11 +12,11 @@ import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamOutput; import org.elasticsearch.compute.ann.Evaluator; -import org.elasticsearch.compute.ann.Fixed; -import org.elasticsearch.compute.operator.BreakingBytesRefBuilder; +import org.elasticsearch.compute.data.BytesRefBlock; import org.elasticsearch.compute.operator.EvalOperator; import org.elasticsearch.inference.ChunkingSettings; import org.elasticsearch.xpack.esql.core.expression.Expression; +import org.elasticsearch.xpack.esql.core.expression.Literal; import org.elasticsearch.xpack.esql.core.tree.NodeInfo; import org.elasticsearch.xpack.esql.core.tree.Source; import org.elasticsearch.xpack.esql.core.type.DataType; @@ -35,7 +35,6 @@ import java.util.List; import java.util.Objects; -import static org.elasticsearch.compute.ann.Fixed.Scope.THREAD_LOCAL; import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.ParamOrdinal.FIRST; import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.isString; @@ -145,41 +144,38 @@ Expression chunkSize() { return chunkSize; } - @Evaluator(extraName = "String", warnExceptions = IllegalArgumentException.class) - static BytesRef process( - @Fixed(includeInToString = false, scope = THREAD_LOCAL) BreakingBytesRefBuilder scratch, - BytesRef str, - int numChunks, - int chunkSize - ) { + @Evaluator(extraName = "BytesRef", 
warnExceptions = IllegalArgumentException.class) + static void process(BytesRefBlock.Builder builder, BytesRef str, int numChunks, int chunkSize) { String content = str.utf8ToString(); - + ChunkingSettings settings = new SentenceBoundaryChunkingSettings(chunkSize, 0); Chunker chunker = ChunkerBuilder.fromChunkingStrategy(settings.getChunkingStrategy()); - + List chunks = chunker.chunk(content, settings) .stream() .map(offset -> content.substring(offset.start(), offset.end())) .limit(numChunks) .toList(); - - int totalBytes = chunks.stream().mapToInt(chunk -> chunk.getBytes().length).sum(); - scratch.grow(totalBytes); - scratch.clear(); - + + boolean multivalued = chunks.size() > 1; + if (multivalued) { + builder.beginPositionEntry(); + } for (String chunk : chunks) { - scratch.append(new BytesRef(chunk)); + builder.appendBytesRef(new BytesRef(chunk)); + } + + if (multivalued) { + builder.endPositionEntry(); } - - return scratch.bytesRefView(); } @Override public boolean equals(Object o) { if (o == null || getClass() != o.getClass()) return false; Chunk chunk = (Chunk) o; - return Objects.equals(field(), chunk.field()) - && Objects.equals(numChunks(), chunk.numChunks()) + return Objects.equals(field(), chunk.field()) + && Objects.equals(numChunks(), chunk.numChunks()) && Objects.equals(chunkSize(), chunk.chunkSize()); } @@ -202,12 +198,13 @@ private static List fields(Expression field, Expression numChunks, E @Override public EvalOperator.ExpressionEvaluator.Factory toEvaluator(ToEvaluator toEvaluator) { - return new ChunkStringEvaluator.Factory( + return new ChunkBytesRefEvaluator.Factory( source(), - context -> new BreakingBytesRefBuilder(context.breaker(), "chunk"), toEvaluator.apply(field), - toEvaluator.apply(numChunks), - toEvaluator.apply(chunkSize) + numChunks != null ? toEvaluator.apply(numChunks) + : toEvaluator.apply(new Literal(source(), DEFAULT_NUM_CHUNKS, DataType.INTEGER)), + chunkSize != null ? 
toEvaluator.apply(chunkSize) + : toEvaluator.apply(new Literal(source(), DEFAULT_CHUNK_SIZE, DataType.INTEGER)) ); } } From 1f4342c95eef530d5efe393257745aa3e86e4f68 Mon Sep 17 00:00:00 2001 From: Kathleen DeRusso Date: Mon, 8 Sep 2025 14:32:56 -0400 Subject: [PATCH 03/29] Default to returning all chunks --- .../expression/function/scalar/string/Chunk.java | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java index 094dfcf0e7fa4..2bd91d529e8db 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java @@ -42,7 +42,7 @@ public class Chunk extends EsqlScalarFunction implements TwoOptionalArguments { public static final NamedWriteableRegistry.Entry ENTRY = new NamedWriteableRegistry.Entry(Expression.class, "Chunk", Chunk::new); - private static final int DEFAULT_NUM_CHUNKS = 1; + private static final int DEFAULT_NUM_CHUNKS = -1; private static final int DEFAULT_CHUNK_SIZE = 300; private final Expression field, numChunks, chunkSize; @@ -61,7 +61,7 @@ public Chunk( optional = true, name = "num_chunks", type = { "integer" }, - description = "The number of chunks to return. Defaults to " + DEFAULT_NUM_CHUNKS + description = "The number of chunks to return. Defaults to return all chunks." ) Expression numChunks, @Param( optional = true, @@ -154,7 +154,7 @@ static void process(BytesRefBlock.Builder builder, BytesRef str, int numChunks, List chunks = chunker.chunk(content, settings) .stream() .map(offset -> content.substring(offset.start(), offset.end())) - .limit(numChunks) + .limit(numChunks > 0 ? 
numChunks : Long.MAX_VALUE) .toList(); boolean multivalued = chunks.size() > 1; @@ -201,9 +201,11 @@ public EvalOperator.ExpressionEvaluator.Factory toEvaluator(ToEvaluator toEvalua return new ChunkBytesRefEvaluator.Factory( source(), toEvaluator.apply(field), - numChunks != null ? toEvaluator.apply(numChunks) + numChunks != null + ? toEvaluator.apply(numChunks) : toEvaluator.apply(new Literal(source(), DEFAULT_NUM_CHUNKS, DataType.INTEGER)), - chunkSize != null ? toEvaluator.apply(chunkSize) + chunkSize != null + ? toEvaluator.apply(chunkSize) : toEvaluator.apply(new Literal(source(), DEFAULT_CHUNK_SIZE, DataType.INTEGER)) ); } From 528c12c41a05778c494832825a62f6f5817e73b0 Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Mon, 8 Sep 2025 18:44:26 +0000 Subject: [PATCH 04/29] [CI] Auto commit changes from spotless --- x-pack/plugin/esql/compute/src/main/java/module-info.java | 2 +- .../xpack/esql/expression/function/scalar/string/Chunk.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/x-pack/plugin/esql/compute/src/main/java/module-info.java b/x-pack/plugin/esql/compute/src/main/java/module-info.java index 6a2d2af9b7e6f..557c4ed35b05a 100644 --- a/x-pack/plugin/esql/compute/src/main/java/module-info.java +++ b/x-pack/plugin/esql/compute/src/main/java/module-info.java @@ -20,7 +20,7 @@ requires org.elasticsearch.tdigest; requires org.elasticsearch.geo; requires org.elasticsearch.xcore; - requires hppc; + requires hppc; requires org.elasticsearch.inference; exports org.elasticsearch.compute; diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java index 2bd91d529e8db..1d0c1c87171d0 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java +++ 
b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java @@ -22,8 +22,8 @@ import org.elasticsearch.xpack.esql.core.type.DataType; import org.elasticsearch.xpack.esql.expression.function.Example; import org.elasticsearch.xpack.esql.expression.function.FunctionInfo; -import org.elasticsearch.xpack.esql.expression.function.TwoOptionalArguments; import org.elasticsearch.xpack.esql.expression.function.Param; +import org.elasticsearch.xpack.esql.expression.function.TwoOptionalArguments; import org.elasticsearch.xpack.esql.expression.function.scalar.EsqlScalarFunction; import org.elasticsearch.xpack.esql.io.stream.PlanStreamInput; import org.elasticsearch.xpack.inference.chunking.Chunker; From 04307f271fe7cf80a19a2b89e05dd3bfd0bd8bfc Mon Sep 17 00:00:00 2001 From: Kathleen DeRusso Date: Thu, 11 Sep 2025 13:43:05 -0400 Subject: [PATCH 05/29] Handle warnings --- .../esql/qa/testFixtures/src/main/resources/chunk.csv-spec | 2 ++ 1 file changed, 2 insertions(+) diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/chunk.csv-spec b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/chunk.csv-spec index d5cf5064e8e5a..e172735b927ce 100644 --- a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/chunk.csv-spec +++ b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/chunk.csv-spec @@ -13,6 +13,7 @@ FROM books | SORT book_no | LIMIT 5 ; +warningRegex:java.lang.IllegalArgumentException: single-value function encountered multi-value // tag::chunk-with-field-result[] book_no:keyword | title:text | chunks:keyword @@ -33,6 +34,7 @@ FROM books | EVAL chunks = CHUNK(description, 1, 20) | KEEP book_no, title, chunks; ignoreOrder:true +warningRegex:java.lang.IllegalArgumentException: single-value function encountered multi-value book_no:keyword | title:text | chunks:keyword 2714 | Return of the King Being the Third Part of The Lord of the Rings | null From 66a13bb658eba8f77bbe7fd31fc1be89ff4f5ff8 Mon Sep 17 00:00:00 
2001 From: Kathleen DeRusso Date: Wed, 17 Sep 2025 15:18:26 -0400 Subject: [PATCH 06/29] Loosen export restrictions to try to get compile error working --- server/src/main/java/module-info.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/src/main/java/module-info.java b/server/src/main/java/module-info.java index 549c603b13980..7d94ae88d520b 100644 --- a/server/src/main/java/module-info.java +++ b/server/src/main/java/module-info.java @@ -481,7 +481,7 @@ exports org.elasticsearch.inference.configuration; exports org.elasticsearch.inference.validation; exports org.elasticsearch.monitor.metrics; - exports org.elasticsearch.plugins.internal.rewriter to org.elasticsearch.inference; + exports org.elasticsearch.plugins.internal.rewriter; exports org.elasticsearch.lucene.util.automaton; exports org.elasticsearch.index.codec.perfield; exports org.elasticsearch.index.codec.vectors to org.elasticsearch.test.knn; From 693ea01ce6110f641a0565536c76c7c94337c693 Mon Sep 17 00:00:00 2001 From: Kathleen DeRusso Date: Mon, 13 Oct 2025 14:51:53 -0400 Subject: [PATCH 07/29] Remove inference dependencies --- x-pack/plugin/esql/compute/build.gradle | 2 -- x-pack/plugin/esql/compute/src/main/java/module-info.java | 1 - 2 files changed, 3 deletions(-) diff --git a/x-pack/plugin/esql/compute/build.gradle b/x-pack/plugin/esql/compute/build.gradle index c3268dc57865b..bd4bb33873be5 100644 --- a/x-pack/plugin/esql/compute/build.gradle +++ b/x-pack/plugin/esql/compute/build.gradle @@ -16,7 +16,6 @@ dependencies { compileOnly project(xpackModule('ml')) annotationProcessor project('gen') implementation 'com.carrotsearch:hppc:0.8.1' - implementation project(xpackModule('inference')) testImplementation(project(':modules:analysis-common')) testImplementation(project(':test:framework')) @@ -31,7 +30,6 @@ dependencies { } testImplementation(project(xpackModule('core'))) testImplementation(project(xpackModule('ml'))) - testImplementation(project(xpackModule('inference'))) 
} def projectDirectory = project.layout.projectDirectory diff --git a/x-pack/plugin/esql/compute/src/main/java/module-info.java b/x-pack/plugin/esql/compute/src/main/java/module-info.java index 059392bf3c19a..eef946614ae42 100644 --- a/x-pack/plugin/esql/compute/src/main/java/module-info.java +++ b/x-pack/plugin/esql/compute/src/main/java/module-info.java @@ -21,7 +21,6 @@ requires org.elasticsearch.geo; requires org.elasticsearch.xcore; requires hppc; - requires org.elasticsearch.inference; exports org.elasticsearch.compute; exports org.elasticsearch.compute.aggregation; From fde03685a463f8caea848c6926246211c27c1289 Mon Sep 17 00:00:00 2001 From: Kathleen DeRusso Date: Mon, 13 Oct 2025 14:54:01 -0400 Subject: [PATCH 08/29] Fix compilation errors --- .../scalar/string/ChunkBytesRefEvaluator.java | 70 ++++++++++--------- .../function/scalar/string/Chunk.java | 2 +- 2 files changed, 39 insertions(+), 33 deletions(-) diff --git a/x-pack/plugin/esql/src/main/generated/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkBytesRefEvaluator.java b/x-pack/plugin/esql/src/main/generated/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkBytesRefEvaluator.java index ce499b18f1c5d..6cf9942393e90 100644 --- a/x-pack/plugin/esql/src/main/generated/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkBytesRefEvaluator.java +++ b/x-pack/plugin/esql/src/main/generated/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkBytesRefEvaluator.java @@ -87,41 +87,44 @@ public BytesRefBlock eval(int positionCount, BytesRefBlock strBlock, IntBlock nu try(BytesRefBlock.Builder result = driverContext.blockFactory().newBytesRefBlockBuilder(positionCount)) { BytesRef strScratch = new BytesRef(); position: for (int p = 0; p < positionCount; p++) { - if (strBlock.isNull(p)) { - result.appendNull(); - continue position; - } - if (strBlock.getValueCount(p) != 1) { - if (strBlock.getValueCount(p) > 1) { - warnings().registerException(new 
IllegalArgumentException("single-value function encountered multi-value")); - } - result.appendNull(); - continue position; - } - if (numChunksBlock.isNull(p)) { - result.appendNull(); - continue position; + switch (strBlock.getValueCount(p)) { + case 0: + result.appendNull(); + continue position; + case 1: + break; + default: + warnings().registerException(new IllegalArgumentException("single-value function encountered multi-value")); + result.appendNull(); + continue position; } - if (numChunksBlock.getValueCount(p) != 1) { - if (numChunksBlock.getValueCount(p) > 1) { - warnings().registerException(new IllegalArgumentException("single-value function encountered multi-value")); - } - result.appendNull(); - continue position; - } - if (chunkSizeBlock.isNull(p)) { - result.appendNull(); - continue position; + switch (numChunksBlock.getValueCount(p)) { + case 0: + result.appendNull(); + continue position; + case 1: + break; + default: + warnings().registerException(new IllegalArgumentException("single-value function encountered multi-value")); + result.appendNull(); + continue position; } - if (chunkSizeBlock.getValueCount(p) != 1) { - if (chunkSizeBlock.getValueCount(p) > 1) { - warnings().registerException(new IllegalArgumentException("single-value function encountered multi-value")); - } - result.appendNull(); - continue position; + switch (chunkSizeBlock.getValueCount(p)) { + case 0: + result.appendNull(); + continue position; + case 1: + break; + default: + warnings().registerException(new IllegalArgumentException("single-value function encountered multi-value")); + result.appendNull(); + continue position; } + BytesRef str = strBlock.getBytesRef(strBlock.getFirstValueIndex(p), strScratch); + int numChunks = numChunksBlock.getInt(numChunksBlock.getFirstValueIndex(p)); + int chunkSize = chunkSizeBlock.getInt(chunkSizeBlock.getFirstValueIndex(p)); try { - Chunk.process(result, strBlock.getBytesRef(strBlock.getFirstValueIndex(p), strScratch), 
numChunksBlock.getInt(numChunksBlock.getFirstValueIndex(p)), chunkSizeBlock.getInt(chunkSizeBlock.getFirstValueIndex(p))); + Chunk.process(result, str, numChunks, chunkSize); } catch (IllegalArgumentException e) { warnings().registerException(e); result.appendNull(); @@ -136,8 +139,11 @@ public BytesRefBlock eval(int positionCount, BytesRefVector strVector, IntVector try(BytesRefBlock.Builder result = driverContext.blockFactory().newBytesRefBlockBuilder(positionCount)) { BytesRef strScratch = new BytesRef(); position: for (int p = 0; p < positionCount; p++) { + BytesRef str = strVector.getBytesRef(p, strScratch); + int numChunks = numChunksVector.getInt(p); + int chunkSize = chunkSizeVector.getInt(p); try { - Chunk.process(result, strVector.getBytesRef(p, strScratch), numChunksVector.getInt(p), chunkSizeVector.getInt(p)); + Chunk.process(result, str, numChunks, chunkSize); } catch (IllegalArgumentException e) { warnings().registerException(e); result.appendNull(); diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java index 1d0c1c87171d0..318c66ae63141 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java @@ -15,6 +15,7 @@ import org.elasticsearch.compute.data.BytesRefBlock; import org.elasticsearch.compute.operator.EvalOperator; import org.elasticsearch.inference.ChunkingSettings; +import org.elasticsearch.xpack.core.inference.chunking.SentenceBoundaryChunkingSettings; import org.elasticsearch.xpack.esql.core.expression.Expression; import org.elasticsearch.xpack.esql.core.expression.Literal; import org.elasticsearch.xpack.esql.core.tree.NodeInfo; @@ -28,7 +29,6 @@ import 
org.elasticsearch.xpack.esql.io.stream.PlanStreamInput; import org.elasticsearch.xpack.inference.chunking.Chunker; import org.elasticsearch.xpack.inference.chunking.ChunkerBuilder; -import org.elasticsearch.xpack.inference.chunking.SentenceBoundaryChunkingSettings; import java.io.IOException; import java.util.ArrayList; From a70d5b1807ad1330e88f65ba297624ae91da96a8 Mon Sep 17 00:00:00 2001 From: Kathleen DeRusso Date: Mon, 13 Oct 2025 15:11:16 -0400 Subject: [PATCH 09/29] Remove more inference deps --- x-pack/plugin/esql/build.gradle | 2 -- 1 file changed, 2 deletions(-) diff --git a/x-pack/plugin/esql/build.gradle b/x-pack/plugin/esql/build.gradle index 5a21960290933..734c0b62eb729 100644 --- a/x-pack/plugin/esql/build.gradle +++ b/x-pack/plugin/esql/build.gradle @@ -45,7 +45,6 @@ dependencies { api "org.apache.lucene:lucene-spatial3d:${versions.lucene}" api project(":libs:h3") implementation project('arrow') - implementation project(xpackModule('inference')) // Also contains a dummy processor to allow compilation with unused annotations. 
annotationProcessor project('compute:gen') @@ -59,7 +58,6 @@ dependencies { testImplementation project(path: xpackModule('spatial')) testImplementation project(path: xpackModule('kql')) testImplementation project(path: xpackModule('mapper-unsigned-long')) - testImplementation project(path: xpackModule('inference')) testImplementation project(path: ':modules:reindex') testImplementation project(path: ':modules:parent-join') From de9ddae5aaf31fa51cc1dca89f9e94af74d96779 Mon Sep 17 00:00:00 2001 From: Kathleen DeRusso Date: Tue, 21 Oct 2025 13:44:09 -0400 Subject: [PATCH 10/29] Fix compile errors from merge --- .../xpack/esql/expression/function/scalar/string/Chunk.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java index 318c66ae63141..164fd4b3a9eb4 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java @@ -15,6 +15,8 @@ import org.elasticsearch.compute.data.BytesRefBlock; import org.elasticsearch.compute.operator.EvalOperator; import org.elasticsearch.inference.ChunkingSettings; +import org.elasticsearch.xpack.core.inference.chunking.Chunker; +import org.elasticsearch.xpack.core.inference.chunking.ChunkerBuilder; import org.elasticsearch.xpack.core.inference.chunking.SentenceBoundaryChunkingSettings; import org.elasticsearch.xpack.esql.core.expression.Expression; import org.elasticsearch.xpack.esql.core.expression.Literal; @@ -27,8 +29,6 @@ import org.elasticsearch.xpack.esql.expression.function.TwoOptionalArguments; import org.elasticsearch.xpack.esql.expression.function.scalar.EsqlScalarFunction; import 
org.elasticsearch.xpack.esql.io.stream.PlanStreamInput; -import org.elasticsearch.xpack.inference.chunking.Chunker; -import org.elasticsearch.xpack.inference.chunking.ChunkerBuilder; import java.io.IOException; import java.util.ArrayList; From d302fdd26f1ad3bd27df8698b9b18e26947d1133 Mon Sep 17 00:00:00 2001 From: Kathleen DeRusso Date: Tue, 21 Oct 2025 14:02:50 -0400 Subject: [PATCH 11/29] Fix existing tests --- .../src/main/resources/chunk.csv-spec | 16 ++++++++-------- .../expression/function/scalar/string/Chunk.java | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/chunk.csv-spec b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/chunk.csv-spec index e172735b927ce..3bf1367f17bc0 100644 --- a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/chunk.csv-spec +++ b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/chunk.csv-spec @@ -17,11 +17,11 @@ warningRegex:java.lang.IllegalArgumentException: single-value function encounter // tag::chunk-with-field-result[] book_no:keyword | title:text | chunks:keyword -1211 | The brothers Karamazov | null -1463 | Realms of Tolkien: Images of Middle-earth | null -1502 | Selected Passages from Correspondence with Friends | null -1937 | The Best Short Stories of Dostoevsky (Modern Library) | null -1985 | Brothers Karamazov | null +1211 | The brothers Karamazov | In 1880 Dostoevsky completed The Brothers Karamazov, the literary effort for which he had been preparing all his life. +1463 | Realms of Tolkien: Images of Middle-earth | Twenty new and familiar Tolkien artists are represented in this fabulous volume, breathing an extraordinary variety of life into 58 +1502 | Selected Passages from Correspondence with Friends | Nikolai Gogol wrote some letters to his friends, none of which were a nose of high rank. 
+1937 | The Best Short Stories of Dostoevsky (Modern Library) | This collection, unique to the Modern Library, gathers seven of Dostoevsky's key works and shows him to be equally adept +1985 | Brothers Karamazov | Four brothers reunite in their hometown in Russia. // end::chunk-with-field-result[] ; @@ -36,9 +36,9 @@ FROM books ignoreOrder:true warningRegex:java.lang.IllegalArgumentException: single-value function encountered multi-value -book_no:keyword | title:text | chunks:keyword -2714 | Return of the King Being the Third Part of The Lord of the Rings | null -7350 | Return of the Shadow | null +book_no:keyword | title:text | chunks:keyword +2714 | Return of the King Being the Third Part of The Lord of the Rings | Concluding the story begun in The Hobbit, this is the final part of Tolkien s epic masterpiece, The Lord of +7350 | Return of the Shadow | In this sixth volume of The History of Middle-earth the story reaches The Lord of the Rings. ; diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java index 164fd4b3a9eb4..1bdce7b07e291 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java @@ -162,7 +162,7 @@ static void process(BytesRefBlock.Builder builder, BytesRef str, int numChunks, builder.beginPositionEntry(); } for (String chunk : chunks) { - builder.appendBytesRef(new BytesRef(chunk)); + builder.appendBytesRef(new BytesRef(chunk.trim())); } if (multivalued) { From ec456c6118d80028e86169e6ac36666dce24f55b Mon Sep 17 00:00:00 2001 From: Kathleen DeRusso Date: Tue, 21 Oct 2025 14:21:29 -0400 Subject: [PATCH 12/29] Exclude from CSV tests --- .../src/test/java/org/elasticsearch/xpack/esql/CsvTests.java | 4 ++++ 1 
file changed, 4 insertions(+) diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/CsvTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/CsvTests.java index f90547f57c0ff..f190a82d56fd9 100644 --- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/CsvTests.java +++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/CsvTests.java @@ -344,6 +344,10 @@ public final void test() throws Throwable { "CSV tests cannot currently handle multi_match function that depends on Lucene", testCase.requiredCapabilities.contains(EsqlCapabilities.Cap.MULTI_MATCH_FUNCTION.capabilityName()) ); + assumeFalse( + "CSV tests cannot currently handle CHUNK function", + testCase.requiredCapabilities.contains(EsqlCapabilities.Cap.CHUNK_FUNCTION.capabilityName()) + ); if (Build.current().isSnapshot()) { assertThat( From abeb7258033a7475861a93279d5ce87187764416 Mon Sep 17 00:00:00 2001 From: Kathleen DeRusso Date: Tue, 21 Oct 2025 16:04:48 -0400 Subject: [PATCH 13/29] Add more tests --- .../_snippets/functions/description/chunk.md | 10 + .../_snippets/functions/examples/chunk.md | 22 +++ .../esql/_snippets/functions/layout/chunk.md | 23 +++ .../_snippets/functions/parameters/chunk.md | 13 ++ .../esql/_snippets/functions/types/chunk.md | 9 + .../esql/images/functions/chunk.svg | 1 + .../kibana/definition/functions/chunk.json | 61 ++++++ .../esql/kibana/docs/functions/chunk.md | 9 + .../src/main/resources/chunk.csv-spec | 55 +++++- .../scalar/string/ChunkBytesRefEvaluator.java | 14 +- .../function/scalar/string/Chunk.java | 42 ++-- .../AbstractScalarFunctionTestCase.java | 3 + .../function/scalar/string/ChunkTests.java | 182 ++++++++++++++++++ 13 files changed, 417 insertions(+), 27 deletions(-) create mode 100644 docs/reference/query-languages/esql/_snippets/functions/description/chunk.md create mode 100644 docs/reference/query-languages/esql/_snippets/functions/examples/chunk.md create mode 100644 
docs/reference/query-languages/esql/_snippets/functions/layout/chunk.md create mode 100644 docs/reference/query-languages/esql/_snippets/functions/parameters/chunk.md create mode 100644 docs/reference/query-languages/esql/_snippets/functions/types/chunk.md create mode 100644 docs/reference/query-languages/esql/images/functions/chunk.svg create mode 100644 docs/reference/query-languages/esql/kibana/definition/functions/chunk.json create mode 100644 docs/reference/query-languages/esql/kibana/docs/functions/chunk.md create mode 100644 x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkTests.java diff --git a/docs/reference/query-languages/esql/_snippets/functions/description/chunk.md b/docs/reference/query-languages/esql/_snippets/functions/description/chunk.md new file mode 100644 index 0000000000000..aceb2978d97f3 --- /dev/null +++ b/docs/reference/query-languages/esql/_snippets/functions/description/chunk.md @@ -0,0 +1,10 @@ +% This is generated by ESQL's AbstractFunctionTestCase. Do not edit it. See ../README.md for how to regenerate it. + +**Description** + +Use `CHUNK` to split a text field into smaller chunks. + +Chunk can be used on fields from the text famiy like [text](/reference/elasticsearch/mapping-reference/text.md) and [semantic_text](/reference/elasticsearch/mapping-reference/semantic-text.md). + Chunk will split a text field into smaller chunks, using a sentence-based chunking strategy. + The number of chunks returned, and the length of the sentences used to create the chunks can be specified. + diff --git a/docs/reference/query-languages/esql/_snippets/functions/examples/chunk.md b/docs/reference/query-languages/esql/_snippets/functions/examples/chunk.md new file mode 100644 index 0000000000000..2cc836bcd7178 --- /dev/null +++ b/docs/reference/query-languages/esql/_snippets/functions/examples/chunk.md @@ -0,0 +1,22 @@ +% This is generated by ESQL's AbstractFunctionTestCase. Do not edit it. 
See ../README.md for how to regenerate it. + +**Example** + +```{applies_to} +stack: preview 9.3.0 +``` + +```esql +FROM books +| EVAL chunks = CHUNK(description, 1, 20) +``` + +| book_no:keyword | title:text | chunks:keyword | +| --- | --- | --- | +| 1211 | The brothers Karamazov | In 1880 Dostoevsky completed The Brothers Karamazov, the literary effort for which he had been preparing all his life. | +| 1463 | Realms of Tolkien: Images of Middle-earth | Twenty new and familiar Tolkien artists are represented in this fabulous volume, breathing an extraordinary variety of life into 58 | +| 1502 | Selected Passages from Correspondence with Friends | Nikolai Gogol wrote some letters to his friends, none of which were a nose of high rank. | +| 1937 | The Best Short Stories of Dostoevsky (Modern Library) | This collection, unique to the Modern Library, gathers seven of Dostoevsky's key works and shows him to be equally adept | +| 1985 | Brothers Karamazov | Four brothers reunite in their hometown in Russia. | + + diff --git a/docs/reference/query-languages/esql/_snippets/functions/layout/chunk.md b/docs/reference/query-languages/esql/_snippets/functions/layout/chunk.md new file mode 100644 index 0000000000000..9ab96985aa35a --- /dev/null +++ b/docs/reference/query-languages/esql/_snippets/functions/layout/chunk.md @@ -0,0 +1,23 @@ +% This is generated by ESQL's AbstractFunctionTestCase. Do not edit it. See ../README.md for how to regenerate it. 
+ +## `CHUNK` [esql-chunk] + +**Syntax** + +:::{image} ../../../images/functions/chunk.svg +:alt: Embedded +:class: text-center +::: + + +:::{include} ../parameters/chunk.md +::: + +:::{include} ../description/chunk.md +::: + +:::{include} ../types/chunk.md +::: + +:::{include} ../examples/chunk.md +::: diff --git a/docs/reference/query-languages/esql/_snippets/functions/parameters/chunk.md b/docs/reference/query-languages/esql/_snippets/functions/parameters/chunk.md new file mode 100644 index 0000000000000..755ab69ce2224 --- /dev/null +++ b/docs/reference/query-languages/esql/_snippets/functions/parameters/chunk.md @@ -0,0 +1,13 @@ +% This is generated by ESQL's AbstractFunctionTestCase. Do not edit it. See ../README.md for how to regenerate it. + +**Parameters** + +`field` +: The input to chunk. + +`num_chunks` +: The number of chunks to return. Defaults to return all chunks. + +`chunk_size` +: The size of sentence-based chunks to use. Defaults to 300 + diff --git a/docs/reference/query-languages/esql/_snippets/functions/types/chunk.md b/docs/reference/query-languages/esql/_snippets/functions/types/chunk.md new file mode 100644 index 0000000000000..cea4a8217c3b0 --- /dev/null +++ b/docs/reference/query-languages/esql/_snippets/functions/types/chunk.md @@ -0,0 +1,9 @@ +% This is generated by ESQL's AbstractFunctionTestCase. Do not edit it. See ../README.md for how to regenerate it. 
+ +**Supported types** + +| field | num_chunks | chunk_size | result | +| --- | --- | --- | --- | +| keyword | integer | integer | keyword | +| text | integer | integer | keyword | + diff --git a/docs/reference/query-languages/esql/images/functions/chunk.svg b/docs/reference/query-languages/esql/images/functions/chunk.svg new file mode 100644 index 0000000000000..b38490207b556 --- /dev/null +++ b/docs/reference/query-languages/esql/images/functions/chunk.svg @@ -0,0 +1 @@ +CHUNK(field,num_chunks,chunk_size) \ No newline at end of file diff --git a/docs/reference/query-languages/esql/kibana/definition/functions/chunk.json b/docs/reference/query-languages/esql/kibana/definition/functions/chunk.json new file mode 100644 index 0000000000000..3b8b98cda40d9 --- /dev/null +++ b/docs/reference/query-languages/esql/kibana/definition/functions/chunk.json @@ -0,0 +1,61 @@ +{ + "comment" : "This is generated by ESQL's AbstractFunctionTestCase. Do not edit it. See ../README.md for how to regenerate it.", + "type" : "scalar", + "name" : "chunk", + "description" : "Use `CHUNK` to split a text field into smaller chunks.", + "signatures" : [ + { + "params" : [ + { + "name" : "field", + "type" : "keyword", + "optional" : false, + "description" : "The input to chunk." + }, + { + "name" : "num_chunks", + "type" : "integer", + "optional" : true, + "description" : "The number of chunks to return. Defaults to return all chunks." + }, + { + "name" : "chunk_size", + "type" : "integer", + "optional" : true, + "description" : "The size of sentence-based chunks to use. Defaults to 300" + } + ], + "variadic" : false, + "returnType" : "keyword" + }, + { + "params" : [ + { + "name" : "field", + "type" : "text", + "optional" : false, + "description" : "The input to chunk." + }, + { + "name" : "num_chunks", + "type" : "integer", + "optional" : true, + "description" : "The number of chunks to return. Defaults to return all chunks." 
+ }, + { + "name" : "chunk_size", + "type" : "integer", + "optional" : true, + "description" : "The size of sentence-based chunks to use. Defaults to 300" + } + ], + "variadic" : false, + "returnType" : "keyword" + } + ], + "examples" : [ + "FROM books\n| EVAL chunks = CHUNK(description, 1, 20)" + ], + "preview" : true, + "snapshot_only" : true +} diff --git a/docs/reference/query-languages/esql/kibana/docs/functions/chunk.md b/docs/reference/query-languages/esql/kibana/docs/functions/chunk.md new file mode 100644 index 0000000000000..2ecd49ef0b487 --- /dev/null +++ b/docs/reference/query-languages/esql/kibana/docs/functions/chunk.md @@ -0,0 +1,9 @@ +% This is generated by ESQL's AbstractFunctionTestCase. Do not edit it. See ../README.md for how to regenerate it. + +### CHUNK +Use `CHUNK` to split a text field into smaller chunks. + +```esql +FROM books +| EVAL chunks = CHUNK(description, 1, 20) +``` diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/chunk.csv-spec b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/chunk.csv-spec index 3bf1367f17bc0..238ea7e99f122 100644 --- a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/chunk.csv-spec +++ b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/chunk.csv-spec @@ -13,7 +13,6 @@ FROM books | SORT book_no | LIMIT 5 ; -warningRegex:java.lang.IllegalArgumentException: single-value function encountered multi-value // tag::chunk-with-field-result[] book_no:keyword | title:text | chunks:keyword @@ -25,6 +24,23 @@ book_no:keyword | title:text | chunks // end::chunk-with-field-result[] ; +chunkDefaults +required_capability: chunk_function + +FROM books +| EVAL chunks = CHUNK(description) +| KEEP book_no, title, chunks +| SORT book_no +| LIMIT 5 +; + +book_no:keyword | title:text | chunks:keyword +1211 | The brothers Karamazov | In 1880 Dostoevsky completed The Brothers Karamazov, the literary effort for which he had been preparing all his life. 
Compelling, profound, complex, it is the story of a patricide and of the four sons who each had a motive for murder: Dmitry, the sensualist, Ivan, the intellectual, Alyosha, the mystic, and twisted, cunning Smerdyakov, the bastard child. Frequently lurid, nightmarish, always brilliant, the novel plunges the reader into a sordid love triangle, a pathological obsession, and a gripping courtroom drama. But throughout the whole, Dostoevsky searhes for the truth--about man, about life, about the existence of God. A terrifying answer to man's eternal questions, this monumental work remains the crowning achievement of perhaps the finest novelist of all time. From the Paperback edition. +1463 | Realms of Tolkien: Images of Middle-earth | Twenty new and familiar Tolkien artists are represented in this fabulous volume, breathing an extraordinary variety of life into 58 different scenes, each of which is accompanied by appropriate passage from The Hobbit and The Lord of the Rings and The Silmarillion +1502 | Selected Passages from Correspondence with Friends | Nikolai Gogol wrote some letters to his friends, none of which were a nose of high rank. Many are reproduced here (the letters, not noses). +1937 | The Best Short Stories of Dostoevsky (Modern Library) | This collection, unique to the Modern Library, gathers seven of Dostoevsky's key works and shows him to be equally adept at the short story as with the novel. Exploring many of the same themes as in his longer works, these small masterpieces move from the tender and romantic White Nights, an archetypal nineteenth-century morality tale of pathos and loss, to the famous Notes from the Underground, a story of guilt, ineffectiveness, and uncompromising cynicism, and the first major work of existential literature. Among Dostoevsky's prototypical characters is Yemelyan in The Honest Thief, whose tragedy turns on an inability to resist crime. 
Presented in chronological order, in David Magarshack's celebrated translation, this is the definitive edition of Dostoevsky's best stories. +1985 | Brothers Karamazov | Four brothers reunite in their hometown in Russia. The murder of their father forces the brothers to question their beliefs about each other, religion, and morality. +; chunkTextWithMatch required_capability: chunk_function @@ -34,12 +50,47 @@ FROM books | EVAL chunks = CHUNK(description, 1, 20) | KEEP book_no, title, chunks; ignoreOrder:true -warningRegex:java.lang.IllegalArgumentException: single-value function encountered multi-value book_no:keyword | title:text | chunks:keyword 2714 | Return of the King Being the Third Part of The Lord of the Rings | Concluding the story begun in The Hobbit, this is the final part of Tolkien s epic masterpiece, The Lord of 7350 | Return of the Shadow | In this sixth volume of The History of Middle-earth the story reaches The Lord of the Rings. ; +chunkTextWithMatchMultipleChunks +required_capability: chunk_function + +FROM books +| WHERE MATCH(title, "Return") +| EVAL chunks = CHUNK(description, 3, 20) +| KEEP book_no, title, chunks; +ignoreOrder:true + +book_no:keyword | title:text | chunks:keyword +2714 | Return of the King Being the Third Part of The Lord of the Rings | [Concluding the story begun in The Hobbit, this is the final part of Tolkien s epic masterpiece, The Lord of, part of Tolkien s epic masterpiece, The Lord of the Rings, featuring an exclusive cover image from the film, the, , featuring an exclusive cover image from the film, the definitive text, and a detailed map of Middle-earth.] 
+7350 | Return of the Shadow | [In this sixth volume of The History of Middle-earth the story reaches The Lord of the Rings., In The Return of the Shadow (an abandoned title for the first volume) Christopher Tolkien describes, with full citation of, first volume) Christopher Tolkien describes, with full citation of the earliest notes, outline plans, and narrative drafts, the intricate evolution] +; + +chunkTextWithMatchMultipleChunksMvExpand +required_capability: chunk_function + +FROM books +| WHERE MATCH(title, "Return") +| EVAL chunks = CHUNK(description, 3, 20) +| MV_EXPAND chunks +| KEEP book_no, title, chunks; +ignoreOrder:true + +book_no:keyword | title:text | chunks:keyword +2714 | Return of the King Being the Third Part of The Lord of the Rings | , featuring an exclusive cover image from the film, the definitive text, and a detailed map of Middle-earth. +2714 | Return of the King Being the Third Part of The Lord of the Rings | Concluding the story begun in The Hobbit, this is the final part of Tolkien s epic masterpiece, The Lord of +2714 | Return of the King Being the Third Part of The Lord of the Rings | part of Tolkien s epic masterpiece, The Lord of the Rings, featuring an exclusive cover image from the film, the +7350 | Return of the Shadow | In The Return of the Shadow (an abandoned title for the first volume) Christopher Tolkien describes, with full citation of +7350 | Return of the Shadow | In this sixth volume of The History of Middle-earth the story reaches The Lord of the Rings. 
+7350 | Return of the Shadow | first volume) Christopher Tolkien describes, with full citation of the earliest notes, outline plans, and narrative drafts, the intricate evolution +; + + + + diff --git a/x-pack/plugin/esql/src/main/generated/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkBytesRefEvaluator.java b/x-pack/plugin/esql/src/main/generated/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkBytesRefEvaluator.java index 6cf9942393e90..ed3e581175987 100644 --- a/x-pack/plugin/esql/src/main/generated/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkBytesRefEvaluator.java +++ b/x-pack/plugin/esql/src/main/generated/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkBytesRefEvaluator.java @@ -123,12 +123,7 @@ public BytesRefBlock eval(int positionCount, BytesRefBlock strBlock, IntBlock nu BytesRef str = strBlock.getBytesRef(strBlock.getFirstValueIndex(p), strScratch); int numChunks = numChunksBlock.getInt(numChunksBlock.getFirstValueIndex(p)); int chunkSize = chunkSizeBlock.getInt(chunkSizeBlock.getFirstValueIndex(p)); - try { - Chunk.process(result, str, numChunks, chunkSize); - } catch (IllegalArgumentException e) { - warnings().registerException(e); - result.appendNull(); - } + Chunk.process(result, str, numChunks, chunkSize); } return result.build(); } @@ -142,12 +137,7 @@ public BytesRefBlock eval(int positionCount, BytesRefVector strVector, IntVector BytesRef str = strVector.getBytesRef(p, strScratch); int numChunks = numChunksVector.getInt(p); int chunkSize = chunkSizeVector.getInt(p); - try { - Chunk.process(result, str, numChunks, chunkSize); - } catch (IllegalArgumentException e) { - warnings().registerException(e); - result.appendNull(); - } + Chunk.process(result, str, numChunks, chunkSize); } return result.build(); } diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java 
b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java index 1bdce7b07e291..76adfeb67f4f5 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java @@ -42,8 +42,8 @@ public class Chunk extends EsqlScalarFunction implements TwoOptionalArguments { public static final NamedWriteableRegistry.Entry ENTRY = new NamedWriteableRegistry.Entry(Expression.class, "Chunk", Chunk::new); - private static final int DEFAULT_NUM_CHUNKS = -1; - private static final int DEFAULT_CHUNK_SIZE = 300; + public static final int DEFAULT_NUM_CHUNKS = 1; + public static final int DEFAULT_CHUNK_SIZE = 300; private final Expression field, numChunks, chunkSize; @@ -51,8 +51,13 @@ public class Chunk extends EsqlScalarFunction implements TwoOptionalArguments { returnType = "keyword", preview = true, description = """ - Chunks the contents of a field.""", - examples = { @Example(file = "chunk-function", tag = "chunk-with-field", applies_to = "stack: preview 9.2.0") } + Use `CHUNK` to split a text field into smaller chunks.""", + detailedDescription = """ + Chunk can be used on fields from the text family like <> and <>. + Chunk will split a text field into smaller chunks, using a sentence-based chunking strategy. + The number of chunks returned, and the length of the sentences used to create the chunks can be specified. 
+ """, + examples = { @Example(file = "chunk", tag = "chunk-with-field", applies_to = "stack: preview 9.3.0") } ) public Chunk( Source source, @@ -100,7 +105,7 @@ public String getWriteableName() { @Override public DataType dataType() { - return field.dataType().noText(); + return DataType.KEYWORD; } @Override @@ -144,18 +149,19 @@ Expression chunkSize() { return chunkSize; } - @Evaluator(extraName = "BytesRef", warnExceptions = IllegalArgumentException.class) + @Evaluator(extraName = "BytesRef") static void process(BytesRefBlock.Builder builder, BytesRef str, int numChunks, int chunkSize) { + if (numChunks < 0) { + throw new IllegalArgumentException("Num chunks parameter cannot be negative, found [" + numChunks + "]"); + } + if (chunkSize < 0) { + throw new IllegalArgumentException("Chunk size parameter cannot be negative, found [" + chunkSize + "]"); + } + String content = str.utf8ToString(); ChunkingSettings settings = new SentenceBoundaryChunkingSettings(chunkSize, 0); - Chunker chunker = ChunkerBuilder.fromChunkingStrategy(settings.getChunkingStrategy()); - - List chunks = chunker.chunk(content, settings) - .stream() - .map(offset -> content.substring(offset.start(), offset.end())) - .limit(numChunks > 0 ? numChunks : Long.MAX_VALUE) - .toList(); + List chunks = chunkText(content, settings, numChunks); boolean multivalued = chunks.size() > 1; if (multivalued) { @@ -170,6 +176,16 @@ static void process(BytesRefBlock.Builder builder, BytesRef str, int numChunks, } } + public static List chunkText(String content, ChunkingSettings chunkingSettings, int numChunks) { + Chunker chunker = ChunkerBuilder.fromChunkingStrategy(chunkingSettings.getChunkingStrategy()); + + return chunker.chunk(content, chunkingSettings) + .stream() + .map(offset -> content.substring(offset.start(), offset.end())) + .limit(numChunks > 0 ? 
numChunks : Long.MAX_VALUE) + .toList(); + } + @Override public boolean equals(Object o) { if (o == null || getClass() != o.getClass()) return false; diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/AbstractScalarFunctionTestCase.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/AbstractScalarFunctionTestCase.java index 0c416a5ae1427..211ef5c9b62d9 100644 --- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/AbstractScalarFunctionTestCase.java +++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/AbstractScalarFunctionTestCase.java @@ -105,6 +105,9 @@ public final void testEvaluate() { throw new AssertionError("expected resolved " + resolution.message()); } expression = new FoldNull().rule(expression, unboundLogicalOptimizerContext()); + if (expression.dataType() != testCase.expectedType()) { + logger.info("Expression after null folding: " + expression); + } assertThat(expression.dataType(), equalTo(testCase.expectedType())); logger.info("Result type: " + expression.dataType()); diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkTests.java new file mode 100644 index 0000000000000..34816478b9871 --- /dev/null +++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkTests.java @@ -0,0 +1,182 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. 
+ */ + +package org.elasticsearch.xpack.esql.expression.function.scalar.string; + +import com.carrotsearch.randomizedtesting.annotations.Name; +import com.carrotsearch.randomizedtesting.annotations.ParametersFactory; + +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.compute.data.Block; +import org.elasticsearch.compute.operator.EvalOperator; +import org.elasticsearch.inference.ChunkingSettings; +import org.elasticsearch.xpack.core.inference.chunking.SentenceBoundaryChunkingSettings; +import org.elasticsearch.xpack.esql.core.expression.Expression; +import org.elasticsearch.xpack.esql.core.expression.Literal; +import org.elasticsearch.xpack.esql.core.tree.Source; +import org.elasticsearch.xpack.esql.core.type.DataType; +import org.elasticsearch.xpack.esql.expression.function.AbstractScalarFunctionTestCase; +import org.elasticsearch.xpack.esql.expression.function.TestCaseSupplier; +import java.util.List; +import java.util.function.Supplier; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +import static org.elasticsearch.compute.data.BlockUtils.toJavaObject; +import static org.hamcrest.Matchers.containsString; +import static org.hamcrest.Matchers.equalTo; + +public class ChunkTests extends AbstractScalarFunctionTestCase { + + private static String PARAGRAPH_INPUT = """ + The Adirondacks, a vast mountain region in northern New York, offer a breathtaking mix of rugged wilderness, serene lakes, + and charming small towns. Spanning over six million acres, the Adirondack Park is larger than Yellowstone, Yosemite, and the + Grand Canyon combined, yet it’s dotted with communities where people live, work, and play amidst nature. Visitors come year-round + to hike High Peaks trails, paddle across mirror-like waters, or ski through snow-covered forests. 
The area’s pristine beauty, + rich history, and commitment to conservation create a unique balance between wild preservation and human presence, making + the Adirondacks a timeless escape into natural tranquility. + """; + + public ChunkTests(@Name("TestCase") Supplier testCaseSupplier) { + this.testCase = testCaseSupplier.get(); + } + + private static String randomWordsBetween(int min, int max) { + return IntStream.range(0, randomIntBetween(min, max)) + .mapToObj(i -> randomAlphaOfLengthBetween(1, 10)) + .collect(Collectors.joining(" ")); + } + + @ParametersFactory + public static Iterable parameters() { + return parameterSuppliersFromTypedDataWithDefaultChecks( + true, + List.of(new TestCaseSupplier("Chunk basic test", List.of(DataType.KEYWORD, DataType.INTEGER, DataType.INTEGER), () -> { + String text = randomWordsBetween(25, 50); + int numChunks = between(1, 5); + int chunkSize = between(10, 20); + ChunkingSettings chunkingSettings = new SentenceBoundaryChunkingSettings(chunkSize, 0); + + List chunks = Chunk.chunkText(text, chunkingSettings, numChunks); + Object expectedResult = chunks.size() == 1 + ? 
new BytesRef(chunks.get(0).trim()) + : chunks.stream().map(s -> new BytesRef(s.trim())).toList(); + + return new TestCaseSupplier.TestCase( + List.of( + new TestCaseSupplier.TypedData(new BytesRef(text), DataType.KEYWORD, "str"), + new TestCaseSupplier.TypedData(numChunks, DataType.INTEGER, "num_chunks"), + new TestCaseSupplier.TypedData(chunkSize, DataType.INTEGER, "chunk_size") + ), + "ChunkBytesRefEvaluator[str=Attribute[channel=0], numChunks=Attribute[channel=1], chunkSize=Attribute[channel=2]]", + DataType.KEYWORD, + equalTo(expectedResult) + ); + }), + new TestCaseSupplier("Chunk basic test with text input", List.of(DataType.TEXT, DataType.INTEGER, DataType.INTEGER), () -> { + String text = randomWordsBetween(25, 50); + int numChunks = between(1, 5); + int chunkSize = between(10, 20); + ChunkingSettings chunkingSettings = new SentenceBoundaryChunkingSettings(chunkSize, 0); + + List chunks = Chunk.chunkText(text, chunkingSettings, numChunks); + Object expectedResult = chunks.size() == 1 + ? new BytesRef(chunks.get(0).trim()) + : chunks.stream().map(s -> new BytesRef(s.trim())).toList(); + + return new TestCaseSupplier.TestCase( + List.of( + new TestCaseSupplier.TypedData(new BytesRef(text), DataType.TEXT, "str"), + new TestCaseSupplier.TypedData(numChunks, DataType.INTEGER, "num_chunks"), + new TestCaseSupplier.TypedData(chunkSize, DataType.INTEGER, "chunk_size") + ), + "ChunkBytesRefEvaluator[str=Attribute[channel=0], numChunks=Attribute[channel=1], chunkSize=Attribute[channel=2]]", + DataType.KEYWORD, + equalTo(expectedResult) + ); + }) + ) + ); + } + + @Override + protected Expression build(Source source, List args) { + return new Chunk(source, args.get(0), args.size() < 2 ? null : args.get(1), args.size() < 3 ? 
null : args.get(2)); + } + + public void testNegativeNumChunks() { + IllegalArgumentException ex = expectThrows(IllegalArgumentException.class, () -> process("a tiger", -1, 10)); + assertThat(ex.getMessage(), containsString("Num chunks parameter cannot be negative, found [-1]")); + } + + public void testNegativeChunkSize() { + IllegalArgumentException ex = expectThrows(IllegalArgumentException.class, () -> process("a tiger", 1, -1)); + assertThat(ex.getMessage(), containsString("Chunk size parameter cannot be negative, found [-1]")); + } + + public void testDefaults() { + ChunkingSettings settings = new SentenceBoundaryChunkingSettings(Chunk.DEFAULT_CHUNK_SIZE, 0); + List expected = Chunk.chunkText(PARAGRAPH_INPUT, settings, Chunk.DEFAULT_NUM_CHUNKS) + .stream() + .map(String::trim) + .toList(); + + List result = process(PARAGRAPH_INPUT, null, null); + assertThat(result, equalTo(expected)); + } + + public void testDefaultNumChunks() { + int chunkSize = randomIntBetween(20, 30); + ChunkingSettings settings = new SentenceBoundaryChunkingSettings(chunkSize, 0); + List expected = Chunk.chunkText(PARAGRAPH_INPUT, settings, Chunk.DEFAULT_NUM_CHUNKS) + .stream() + .map(String::trim) + .toList(); + + List result = process(PARAGRAPH_INPUT, null, chunkSize); + assertThat(result, equalTo(expected)); + } + + public void testDefaultChunkSize() { + int numChunks = randomIntBetween(1, 3); + ChunkingSettings settings = new SentenceBoundaryChunkingSettings(Chunk.DEFAULT_CHUNK_SIZE, 0); + List expected = Chunk.chunkText(PARAGRAPH_INPUT, settings, numChunks) + .stream() + .map(String::trim) + .toList(); + + List result = process(PARAGRAPH_INPUT, numChunks, null); + assertThat(result, equalTo(expected)); + } + + private List process(String str, Integer numChunks, Integer chunkSize) { + try ( + EvalOperator.ExpressionEvaluator eval = evaluator( + new Chunk( + Source.EMPTY, + field("str", DataType.KEYWORD), + numChunks == null ? 
null : new Literal(Source.EMPTY, numChunks, DataType.INTEGER), + chunkSize == null ? null : new Literal(Source.EMPTY, chunkSize, DataType.INTEGER) + ) + ).get(driverContext()); + Block block = eval.eval(row(List.of(new BytesRef(str)))) + ) { + if (block.isNull(0)) { + return null; + } + Object result = toJavaObject(block, 0); + if (result instanceof BytesRef bytesRef) { + return List.of(bytesRef.utf8ToString()); + } else { + @SuppressWarnings("unchecked") + List list = (List) result; + return list.stream().map(BytesRef::utf8ToString).toList(); + } + } + } + +} From 46279f0c64633e84a2562ea86c5426e6f83e1d05 Mon Sep 17 00:00:00 2001 From: Kathleen DeRusso Date: Wed, 22 Oct 2025 13:19:13 -0400 Subject: [PATCH 14/29] Cleanup --- server/src/main/java/module-info.java | 2 +- .../expression/function/AbstractScalarFunctionTestCase.java | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/server/src/main/java/module-info.java b/server/src/main/java/module-info.java index 039e717c823ab..fd50628539ebd 100644 --- a/server/src/main/java/module-info.java +++ b/server/src/main/java/module-info.java @@ -488,7 +488,7 @@ exports org.elasticsearch.inference.configuration; exports org.elasticsearch.inference.validation; exports org.elasticsearch.monitor.metrics; - exports org.elasticsearch.plugins.internal.rewriter; + exports org.elasticsearch.plugins.internal.rewriter to org.elasticsearch.inference; exports org.elasticsearch.lucene.util.automaton; exports org.elasticsearch.index.codec.perfield; exports org.elasticsearch.index.codec.vectors to org.elasticsearch.test.knn, org.elasticsearch.gpu; diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/AbstractScalarFunctionTestCase.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/AbstractScalarFunctionTestCase.java index 211ef5c9b62d9..0c416a5ae1427 100644 --- 
a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/AbstractScalarFunctionTestCase.java +++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/AbstractScalarFunctionTestCase.java @@ -105,9 +105,6 @@ public final void testEvaluate() { throw new AssertionError("expected resolved " + resolution.message()); } expression = new FoldNull().rule(expression, unboundLogicalOptimizerContext()); - if (expression.dataType() != testCase.expectedType()) { - logger.info("Expression after null folding: " + expression); - } assertThat(expression.dataType(), equalTo(testCase.expectedType())); logger.info("Result type: " + expression.dataType()); From 90deac7c0870bfb46ceb1bb541452839f9655dce Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Wed, 22 Oct 2025 17:25:41 +0000 Subject: [PATCH 15/29] [CI] Auto commit changes from spotless --- .../function/scalar/string/Chunk.java | 12 +-- .../function/scalar/string/ChunkTests.java | 102 ++++++++---------- 2 files changed, 49 insertions(+), 65 deletions(-) diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java index 76adfeb67f4f5..c8abcfaa3b4a4 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java @@ -47,18 +47,12 @@ public class Chunk extends EsqlScalarFunction implements TwoOptionalArguments { private final Expression field, numChunks, chunkSize; - @FunctionInfo( - returnType = "keyword", - preview = true, - description = """ - Use `CHUNK` to split a text field into smaller chunks.""", - detailedDescription = """ + @FunctionInfo(returnType = "keyword", preview = true, description = """ + Use `CHUNK` to split a text field 
into smaller chunks.""", detailedDescription = """ Chunk can be used on fields from the text family like <> and <>. Chunk will split a text field into smaller chunks, using a sentence-based chunking strategy. The number of chunks returned, and the length of the sentences used to create the chunks can be specified. - """, - examples = { @Example(file = "chunk", tag = "chunk-with-field", applies_to = "stack: preview 9.3.0") } - ) + """, examples = { @Example(file = "chunk", tag = "chunk-with-field", applies_to = "stack: preview 9.3.0") }) public Chunk( Source source, @Param(name = "field", type = { "keyword", "text" }, description = "The input to chunk.") Expression field, diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkTests.java index 34816478b9871..a7feea698f696 100644 --- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkTests.java +++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkTests.java @@ -21,6 +21,7 @@ import org.elasticsearch.xpack.esql.core.type.DataType; import org.elasticsearch.xpack.esql.expression.function.AbstractScalarFunctionTestCase; import org.elasticsearch.xpack.esql.expression.function.TestCaseSupplier; + import java.util.List; import java.util.function.Supplier; import java.util.stream.Collectors; @@ -56,50 +57,48 @@ public static Iterable parameters() { return parameterSuppliersFromTypedDataWithDefaultChecks( true, List.of(new TestCaseSupplier("Chunk basic test", List.of(DataType.KEYWORD, DataType.INTEGER, DataType.INTEGER), () -> { - String text = randomWordsBetween(25, 50); - int numChunks = between(1, 5); - int chunkSize = between(10, 20); - ChunkingSettings chunkingSettings = new SentenceBoundaryChunkingSettings(chunkSize, 0); - - List chunks = 
Chunk.chunkText(text, chunkingSettings, numChunks); - Object expectedResult = chunks.size() == 1 - ? new BytesRef(chunks.get(0).trim()) - : chunks.stream().map(s -> new BytesRef(s.trim())).toList(); - - return new TestCaseSupplier.TestCase( - List.of( - new TestCaseSupplier.TypedData(new BytesRef(text), DataType.KEYWORD, "str"), - new TestCaseSupplier.TypedData(numChunks, DataType.INTEGER, "num_chunks"), - new TestCaseSupplier.TypedData(chunkSize, DataType.INTEGER, "chunk_size") - ), - "ChunkBytesRefEvaluator[str=Attribute[channel=0], numChunks=Attribute[channel=1], chunkSize=Attribute[channel=2]]", - DataType.KEYWORD, - equalTo(expectedResult) - ); - }), - new TestCaseSupplier("Chunk basic test with text input", List.of(DataType.TEXT, DataType.INTEGER, DataType.INTEGER), () -> { - String text = randomWordsBetween(25, 50); - int numChunks = between(1, 5); - int chunkSize = between(10, 20); - ChunkingSettings chunkingSettings = new SentenceBoundaryChunkingSettings(chunkSize, 0); - - List chunks = Chunk.chunkText(text, chunkingSettings, numChunks); - Object expectedResult = chunks.size() == 1 - ? new BytesRef(chunks.get(0).trim()) - : chunks.stream().map(s -> new BytesRef(s.trim())).toList(); - - return new TestCaseSupplier.TestCase( - List.of( - new TestCaseSupplier.TypedData(new BytesRef(text), DataType.TEXT, "str"), - new TestCaseSupplier.TypedData(numChunks, DataType.INTEGER, "num_chunks"), - new TestCaseSupplier.TypedData(chunkSize, DataType.INTEGER, "chunk_size") - ), - "ChunkBytesRefEvaluator[str=Attribute[channel=0], numChunks=Attribute[channel=1], chunkSize=Attribute[channel=2]]", - DataType.KEYWORD, - equalTo(expectedResult) - ); - }) - ) + String text = randomWordsBetween(25, 50); + int numChunks = between(1, 5); + int chunkSize = between(10, 20); + ChunkingSettings chunkingSettings = new SentenceBoundaryChunkingSettings(chunkSize, 0); + + List chunks = Chunk.chunkText(text, chunkingSettings, numChunks); + Object expectedResult = chunks.size() == 1 + ? 
new BytesRef(chunks.get(0).trim()) + : chunks.stream().map(s -> new BytesRef(s.trim())).toList(); + + return new TestCaseSupplier.TestCase( + List.of( + new TestCaseSupplier.TypedData(new BytesRef(text), DataType.KEYWORD, "str"), + new TestCaseSupplier.TypedData(numChunks, DataType.INTEGER, "num_chunks"), + new TestCaseSupplier.TypedData(chunkSize, DataType.INTEGER, "chunk_size") + ), + "ChunkBytesRefEvaluator[str=Attribute[channel=0], numChunks=Attribute[channel=1], chunkSize=Attribute[channel=2]]", + DataType.KEYWORD, + equalTo(expectedResult) + ); + }), new TestCaseSupplier("Chunk basic test with text input", List.of(DataType.TEXT, DataType.INTEGER, DataType.INTEGER), () -> { + String text = randomWordsBetween(25, 50); + int numChunks = between(1, 5); + int chunkSize = between(10, 20); + ChunkingSettings chunkingSettings = new SentenceBoundaryChunkingSettings(chunkSize, 0); + + List chunks = Chunk.chunkText(text, chunkingSettings, numChunks); + Object expectedResult = chunks.size() == 1 + ? 
new BytesRef(chunks.get(0).trim()) + : chunks.stream().map(s -> new BytesRef(s.trim())).toList(); + + return new TestCaseSupplier.TestCase( + List.of( + new TestCaseSupplier.TypedData(new BytesRef(text), DataType.TEXT, "str"), + new TestCaseSupplier.TypedData(numChunks, DataType.INTEGER, "num_chunks"), + new TestCaseSupplier.TypedData(chunkSize, DataType.INTEGER, "chunk_size") + ), + "ChunkBytesRefEvaluator[str=Attribute[channel=0], numChunks=Attribute[channel=1], chunkSize=Attribute[channel=2]]", + DataType.KEYWORD, + equalTo(expectedResult) + ); + })) ); } @@ -120,10 +119,7 @@ public void testNegativeChunkSize() { public void testDefaults() { ChunkingSettings settings = new SentenceBoundaryChunkingSettings(Chunk.DEFAULT_CHUNK_SIZE, 0); - List expected = Chunk.chunkText(PARAGRAPH_INPUT, settings, Chunk.DEFAULT_NUM_CHUNKS) - .stream() - .map(String::trim) - .toList(); + List expected = Chunk.chunkText(PARAGRAPH_INPUT, settings, Chunk.DEFAULT_NUM_CHUNKS).stream().map(String::trim).toList(); List result = process(PARAGRAPH_INPUT, null, null); assertThat(result, equalTo(expected)); @@ -132,10 +128,7 @@ public void testDefaults() { public void testDefaultNumChunks() { int chunkSize = randomIntBetween(20, 30); ChunkingSettings settings = new SentenceBoundaryChunkingSettings(chunkSize, 0); - List expected = Chunk.chunkText(PARAGRAPH_INPUT, settings, Chunk.DEFAULT_NUM_CHUNKS) - .stream() - .map(String::trim) - .toList(); + List expected = Chunk.chunkText(PARAGRAPH_INPUT, settings, Chunk.DEFAULT_NUM_CHUNKS).stream().map(String::trim).toList(); List result = process(PARAGRAPH_INPUT, null, chunkSize); assertThat(result, equalTo(expected)); @@ -144,10 +137,7 @@ public void testDefaultNumChunks() { public void testDefaultChunkSize() { int numChunks = randomIntBetween(1, 3); ChunkingSettings settings = new SentenceBoundaryChunkingSettings(Chunk.DEFAULT_CHUNK_SIZE, 0); - List expected = Chunk.chunkText(PARAGRAPH_INPUT, settings, numChunks) - .stream() - .map(String::trim) - 
.toList(); + List expected = Chunk.chunkText(PARAGRAPH_INPUT, settings, numChunks).stream().map(String::trim).toList(); List result = process(PARAGRAPH_INPUT, numChunks, null); assertThat(result, equalTo(expected)); From 7c78c32f091d6cb94f9311a418eaf7fc113a1d9d Mon Sep 17 00:00:00 2001 From: Kathleen DeRusso Date: Wed, 22 Oct 2025 13:28:48 -0400 Subject: [PATCH 16/29] Cleanup --- .../xpack/esql/expression/function/scalar/string/Chunk.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java index c8abcfaa3b4a4..fc49cf1a9a987 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java @@ -42,7 +42,7 @@ public class Chunk extends EsqlScalarFunction implements TwoOptionalArguments { public static final NamedWriteableRegistry.Entry ENTRY = new NamedWriteableRegistry.Entry(Expression.class, "Chunk", Chunk::new); - public static final int DEFAULT_NUM_CHUNKS = 1; + public static final int DEFAULT_NUM_CHUNKS = Integer.MAX_VALUE; public static final int DEFAULT_CHUNK_SIZE = 300; private final Expression field, numChunks, chunkSize; From 9c9b37369ad81f2f2e3fe6ae5f28813f75182e52 Mon Sep 17 00:00:00 2001 From: Kathleen DeRusso Date: Wed, 22 Oct 2025 13:30:03 -0400 Subject: [PATCH 17/29] Update docs/changelog/134320.yaml --- docs/changelog/134320.yaml | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 docs/changelog/134320.yaml diff --git a/docs/changelog/134320.yaml b/docs/changelog/134320.yaml new file mode 100644 index 0000000000000..d20abd2bcb737 --- /dev/null +++ b/docs/changelog/134320.yaml @@ -0,0 +1,5 @@ +pr: 134320 +summary: Add CHUNK function +area: ES|QL +type: 
enhancement +issues: [] From db585e6e4fe8b941267c1d599e57f9bc6ddbfed6 Mon Sep 17 00:00:00 2001 From: Kathleen DeRusso Date: Thu, 23 Oct 2025 13:41:10 -0400 Subject: [PATCH 18/29] PR feedback --- .../function/scalar/string/Chunk.java | 25 ++++++++++++++++++- .../xpack/esql/analysis/VerifierTests.java | 22 ++++++++++++++++ 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java index fc49cf1a9a987..f65aca695de02 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java @@ -20,6 +20,7 @@ import org.elasticsearch.xpack.core.inference.chunking.SentenceBoundaryChunkingSettings; import org.elasticsearch.xpack.esql.core.expression.Expression; import org.elasticsearch.xpack.esql.core.expression.Literal; +import org.elasticsearch.xpack.esql.core.expression.TypeResolutions; import org.elasticsearch.xpack.esql.core.tree.NodeInfo; import org.elasticsearch.xpack.esql.core.tree.Source; import org.elasticsearch.xpack.esql.core.type.DataType; @@ -36,7 +37,11 @@ import java.util.Objects; import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.ParamOrdinal.FIRST; +import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.ParamOrdinal.SECOND; +import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.ParamOrdinal.THIRD; +import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.isNotNull; import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.isString; +import static org.elasticsearch.xpack.esql.core.type.DataType.INTEGER; public class Chunk extends EsqlScalarFunction implements TwoOptionalArguments { @@ 
-108,7 +113,25 @@ protected TypeResolution resolveType() { return new TypeResolution("Unresolved children"); } - return isString(field(), sourceText(), FIRST); + TypeResolution resolution = isString(field(), sourceText(), FIRST); + if (resolution.unresolved()) { + return resolution; + } + resolution = isNotNull(field(), sourceText(), FIRST); + if (resolution.unresolved()) { + return resolution; + } + + if (numChunks() != null) { + resolution = TypeResolutions.isType(numChunks(), dt -> dt == INTEGER, sourceText(), SECOND, "integer"); + if (resolution.unresolved()) { + return resolution; + } + } + + return chunkSize() == null + ? TypeResolution.TYPE_RESOLVED + : TypeResolutions.isType(chunkSize(), dt -> dt == INTEGER, sourceText(), THIRD, "integer"); } @Override diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java index 7a3b294be2ab3..5ea604d1761d2 100644 --- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java +++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java @@ -2813,6 +2813,28 @@ STATS max(max_over_time(network.connections)) BY host, time_bucket = bucket(@tim can only be used after STATS when used with TS command""")); } + public void testChunkFunctionWithNullInputs() { + query("from test | eval chunks = chunk(body, null, 20)", fullTextAnalyzer); + query("from test | eval chunks = chunk(body, 5, null)", fullTextAnalyzer); + } + + public void testChunkFunctionInvalidInputs() { + if (EsqlCapabilities.Cap.CHUNK_FUNCTION.isEnabled()) { + assertThat( + error("from test | EVAL chunks = CHUNK(null)", fullTextAnalyzer), + equalTo("1:27: first argument of [CHUNK(null)] cannot be null, received [null]") + ); + assertThat( + error("from test | EVAL chunks = CHUNK(body, \"foo\", 20)", fullTextAnalyzer), + equalTo("1:39: Cannot convert string [foo] to 
[INTEGER], error [Cannot parse number [foo]]") + ); + assertThat( + error("from test | EVAL chunks = CHUNK(body, 5, \"foo\")", fullTextAnalyzer), + equalTo("1:42: Cannot convert string [foo] to [INTEGER], error [Cannot parse number [foo]]") + ); + } + } + private void checkVectorFunctionsNullArgs(String functionInvocation) throws Exception { query("from test | eval similarity = " + functionInvocation, fullTextAnalyzer); } From c7edb7301a181acbb0ebf44110d0432dfb75ec67 Mon Sep 17 00:00:00 2001 From: Kathleen DeRusso Date: Thu, 23 Oct 2025 14:39:45 -0400 Subject: [PATCH 19/29] Remove null field constraint --- .../xpack/esql/expression/function/scalar/string/Chunk.java | 4 ---- .../org/elasticsearch/xpack/esql/analysis/VerifierTests.java | 5 +---- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java index f65aca695de02..50dab5ae024a2 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java @@ -117,10 +117,6 @@ protected TypeResolution resolveType() { if (resolution.unresolved()) { return resolution; } - resolution = isNotNull(field(), sourceText(), FIRST); - if (resolution.unresolved()) { - return resolution; - } if (numChunks() != null) { resolution = TypeResolutions.isType(numChunks(), dt -> dt == INTEGER, sourceText(), SECOND, "integer"); diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java index 5ea604d1761d2..3a0d20c223cfa 100644 --- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java +++ 
b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java @@ -2816,14 +2816,11 @@ STATS max(max_over_time(network.connections)) BY host, time_bucket = bucket(@tim public void testChunkFunctionWithNullInputs() { query("from test | eval chunks = chunk(body, null, 20)", fullTextAnalyzer); query("from test | eval chunks = chunk(body, 5, null)", fullTextAnalyzer); + query("from test | eval chunks = chunk(null, 5, 20)", fullTextAnalyzer); } public void testChunkFunctionInvalidInputs() { if (EsqlCapabilities.Cap.CHUNK_FUNCTION.isEnabled()) { - assertThat( - error("from test | EVAL chunks = CHUNK(null)", fullTextAnalyzer), - equalTo("1:27: first argument of [CHUNK(null)] cannot be null, received [null]") - ); assertThat( error("from test | EVAL chunks = CHUNK(body, \"foo\", 20)", fullTextAnalyzer), equalTo("1:39: Cannot convert string [foo] to [INTEGER], error [Cannot parse number [foo]]") From c435bf4e08a566fe3bbeb5fed5a6c72a67aa30cc Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Thu, 23 Oct 2025 18:46:48 +0000 Subject: [PATCH 20/29] [CI] Auto commit changes from spotless --- .../xpack/esql/expression/function/scalar/string/Chunk.java | 1 - 1 file changed, 1 deletion(-) diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java index 50dab5ae024a2..c8db3ecbde644 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java @@ -39,7 +39,6 @@ import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.ParamOrdinal.FIRST; import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.ParamOrdinal.SECOND; import static 
org.elasticsearch.xpack.esql.core.expression.TypeResolutions.ParamOrdinal.THIRD; -import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.isNotNull; import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.isString; import static org.elasticsearch.xpack.esql.core.type.DataType.INTEGER; From 84b84000e3aec4a551648f1b3171b7f5e48b6e02 Mon Sep 17 00:00:00 2001 From: Kathleen DeRusso Date: Fri, 24 Oct 2025 13:35:26 -0400 Subject: [PATCH 21/29] PR feedback: Refactor to use an options map --- .../functions/functionNamedParams/chunk.md | 10 ++ .../esql/_snippets/functions/layout/chunk.md | 3 + .../_snippets/functions/parameters/chunk.md | 7 +- .../esql/_snippets/functions/types/chunk.md | 8 +- .../esql/images/functions/chunk.svg | 2 +- .../kibana/definition/functions/chunk.json | 24 --- .../src/main/resources/chunk.csv-spec | 8 +- .../function/EsqlFunctionRegistry.java | 2 +- .../function/scalar/string/Chunk.java | 142 +++++++++--------- .../xpack/esql/analysis/VerifierTests.java | 38 +++-- .../function/scalar/string/ChunkTests.java | 125 ++++++++------- 11 files changed, 192 insertions(+), 177 deletions(-) create mode 100644 docs/reference/query-languages/esql/_snippets/functions/functionNamedParams/chunk.md diff --git a/docs/reference/query-languages/esql/_snippets/functions/functionNamedParams/chunk.md b/docs/reference/query-languages/esql/_snippets/functions/functionNamedParams/chunk.md new file mode 100644 index 0000000000000..265551c8bee8a --- /dev/null +++ b/docs/reference/query-languages/esql/_snippets/functions/functionNamedParams/chunk.md @@ -0,0 +1,10 @@ +% This is generated by ESQL's AbstractFunctionTestCase. Do not edit it. See ../README.md for how to regenerate it. + +**Supported function named parameters** + +`num_chunks` +: (integer) The number of chunks to return. Defaults to return all chunks. + +`chunk_size` +: (integer) The size of sentence-based chunks to use. 
Defaults to 300 + diff --git a/docs/reference/query-languages/esql/_snippets/functions/layout/chunk.md b/docs/reference/query-languages/esql/_snippets/functions/layout/chunk.md index 9ab96985aa35a..a3e67be49499a 100644 --- a/docs/reference/query-languages/esql/_snippets/functions/layout/chunk.md +++ b/docs/reference/query-languages/esql/_snippets/functions/layout/chunk.md @@ -19,5 +19,8 @@ :::{include} ../types/chunk.md ::: +:::{include} ../functionNamedParams/chunk.md +::: + :::{include} ../examples/chunk.md ::: diff --git a/docs/reference/query-languages/esql/_snippets/functions/parameters/chunk.md b/docs/reference/query-languages/esql/_snippets/functions/parameters/chunk.md index 755ab69ce2224..ce86379c1254c 100644 --- a/docs/reference/query-languages/esql/_snippets/functions/parameters/chunk.md +++ b/docs/reference/query-languages/esql/_snippets/functions/parameters/chunk.md @@ -5,9 +5,6 @@ `field` : The input to chunk. -`num_chunks` -: The number of chunks to return. Defaults to return all chunks. - -`chunk_size` -: The size of sentence-based chunks to use. 
Defaults to 300 +`options` +: TODO diff --git a/docs/reference/query-languages/esql/_snippets/functions/types/chunk.md b/docs/reference/query-languages/esql/_snippets/functions/types/chunk.md index cea4a8217c3b0..8ebe22b61286c 100644 --- a/docs/reference/query-languages/esql/_snippets/functions/types/chunk.md +++ b/docs/reference/query-languages/esql/_snippets/functions/types/chunk.md @@ -2,8 +2,8 @@ **Supported types** -| field | num_chunks | chunk_size | result | -| --- | --- | --- | --- | -| keyword | integer | integer | keyword | -| text | integer | integer | keyword | +| field | options | result | +| --- | --- | --- | +| keyword | | keyword | +| text | | keyword | diff --git a/docs/reference/query-languages/esql/images/functions/chunk.svg b/docs/reference/query-languages/esql/images/functions/chunk.svg index b38490207b556..56003f305a080 100644 --- a/docs/reference/query-languages/esql/images/functions/chunk.svg +++ b/docs/reference/query-languages/esql/images/functions/chunk.svg @@ -1 +1 @@ -CHUNK(field,num_chunks,chunk_size) \ No newline at end of file +CHUNK(field,options) \ No newline at end of file diff --git a/docs/reference/query-languages/esql/kibana/definition/functions/chunk.json b/docs/reference/query-languages/esql/kibana/definition/functions/chunk.json index 3b8b98cda40d9..f2d9af38d0e15 100644 --- a/docs/reference/query-languages/esql/kibana/definition/functions/chunk.json +++ b/docs/reference/query-languages/esql/kibana/definition/functions/chunk.json @@ -11,18 +11,6 @@ "type" : "keyword", "optional" : false, "description" : "The input to chunk." - }, - { - "name" : "num_chunks", - "type" : "integer", - "optional" : true, - "description" : "The number of chunks to return. Defaults to return all chunks." - }, - { - "name" : "chunk_size", - "type" : "integer", - "optional" : true, - "description" : "The size of sentence-based chunks to use. 
Defaults to 300" } ], "variadic" : false, @@ -35,18 +23,6 @@ "type" : "text", "optional" : false, "description" : "The input to chunk." - }, - { - "name" : "num_chunks", - "type" : "integer", - "optional" : true, - "description" : "The number of chunks to return. Defaults to return all chunks." - }, - { - "name" : "chunk_size", - "type" : "integer", - "optional" : true, - "description" : "The size of sentence-based chunks to use. Defaults to 300" } ], "variadic" : false, diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/chunk.csv-spec b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/chunk.csv-spec index 238ea7e99f122..96ff691708faa 100644 --- a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/chunk.csv-spec +++ b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/chunk.csv-spec @@ -7,7 +7,7 @@ required_capability: chunk_function // tag::chunk-with-field[] FROM books -| EVAL chunks = CHUNK(description, 1, 20) +| EVAL chunks = CHUNK(description, {"num_chunks":1, "chunk_size":20}) // end::chunk-with-field[] | KEEP book_no, title, chunks | SORT book_no @@ -47,7 +47,7 @@ required_capability: chunk_function FROM books | WHERE MATCH(title, "Return") -| EVAL chunks = CHUNK(description, 1, 20) +| EVAL chunks = CHUNK(description, {"num_chunks":1, "chunk_size":20}) | KEEP book_no, title, chunks; ignoreOrder:true @@ -61,7 +61,7 @@ required_capability: chunk_function FROM books | WHERE MATCH(title, "Return") -| EVAL chunks = CHUNK(description, 3, 20) +| EVAL chunks = CHUNK(description, {"num_chunks":3, "chunk_size":20}) | KEEP book_no, title, chunks; ignoreOrder:true @@ -75,7 +75,7 @@ required_capability: chunk_function FROM books | WHERE MATCH(title, "Return") -| EVAL chunks = CHUNK(description, 3, 20) +| EVAL chunks = CHUNK(description, {"num_chunks":3, "chunk_size":20}) | MV_EXPAND chunks | KEEP book_no, title, chunks; ignoreOrder:true diff --git 
a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/EsqlFunctionRegistry.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/EsqlFunctionRegistry.java index 799ecb8bd3f3e..c138fa2418b26 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/EsqlFunctionRegistry.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/EsqlFunctionRegistry.java @@ -557,7 +557,7 @@ private static FunctionDefinition[][] snapshotFunctions() { def(L2Norm.class, L2Norm::new, "v_l2_norm"), def(Magnitude.class, Magnitude::new, "v_magnitude"), def(Hamming.class, Hamming::new, "v_hamming"), - def(Chunk.class, tri(Chunk::new), "chunk") } }; + def(Chunk.class, bi(Chunk::new), "chunk") } }; } public EsqlFunctionRegistry snapshotRegistry() { diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java index c8db3ecbde644..03a08c8261c05 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java @@ -18,38 +18,44 @@ import org.elasticsearch.xpack.core.inference.chunking.Chunker; import org.elasticsearch.xpack.core.inference.chunking.ChunkerBuilder; import org.elasticsearch.xpack.core.inference.chunking.SentenceBoundaryChunkingSettings; +import org.elasticsearch.xpack.esql.core.InvalidArgumentException; import org.elasticsearch.xpack.esql.core.expression.Expression; import org.elasticsearch.xpack.esql.core.expression.Literal; -import org.elasticsearch.xpack.esql.core.expression.TypeResolutions; +import org.elasticsearch.xpack.esql.core.expression.MapExpression; import org.elasticsearch.xpack.esql.core.tree.NodeInfo; import 
org.elasticsearch.xpack.esql.core.tree.Source; import org.elasticsearch.xpack.esql.core.type.DataType; import org.elasticsearch.xpack.esql.expression.function.Example; import org.elasticsearch.xpack.esql.expression.function.FunctionInfo; +import org.elasticsearch.xpack.esql.expression.function.MapParam; +import org.elasticsearch.xpack.esql.expression.function.OptionalArgument; +import org.elasticsearch.xpack.esql.expression.function.Options; import org.elasticsearch.xpack.esql.expression.function.Param; -import org.elasticsearch.xpack.esql.expression.function.TwoOptionalArguments; import org.elasticsearch.xpack.esql.expression.function.scalar.EsqlScalarFunction; import org.elasticsearch.xpack.esql.io.stream.PlanStreamInput; import java.io.IOException; -import java.util.ArrayList; import java.util.List; +import java.util.Map; import java.util.Objects; import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.ParamOrdinal.FIRST; import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.ParamOrdinal.SECOND; -import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.ParamOrdinal.THIRD; import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.isString; -import static org.elasticsearch.xpack.esql.core.type.DataType.INTEGER; -public class Chunk extends EsqlScalarFunction implements TwoOptionalArguments { +public class Chunk extends EsqlScalarFunction implements OptionalArgument { public static final NamedWriteableRegistry.Entry ENTRY = new NamedWriteableRegistry.Entry(Expression.class, "Chunk", Chunk::new); public static final int DEFAULT_NUM_CHUNKS = Integer.MAX_VALUE; public static final int DEFAULT_CHUNK_SIZE = 300; - private final Expression field, numChunks, chunkSize; + private final Expression field, options; + + static final String NUM_CHUNKS = "num_chunks"; + static final String CHUNK_SIZE = "chunk_size"; + + public static final Map ALLOWED_OPTIONS = Map.of(NUM_CHUNKS, DataType.INTEGER, 
CHUNK_SIZE, DataType.INTEGER); @FunctionInfo(returnType = "keyword", preview = true, description = """ Use `CHUNK` to split a text field into smaller chunks.""", detailedDescription = """ @@ -60,30 +66,43 @@ public class Chunk extends EsqlScalarFunction implements TwoOptionalArguments { public Chunk( Source source, @Param(name = "field", type = { "keyword", "text" }, description = "The input to chunk.") Expression field, - @Param( - optional = true, - name = "num_chunks", - type = { "integer" }, - description = "The number of chunks to return. Defaults to return all chunks." - ) Expression numChunks, - @Param( - optional = true, - name = "chunk_size", - type = { "integer" }, - description = "The size of sentence-based chunks to use. Defaults to " + DEFAULT_CHUNK_SIZE - ) Expression chunkSize + @MapParam( + name = "options", + params = { + @MapParam.MapParamEntry( + name = "num_chunks", + type = "integer", + description = "The number of chunks to return. Defaults to return all chunks." + ), + @MapParam.MapParamEntry( + name = "chunk_size", + type = "integer", + description = "The size of sentence-based chunks to use. Defaults to " + DEFAULT_CHUNK_SIZE + ), }, + description = "TODO", + optional = true + ) Expression options + ) { + super(source, options == null ? List.of(field) : List.of(field, options)); + this.field = field; + this.options = options; + } + + private Chunk( + Source source, + Expression field, + Expression options, + boolean unused // dummy parameter to differentiate constructors ) { - super(source, fields(field, numChunks, chunkSize)); + super(source, options == null ? 
List.of(field) : List.of(field, options)); this.field = field; - this.numChunks = numChunks; - this.chunkSize = chunkSize; + this.options = options; } public Chunk(StreamInput in) throws IOException { this( Source.readFrom((PlanStreamInput) in), in.readNamedWriteable(Expression.class), - in.readOptionalNamedWriteable(Expression.class), in.readOptionalNamedWriteable(Expression.class) ); } @@ -92,8 +111,7 @@ public Chunk(StreamInput in) throws IOException { public void writeTo(StreamOutput out) throws IOException { source().writeTo(out); out.writeNamedWriteable(field); - out.writeOptionalNamedWriteable(numChunks); - out.writeOptionalNamedWriteable(chunkSize); + out.writeOptionalNamedWriteable(options); } @Override @@ -111,27 +129,28 @@ protected TypeResolution resolveType() { if (childrenResolved() == false) { return new TypeResolution("Unresolved children"); } + return isString(field(), sourceText(), FIRST).and(Options.resolve(options, source(), SECOND, ALLOWED_OPTIONS, this::verifyOptions)); + } - TypeResolution resolution = isString(field(), sourceText(), FIRST); - if (resolution.unresolved()) { - return resolution; + private void verifyOptions(Map optionsMap) { + if (options == null) { + return; } - if (numChunks() != null) { - resolution = TypeResolutions.isType(numChunks(), dt -> dt == INTEGER, sourceText(), SECOND, "integer"); - if (resolution.unresolved()) { - return resolution; - } + Integer numChunks = (Integer) optionsMap.get(NUM_CHUNKS); + if (numChunks != null && numChunks < 0) { + throw new InvalidArgumentException("[{}] cannot be negative, found [{}]", NUM_CHUNKS, numChunks); + } + Integer chunkSize = (Integer) optionsMap.get(CHUNK_SIZE); + if (chunkSize != null && chunkSize < 0) { + throw new InvalidArgumentException("[{}] cannot be negative, found [{}]", CHUNK_SIZE, chunkSize); } - return chunkSize() == null - ? 
TypeResolution.TYPE_RESOLVED - : TypeResolutions.isType(chunkSize(), dt -> dt == INTEGER, sourceText(), THIRD, "integer"); } @Override public boolean foldable() { - return field().foldable() && (numChunks() == null || numChunks().foldable()) && (chunkSize() == null || chunkSize().foldable()); + return field().foldable() && (options() == null || options().foldable()); } @Override @@ -139,37 +158,25 @@ public Expression replaceChildren(List newChildren) { return new Chunk( source(), newChildren.get(0), // field - numChunks == null ? null : newChildren.get(1), - chunkSize == null ? null : newChildren.get(2) + newChildren.size() > 1 ? newChildren.get(1) : null // options ); } @Override protected NodeInfo info() { - return NodeInfo.create(this, Chunk::new, field, numChunks, chunkSize); + return NodeInfo.create(this, Chunk::new, field, options); } Expression field() { return field; } - Expression numChunks() { - return numChunks; - } - - Expression chunkSize() { - return chunkSize; + Expression options() { + return options; } @Evaluator(extraName = "BytesRef") static void process(BytesRefBlock.Builder builder, BytesRef str, int numChunks, int chunkSize) { - if (numChunks < 0) { - throw new IllegalArgumentException("Num chunks parameter cannot be negative, found [" + numChunks + "]"); - } - if (chunkSize < 0) { - throw new IllegalArgumentException("Chunk size parameter cannot be negative, found [" + chunkSize + "]"); - } - String content = str.utf8ToString(); ChunkingSettings settings = new SentenceBoundaryChunkingSettings(chunkSize, 0); @@ -202,38 +209,27 @@ public static List chunkText(String content, ChunkingSettings chunkingSe public boolean equals(Object o) { if (o == null || getClass() != o.getClass()) return false; Chunk chunk = (Chunk) o; - return Objects.equals(field(), chunk.field()) - && Objects.equals(numChunks(), chunk.numChunks()) - && Objects.equals(chunkSize(), chunk.chunkSize()); + return Objects.equals(field(), chunk.field()) && Objects.equals(options(), 
chunk.options()); } @Override public int hashCode() { - return Objects.hash(field(), numChunks(), chunkSize()); - } - - private static List fields(Expression field, Expression numChunks, Expression chunkSize) { - List list = new ArrayList<>(4); - list.add(field); - if (numChunks != null) { - list.add(numChunks); - } - if (chunkSize != null) { - list.add(chunkSize); - } - return list; + return Objects.hash(field(), options()); } @Override public EvalOperator.ExpressionEvaluator.Factory toEvaluator(ToEvaluator toEvaluator) { + + MapExpression optionsMap = options() != null ? (MapExpression) options() : null; + return new ChunkBytesRefEvaluator.Factory( source(), toEvaluator.apply(field), - numChunks != null - ? toEvaluator.apply(numChunks) + optionsMap != null && optionsMap.containsKey("num_chunks") + ? toEvaluator.apply(optionsMap.get("num_chunks")) : toEvaluator.apply(new Literal(source(), DEFAULT_NUM_CHUNKS, DataType.INTEGER)), - chunkSize != null - ? toEvaluator.apply(chunkSize) + optionsMap != null && optionsMap.containsKey("chunk_size") + ? 
toEvaluator.apply(optionsMap.get("chunk_size")) : toEvaluator.apply(new Literal(source(), DEFAULT_CHUNK_SIZE, DataType.INTEGER)) ); } diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java index 3a0d20c223cfa..2d6cacd269cf1 100644 --- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java +++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java @@ -2813,21 +2813,39 @@ STATS max(max_over_time(network.connections)) BY host, time_bucket = bucket(@tim can only be used after STATS when used with TS command""")); } - public void testChunkFunctionWithNullInputs() { - query("from test | eval chunks = chunk(body, null, 20)", fullTextAnalyzer); - query("from test | eval chunks = chunk(body, 5, null)", fullTextAnalyzer); - query("from test | eval chunks = chunk(null, 5, 20)", fullTextAnalyzer); - } - public void testChunkFunctionInvalidInputs() { if (EsqlCapabilities.Cap.CHUNK_FUNCTION.isEnabled()) { assertThat( - error("from test | EVAL chunks = CHUNK(body, \"foo\", 20)", fullTextAnalyzer), - equalTo("1:39: Cannot convert string [foo] to [INTEGER], error [Cannot parse number [foo]]") + error( + "from test | EVAL chunks = CHUNK(body, {\"num_chunks\": null, \"chunk_size\": 20})", + fullTextAnalyzer, + ParsingException.class + ), + equalTo("1:39: Invalid named parameter [\"num_chunks\":null], NULL is not supported") + ); + assertThat( + error( + "from test | EVAL chunks = CHUNK(body, {\"num_chunks\": 3, \"chunk_size\": null})", + fullTextAnalyzer, + ParsingException.class + ), + equalTo("1:39: Invalid named parameter [\"chunk_size\":null], NULL is not supported") + ); + assertThat( + error("from test | EVAL chunks = CHUNK(body, {\"num_chunks\":\"foo\"})", fullTextAnalyzer), + equalTo("1:27: Invalid option [num_chunks] in [CHUNK(body, {\"num_chunks\":\"foo\"})], 
cannot cast [foo] to [integer]") + ); + assertThat( + error("from test | EVAL chunks = CHUNK(body, {\"chunk_size\":\"foo\"})", fullTextAnalyzer), + equalTo("1:27: Invalid option [chunk_size] in [CHUNK(body, {\"chunk_size\":\"foo\"})], cannot cast [foo] to [integer]") + ); + assertThat( + error("from test | EVAL chunks = CHUNK(body, {\"num_chunks\":-1})", fullTextAnalyzer), + equalTo("1:27: [num_chunks] cannot be negative, found [-1]") ); assertThat( - error("from test | EVAL chunks = CHUNK(body, 5, \"foo\")", fullTextAnalyzer), - equalTo("1:42: Cannot convert string [foo] to [INTEGER], error [Cannot parse number [foo]]") + error("from test | EVAL chunks = CHUNK(body, {\"chunk_size\":-1})", fullTextAnalyzer), + equalTo("1:27: [chunk_size] cannot be negative, found [-1]") ); } } diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkTests.java index a7feea698f696..21592b5b95424 100644 --- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkTests.java +++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkTests.java @@ -17,18 +17,22 @@ import org.elasticsearch.xpack.core.inference.chunking.SentenceBoundaryChunkingSettings; import org.elasticsearch.xpack.esql.core.expression.Expression; import org.elasticsearch.xpack.esql.core.expression.Literal; +import org.elasticsearch.xpack.esql.core.expression.MapExpression; import org.elasticsearch.xpack.esql.core.tree.Source; import org.elasticsearch.xpack.esql.core.type.DataType; import org.elasticsearch.xpack.esql.expression.function.AbstractScalarFunctionTestCase; import org.elasticsearch.xpack.esql.expression.function.TestCaseSupplier; +import java.util.ArrayList; import java.util.List; +import java.util.Objects; import java.util.function.Supplier; 
import java.util.stream.Collectors; import java.util.stream.IntStream; import static org.elasticsearch.compute.data.BlockUtils.toJavaObject; -import static org.hamcrest.Matchers.containsString; +import static org.elasticsearch.xpack.esql.expression.function.scalar.string.Chunk.CHUNK_SIZE; +import static org.elasticsearch.xpack.esql.expression.function.scalar.string.Chunk.NUM_CHUNKS; import static org.hamcrest.Matchers.equalTo; public class ChunkTests extends AbstractScalarFunctionTestCase { @@ -56,45 +60,41 @@ private static String randomWordsBetween(int min, int max) { public static Iterable parameters() { return parameterSuppliersFromTypedDataWithDefaultChecks( true, - List.of(new TestCaseSupplier("Chunk basic test", List.of(DataType.KEYWORD, DataType.INTEGER, DataType.INTEGER), () -> { + List.of(new TestCaseSupplier("Chunk with defaults", List.of(DataType.KEYWORD), () -> { String text = randomWordsBetween(25, 50); - int numChunks = between(1, 5); - int chunkSize = between(10, 20); - ChunkingSettings chunkingSettings = new SentenceBoundaryChunkingSettings(chunkSize, 0); + ChunkingSettings chunkingSettings = new SentenceBoundaryChunkingSettings(Chunk.DEFAULT_CHUNK_SIZE, 0); - List chunks = Chunk.chunkText(text, chunkingSettings, numChunks); + List chunks = Chunk.chunkText(text, chunkingSettings, Chunk.DEFAULT_NUM_CHUNKS); Object expectedResult = chunks.size() == 1 ? 
new BytesRef(chunks.get(0).trim()) : chunks.stream().map(s -> new BytesRef(s.trim())).toList(); return new TestCaseSupplier.TestCase( - List.of( - new TestCaseSupplier.TypedData(new BytesRef(text), DataType.KEYWORD, "str"), - new TestCaseSupplier.TypedData(numChunks, DataType.INTEGER, "num_chunks"), - new TestCaseSupplier.TypedData(chunkSize, DataType.INTEGER, "chunk_size") - ), - "ChunkBytesRefEvaluator[str=Attribute[channel=0], numChunks=Attribute[channel=1], chunkSize=Attribute[channel=2]]", + List.of(new TestCaseSupplier.TypedData(new BytesRef(text), DataType.KEYWORD, "str")), + "ChunkBytesRefEvaluator[str=Attribute[channel=0], numChunks=LiteralsEvaluator[lit=" + + Chunk.DEFAULT_NUM_CHUNKS + + "], chunkSize=LiteralsEvaluator[lit=" + + Chunk.DEFAULT_CHUNK_SIZE + + "]]", DataType.KEYWORD, equalTo(expectedResult) ); - }), new TestCaseSupplier("Chunk basic test with text input", List.of(DataType.TEXT, DataType.INTEGER, DataType.INTEGER), () -> { + }), new TestCaseSupplier("Chunk with defaults text input", List.of(DataType.TEXT), () -> { String text = randomWordsBetween(25, 50); - int numChunks = between(1, 5); - int chunkSize = between(10, 20); - ChunkingSettings chunkingSettings = new SentenceBoundaryChunkingSettings(chunkSize, 0); + ChunkingSettings chunkingSettings = new SentenceBoundaryChunkingSettings(Chunk.DEFAULT_CHUNK_SIZE, 0); - List chunks = Chunk.chunkText(text, chunkingSettings, numChunks); + List chunks = Chunk.chunkText(text, chunkingSettings, Chunk.DEFAULT_NUM_CHUNKS); Object expectedResult = chunks.size() == 1 ? 
new BytesRef(chunks.get(0).trim()) : chunks.stream().map(s -> new BytesRef(s.trim())).toList(); return new TestCaseSupplier.TestCase( - List.of( - new TestCaseSupplier.TypedData(new BytesRef(text), DataType.TEXT, "str"), - new TestCaseSupplier.TypedData(numChunks, DataType.INTEGER, "num_chunks"), - new TestCaseSupplier.TypedData(chunkSize, DataType.INTEGER, "chunk_size") - ), - "ChunkBytesRefEvaluator[str=Attribute[channel=0], numChunks=Attribute[channel=1], chunkSize=Attribute[channel=2]]", + List.of(new TestCaseSupplier.TypedData(new BytesRef(text), DataType.TEXT, "str")), + "ChunkBytesRefEvaluator[str=Attribute[channel=0], numChunks=LiteralsEvaluator[lit=" + + Chunk.DEFAULT_NUM_CHUNKS + + "], chunkSize=LiteralsEvaluator[lit=" + + Chunk.DEFAULT_CHUNK_SIZE + + "]]", DataType.KEYWORD, equalTo(expectedResult) ); @@ -102,57 +102,72 @@ public static Iterable parameters() { ); } + private static MapExpression createOptionsMap(Integer numChunks, Integer chunkSize) { + List keyValuePairs = new ArrayList<>(); + + if (Objects.nonNull(numChunks)) { + keyValuePairs.add(Literal.keyword(Source.EMPTY, NUM_CHUNKS)); + keyValuePairs.add(new Literal(Source.EMPTY, numChunks, DataType.INTEGER)); + } + + if (Objects.nonNull(chunkSize)) { + keyValuePairs.add(Literal.keyword(Source.EMPTY, CHUNK_SIZE)); + keyValuePairs.add(new Literal(Source.EMPTY, chunkSize, DataType.INTEGER)); + } + + return new MapExpression(Source.EMPTY, keyValuePairs); + } + @Override protected Expression build(Source source, List args) { - return new Chunk(source, args.get(0), args.size() < 2 ? null : args.get(1), args.size() < 3 ? null : args.get(2)); + // With MapParam, args contains: field, options_map + Expression options = args.size() < 2 ? null : args.get(1); + // TODO needed? 
+ if (options instanceof Literal lit && lit.value() == null) { + options = null; + } + return new Chunk(source, args.get(0), options); } - public void testNegativeNumChunks() { - IllegalArgumentException ex = expectThrows(IllegalArgumentException.class, () -> process("a tiger", -1, 10)); - assertThat(ex.getMessage(), containsString("Num chunks parameter cannot be negative, found [-1]")); + public void testDefaults() { + // Default of 300 is huge, only one chunk returned in this case + verifyChunks(null, null, 1); } - public void testNegativeChunkSize() { - IllegalArgumentException ex = expectThrows(IllegalArgumentException.class, () -> process("a tiger", 1, -1)); - assertThat(ex.getMessage(), containsString("Chunk size parameter cannot be negative, found [-1]")); + public void testDefaultNumChunks() { + int chunkSize = 20; + verifyChunks(null, chunkSize, 8); } - public void testDefaults() { - ChunkingSettings settings = new SentenceBoundaryChunkingSettings(Chunk.DEFAULT_CHUNK_SIZE, 0); - List expected = Chunk.chunkText(PARAGRAPH_INPUT, settings, Chunk.DEFAULT_NUM_CHUNKS).stream().map(String::trim).toList(); - - List result = process(PARAGRAPH_INPUT, null, null); - assertThat(result, equalTo(expected)); + public void testDefaultChunkSize() { + int numChunks = 1; // Default of 300 is huge, only one chunk returned in this case + verifyChunks(numChunks, null, numChunks); } - public void testDefaultNumChunks() { + public void testSpecifiedOptions() { + int numChunks = randomIntBetween(2, 4); int chunkSize = randomIntBetween(20, 30); - ChunkingSettings settings = new SentenceBoundaryChunkingSettings(chunkSize, 0); - List expected = Chunk.chunkText(PARAGRAPH_INPUT, settings, Chunk.DEFAULT_NUM_CHUNKS).stream().map(String::trim).toList(); - - List result = process(PARAGRAPH_INPUT, null, chunkSize); - assertThat(result, equalTo(expected)); + verifyChunks(numChunks, chunkSize, numChunks); } - public void testDefaultChunkSize() { - int numChunks = randomIntBetween(1, 3); - 
ChunkingSettings settings = new SentenceBoundaryChunkingSettings(Chunk.DEFAULT_CHUNK_SIZE, 0); - List expected = Chunk.chunkText(PARAGRAPH_INPUT, settings, numChunks).stream().map(String::trim).toList(); + private void verifyChunks(Integer numChunks, Integer chunkSize, int expectedNumChunksReturned) { + int numChunksOrDefault = numChunks != null ? numChunks : Chunk.DEFAULT_NUM_CHUNKS; + int chunkSizeOrDefault = chunkSize != null ? chunkSize : Chunk.DEFAULT_CHUNK_SIZE; + ChunkingSettings settings = new SentenceBoundaryChunkingSettings(chunkSizeOrDefault, 0); + List expected = Chunk.chunkText(PARAGRAPH_INPUT, settings, numChunksOrDefault).stream().map(String::trim).toList(); - List result = process(PARAGRAPH_INPUT, numChunks, null); + List result = process(PARAGRAPH_INPUT, numChunksOrDefault, chunkSizeOrDefault); + assertThat(result.size(), equalTo(expectedNumChunksReturned)); assertThat(result, equalTo(expected)); } private List process(String str, Integer numChunks, Integer chunkSize) { + MapExpression optionsMap = (numChunks == null && chunkSize == null) ? null : createOptionsMap(numChunks, chunkSize); + try ( - EvalOperator.ExpressionEvaluator eval = evaluator( - new Chunk( - Source.EMPTY, - field("str", DataType.KEYWORD), - numChunks == null ? null : new Literal(Source.EMPTY, numChunks, DataType.INTEGER), - chunkSize == null ? 
null : new Literal(Source.EMPTY, chunkSize, DataType.INTEGER) - ) - ).get(driverContext()); + EvalOperator.ExpressionEvaluator eval = evaluator(new Chunk(Source.EMPTY, field("str", DataType.KEYWORD), optionsMap)).get( + driverContext() + ); Block block = eval.eval(row(List.of(new BytesRef(str)))) ) { if (block.isNull(0)) { From 2ae29894c07bfa59f8d8ce5a0178521c4e8120ec Mon Sep 17 00:00:00 2001 From: Kathleen DeRusso Date: Fri, 24 Oct 2025 16:14:15 -0400 Subject: [PATCH 22/29] Cleanup --- .../expression/function/EsqlFunctionRegistry.java | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/EsqlFunctionRegistry.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/EsqlFunctionRegistry.java index c138fa2418b26..c62560f330f15 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/EsqlFunctionRegistry.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/EsqlFunctionRegistry.java @@ -1040,22 +1040,13 @@ public interface BinaryBuilder { @SuppressWarnings("overloads") // These are ambiguous if you aren't using ctor references but we always do protected static FunctionDefinition def(Class function, TernaryBuilder ctorRef, String... 
names) { FunctionBuilder builder = (source, children, cfg) -> { - boolean hasMinimumOne = TwoOptionalArguments.class.isAssignableFrom(function); boolean hasMinimumTwo = OptionalArgument.class.isAssignableFrom(function); - if (hasMinimumOne && (children.size() > 3 || children.isEmpty())) { - throw new QlIllegalArgumentException("expects minimum one, maximum three arguments"); - } else if (hasMinimumTwo && (children.size() > 3 || children.size() < 2)) { + if (hasMinimumTwo && (children.size() > 3 || children.size() < 2)) { throw new QlIllegalArgumentException("expects two or three arguments"); - } else if (hasMinimumOne == false && hasMinimumTwo == false && children.size() != 3) { + } else if (hasMinimumTwo == false && children.size() != 3) { throw new QlIllegalArgumentException("expects exactly three arguments"); } - - return ctorRef.build( - source, - children.get(0), - children.size() > 1 ? children.get(1) : null, - children.size() == 3 ? children.get(2) : null - ); + return ctorRef.build(source, children.get(0), children.get(1), children.size() == 3 ? 
children.get(2) : null); }; return def(function, builder, names); } From 114c0dabad0b01ef6cefc5a524d04688dafe0576 Mon Sep 17 00:00:00 2001 From: Kathleen DeRusso Date: Fri, 24 Oct 2025 16:18:18 -0400 Subject: [PATCH 23/29] Regenerate docs --- .../query-languages/esql/_snippets/functions/examples/chunk.md | 2 +- .../esql/_snippets/functions/parameters/chunk.md | 2 +- .../query-languages/esql/kibana/definition/functions/chunk.json | 2 +- .../query-languages/esql/kibana/docs/functions/chunk.md | 2 +- .../xpack/esql/expression/function/scalar/string/Chunk.java | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/reference/query-languages/esql/_snippets/functions/examples/chunk.md b/docs/reference/query-languages/esql/_snippets/functions/examples/chunk.md index 2cc836bcd7178..4f875b1214fab 100644 --- a/docs/reference/query-languages/esql/_snippets/functions/examples/chunk.md +++ b/docs/reference/query-languages/esql/_snippets/functions/examples/chunk.md @@ -8,7 +8,7 @@ stack: preview 9.3.0 ```esql FROM books -| EVAL chunks = CHUNK(description, 1, 20) +| EVAL chunks = CHUNK(description, {"num_chunks":1, "chunk_size":20}) ``` | book_no:keyword | title:text | chunks:keyword | diff --git a/docs/reference/query-languages/esql/_snippets/functions/parameters/chunk.md b/docs/reference/query-languages/esql/_snippets/functions/parameters/chunk.md index ce86379c1254c..f287627d571ee 100644 --- a/docs/reference/query-languages/esql/_snippets/functions/parameters/chunk.md +++ b/docs/reference/query-languages/esql/_snippets/functions/parameters/chunk.md @@ -6,5 +6,5 @@ : The input to chunk. `options` -: TODO +: Options to customize chunking behavior. 
diff --git a/docs/reference/query-languages/esql/kibana/definition/functions/chunk.json b/docs/reference/query-languages/esql/kibana/definition/functions/chunk.json index f2d9af38d0e15..2be5b9665c320 100644 --- a/docs/reference/query-languages/esql/kibana/definition/functions/chunk.json +++ b/docs/reference/query-languages/esql/kibana/definition/functions/chunk.json @@ -30,7 +30,7 @@ } ], "examples" : [ - "FROM books\n| EVAL chunks = CHUNK(description, 1, 20)" + "FROM books\n| EVAL chunks = CHUNK(description, {\"num_chunks\":1, \"chunk_size\":20})" ], "preview" : true, "snapshot_only" : true diff --git a/docs/reference/query-languages/esql/kibana/docs/functions/chunk.md b/docs/reference/query-languages/esql/kibana/docs/functions/chunk.md index 2ecd49ef0b487..2af9e41799859 100644 --- a/docs/reference/query-languages/esql/kibana/docs/functions/chunk.md +++ b/docs/reference/query-languages/esql/kibana/docs/functions/chunk.md @@ -5,5 +5,5 @@ Use `CHUNK` to split a text field into smaller chunks. ```esql FROM books -| EVAL chunks = CHUNK(description, 1, 20) +| EVAL chunks = CHUNK(description, {"num_chunks":1, "chunk_size":20}) ``` diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java index 03a08c8261c05..c2c853476c68f 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java @@ -79,7 +79,7 @@ public Chunk( type = "integer", description = "The size of sentence-based chunks to use. 
Defaults to " + DEFAULT_CHUNK_SIZE ), }, - description = "TODO", + description = "Options to customize chunking behavior.", optional = true ) Expression options ) { From 45c517a5e4ce6e74858a701ef61e8b3de8da2b3a Mon Sep 17 00:00:00 2001 From: Kathleen DeRusso Date: Fri, 24 Oct 2025 16:30:45 -0400 Subject: [PATCH 24/29] Add test on a concatenated field --- .../src/main/resources/chunk.csv-spec | 19 +++++++++++++++++++ .../function/scalar/string/Chunk.java | 2 +- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/chunk.csv-spec b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/chunk.csv-spec index 96ff691708faa..8cfbc0d6f6017 100644 --- a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/chunk.csv-spec +++ b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/chunk.csv-spec @@ -89,6 +89,25 @@ book_no:keyword | title:text 7350 | Return of the Shadow | first volume) Christopher Tolkien describes, with full citation of the earliest notes, outline plans, and narrative drafts, the intricate evolution ; +chunkTextWithConcatenatedField +required_capability: chunk_function + +FROM books +| EVAL title_description = CONCAT(title, description) +| EVAL chunks = CHUNK(title_description, {"num_chunks":1, "chunk_size":20}) +| KEEP book_no, title, chunks +| SORT book_no +| LIMIT 5 +; + +book_no:keyword | title:text | chunks:keyword +1211 | The brothers Karamazov | The brothers KaramazovIn 1880 Dostoevsky completed The Brothers Karamazov, the literary effort for which he had been preparing all his +1463 | Realms of Tolkien: Images of Middle-earth | Realms of Tolkien: Images of Middle-earthTwenty new and familiar Tolkien artists are represented in this fabulous volume, breathing an +1502 | Selected Passages from Correspondence with Friends | Selected Passages from Correspondence with FriendsNikolai Gogol wrote some letters to his friends, none of which were a nose of +1937 | The Best Short Stories of 
Dostoevsky (Modern Library) | The Best Short Stories of Dostoevsky (Modern Library)This collection, unique to the Modern Library, gathers seven of Dostoevsky's key +1985 | Brothers Karamazov | Brothers KaramazovFour brothers reunite in their hometown in Russia. +; + diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java index c2c853476c68f..cd00da6e1e736 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java @@ -201,7 +201,7 @@ public static List chunkText(String content, ChunkingSettings chunkingSe return chunker.chunk(content, chunkingSettings) .stream() .map(offset -> content.substring(offset.start(), offset.end())) - .limit(numChunks > 0 ? numChunks : Long.MAX_VALUE) + .limit(numChunks > 0 ? numChunks : DEFAULT_NUM_CHUNKS) .toList(); } From 7808118824b6c04ff3595dd4c737e87f5387c2ee Mon Sep 17 00:00:00 2001 From: Kathleen DeRusso Date: Mon, 27 Oct 2025 09:30:11 -0400 Subject: [PATCH 25/29] Add multivalued field test --- .../src/main/resources/chunk.csv-spec | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/chunk.csv-spec b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/chunk.csv-spec index 8cfbc0d6f6017..1bf73acb2999b 100644 --- a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/chunk.csv-spec +++ b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/chunk.csv-spec @@ -108,6 +108,26 @@ book_no:keyword | title:text | chunks 1985 | Brothers Karamazov | Brothers KaramazovFour brothers reunite in their hometown in Russia. 
; +chunkTextWithMultivaluedField +required_capability: chunk_function + +FROM employees +| EVAL chunks = CHUNK(job_positions) +| KEEP emp_no, first_name, last_name, chunks +| SORT emp_no +| LIMIT 5 +; +warning:Line 2:17: evaluation of [CHUNK(job_positions)] failed, treating result as null. Only first 20 failures recorded. +warning:Line 2:17: java.lang.IllegalArgumentException: single-value function encountered multi-value + +emp_no:integer | first_name:keyword | last_name:keyword | chunks:keyword +10001 | Georgi | Facello | null +10002 | Bezalel | Simmel | Senior Team Lead +10003 | Parto | Bamford | null +10004 | Chirstian | Koblick | null +10005 | Kyoichi | Maliniak | null +; + From f0a23f06382760e1117c60d9915a2b4c098b6c56 Mon Sep 17 00:00:00 2001 From: Kathleen DeRusso Date: Tue, 28 Oct 2025 14:03:43 -0400 Subject: [PATCH 26/29] Don't hardcode strings --- .../esql/expression/function/scalar/string/Chunk.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java index cd00da6e1e736..ee732d67eb826 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java @@ -35,6 +35,7 @@ import org.elasticsearch.xpack.esql.io.stream.PlanStreamInput; import java.io.IOException; +import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.Objects; @@ -197,7 +198,6 @@ static void process(BytesRefBlock.Builder builder, BytesRef str, int numChunks, public static List chunkText(String content, ChunkingSettings chunkingSettings, int numChunks) { Chunker chunker = ChunkerBuilder.fromChunkingStrategy(chunkingSettings.getChunkingStrategy()); - return 
chunker.chunk(content, chunkingSettings) .stream() .map(offset -> content.substring(offset.start(), offset.end())) @@ -225,11 +225,11 @@ public EvalOperator.ExpressionEvaluator.Factory toEvaluator(ToEvaluator toEvalua return new ChunkBytesRefEvaluator.Factory( source(), toEvaluator.apply(field), - optionsMap != null && optionsMap.containsKey("num_chunks") - ? toEvaluator.apply(optionsMap.get("num_chunks")) + optionsMap != null && optionsMap.containsKey(NUM_CHUNKS) + ? toEvaluator.apply(optionsMap.get(NUM_CHUNKS)) : toEvaluator.apply(new Literal(source(), DEFAULT_NUM_CHUNKS, DataType.INTEGER)), - optionsMap != null && optionsMap.containsKey("chunk_size") - ? toEvaluator.apply(optionsMap.get("chunk_size")) + optionsMap != null && optionsMap.containsKey(CHUNK_SIZE) + ? toEvaluator.apply(optionsMap.get(CHUNK_SIZE)) : toEvaluator.apply(new Literal(source(), DEFAULT_CHUNK_SIZE, DataType.INTEGER)) ); } From 6a1da0e23baa99356cf182e04eb0666d52a025d0 Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Tue, 28 Oct 2025 18:28:21 +0000 Subject: [PATCH 27/29] [CI] Auto commit changes from spotless --- .../xpack/esql/expression/function/scalar/string/Chunk.java | 1 - 1 file changed, 1 deletion(-) diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java index ee732d67eb826..067fe9f4c200b 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java @@ -35,7 +35,6 @@ import org.elasticsearch.xpack.esql.io.stream.PlanStreamInput; import java.io.IOException; -import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.Objects; From ee4d532f40a55b9ce6777beb430ebcd407c7be36 Mon Sep 17 00:00:00 2001 From: Kathleen 
DeRusso Date: Wed, 29 Oct 2025 14:44:42 -0400 Subject: [PATCH 28/29] PR feedback --- .../function/scalar/string/Chunk.java | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java index 067fe9f4c200b..c11063616b88d 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java @@ -35,6 +35,7 @@ import org.elasticsearch.xpack.esql.io.stream.PlanStreamInput; import java.io.IOException; +import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Objects; @@ -219,17 +220,19 @@ public int hashCode() { @Override public EvalOperator.ExpressionEvaluator.Factory toEvaluator(ToEvaluator toEvaluator) { - MapExpression optionsMap = options() != null ? (MapExpression) options() : null; + Map optionsMap = new HashMap<>(); + if (options() != null) { + Options.populateMap(((MapExpression) options), optionsMap, source(), SECOND, ALLOWED_OPTIONS); + } + + int numChunks = (Integer) optionsMap.getOrDefault(NUM_CHUNKS, DEFAULT_NUM_CHUNKS); + int chunkSize = (Integer) optionsMap.getOrDefault(CHUNK_SIZE, DEFAULT_CHUNK_SIZE); return new ChunkBytesRefEvaluator.Factory( source(), toEvaluator.apply(field), - optionsMap != null && optionsMap.containsKey(NUM_CHUNKS) - ? toEvaluator.apply(optionsMap.get(NUM_CHUNKS)) - : toEvaluator.apply(new Literal(source(), DEFAULT_NUM_CHUNKS, DataType.INTEGER)), - optionsMap != null && optionsMap.containsKey(CHUNK_SIZE) - ? 
toEvaluator.apply(optionsMap.get(CHUNK_SIZE)) - : toEvaluator.apply(new Literal(source(), DEFAULT_CHUNK_SIZE, DataType.INTEGER)) + toEvaluator.apply(new Literal(source(), numChunks, DataType.INTEGER)), + toEvaluator.apply(new Literal(source(), chunkSize, DataType.INTEGER)) ); } } From d12852c3af037144d8c9f96f94fda911c3679da1 Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Wed, 29 Oct 2025 20:28:27 +0000 Subject: [PATCH 29/29] [CI] Auto commit changes from spotless --- .../org/elasticsearch/xpack/esql/action/EsqlCapabilities.java | 1 - 1 file changed, 1 deletion(-) diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java index 9d58e6cd50cc3..7097a3f256a2b 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java @@ -1619,7 +1619,6 @@ public enum Cap { */ FIX_REPLACE_ALIASING_EVAL_WITH_PROJECT_SHADOWING, - /** * Chunk function. */