From 02231c34a3e3029b6f81465ab57c999070428bc3 Mon Sep 17 00:00:00 2001 From: Kathleen DeRusso Date: Fri, 14 Nov 2025 08:54:16 -0500 Subject: [PATCH 01/15] Stash claude changes --- .../function/scalar/string/Chunk.java | 122 +++++++++++++----- 1 file changed, 90 insertions(+), 32 deletions(-) diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java index c11063616b88d..45bb1edacd412 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java @@ -19,6 +19,7 @@ import org.elasticsearch.xpack.core.inference.chunking.ChunkerBuilder; import org.elasticsearch.xpack.core.inference.chunking.SentenceBoundaryChunkingSettings; import org.elasticsearch.xpack.esql.core.InvalidArgumentException; +import org.elasticsearch.xpack.esql.core.expression.EntryExpression; import org.elasticsearch.xpack.esql.core.expression.Expression; import org.elasticsearch.xpack.esql.core.expression.Literal; import org.elasticsearch.xpack.esql.core.expression.MapExpression; @@ -49,14 +50,15 @@ public class Chunk extends EsqlScalarFunction implements OptionalArgument { public static final NamedWriteableRegistry.Entry ENTRY = new NamedWriteableRegistry.Entry(Expression.class, "Chunk", Chunk::new); public static final int DEFAULT_NUM_CHUNKS = Integer.MAX_VALUE; - public static final int DEFAULT_CHUNK_SIZE = 300; + static final int DEFAULT_CHUNK_SIZE = 300; + public static final ChunkingSettings DEFAULT_CHUNKING_SETTINGS = new SentenceBoundaryChunkingSettings(DEFAULT_CHUNK_SIZE, 0); private final Expression field, options; static final String NUM_CHUNKS = "num_chunks"; - static final String CHUNK_SIZE = "chunk_size"; + static final String CHUNKING_SETTINGS = "chunking_settings"; - public static final Map ALLOWED_OPTIONS = Map.of(NUM_CHUNKS, DataType.INTEGER, CHUNK_SIZE, DataType.INTEGER); + public static final Map ALLOWED_OPTIONS = Map.of(NUM_CHUNKS, DataType.INTEGER, CHUNKING_SETTINGS, DataType.OBJECT); @FunctionInfo(returnType = "keyword", preview = true, description = """ Use `CHUNK` to split a text field into smaller chunks.""", detailedDescription = """ @@ -76,9 +78,10 @@ public Chunk( description = "The number of chunks to return. Defaults to return all chunks." ), @MapParam.MapParamEntry( - name = "chunk_size", - type = "integer", - description = "The size of sentence-based chunks to use. Defaults to " + DEFAULT_CHUNK_SIZE + name = "chunking_settings", + type = "object", + description = "The chunking settings with which to apply to the field. " + + "If no chunking settings are specified, defaults to sentence-based chunks of size " + DEFAULT_CHUNK_SIZE ), }, description = "Options to customize chunking behavior.", optional = true @@ -89,17 +92,6 @@ public Chunk( this.options = options; } - private Chunk( - Source source, - Expression field, - Expression options, - boolean unused // dummy parameter to differentiate constructors - ) { - super(source, options == null ? List.of(field) : List.of(field, options)); - this.field = field; - this.options = options; - } - public Chunk(StreamInput in) throws IOException { this( Source.readFrom((PlanStreamInput) in), @@ -130,23 +122,71 @@ protected TypeResolution resolveType() { if (childrenResolved() == false) { return new TypeResolution("Unresolved children"); } - return isString(field(), sourceText(), FIRST).and(Options.resolve(options, source(), SECOND, ALLOWED_OPTIONS, this::verifyOptions)); - } - private void verifyOptions(Map optionsMap) { + TypeResolution fieldResolution = isString(field(), sourceText(), FIRST); + if (fieldResolution.unresolved()) { + return fieldResolution; + } + if (options == null) { - return; + return TypeResolution.TYPE_RESOLVED; } - Integer numChunks = (Integer) optionsMap.get(NUM_CHUNKS); - if (numChunks != null && numChunks < 0) { - throw new InvalidArgumentException("[{}] cannot be negative, found [{}]", NUM_CHUNKS, numChunks); + // Custom validation for options since we need to handle nested MapExpression for chunking_settings + if (options instanceof MapExpression == false) { + return new TypeResolution("second argument of [" + sourceText() + "] must be a map"); } - Integer chunkSize = (Integer) optionsMap.get(CHUNK_SIZE); - if (chunkSize != null && chunkSize < 0) { - throw new InvalidArgumentException("[{}] cannot be negative, found [{}]", CHUNK_SIZE, chunkSize); + + MapExpression mapExpr = (MapExpression) options; + for (EntryExpression entry : mapExpr.entryExpressions()) { + if (entry.key() instanceof Literal == false || ((Literal) entry.key()).foldable() == false) { + return new TypeResolution("option names must be constants in [" + sourceText() + "]"); + } + + Object keyValue = ((Literal) entry.key()).value(); + String optionName = keyValue instanceof BytesRef br ? br.utf8ToString() : keyValue.toString(); + + if (NUM_CHUNKS.equals(optionName)) { + if (entry.value() instanceof Literal == false) { + return new TypeResolution("[" + NUM_CHUNKS + "] must be a constant"); + } + Literal value = (Literal) entry.value(); + if (value.dataType() != DataType.INTEGER) { + return new TypeResolution("[" + NUM_CHUNKS + "] must be an integer, found [" + value.dataType() + "]"); + } + Integer numChunks = (Integer) value.value(); + if (numChunks != null && numChunks < 0) { + return new TypeResolution("[" + NUM_CHUNKS + "] cannot be negative, found [" + numChunks + "]"); + } + } else if (CHUNKING_SETTINGS.equals(optionName)) { + if (entry.value() instanceof MapExpression == false) { + return new TypeResolution("[" + CHUNKING_SETTINGS + "] must be a map, found [" + entry.value().getClass().getSimpleName() + "]"); + } + // Validate the nested map has valid keys/values + TypeResolution chunkingSettingsResolution = validateChunkingSettings((MapExpression) entry.value()); + if (chunkingSettingsResolution.unresolved()) { + return chunkingSettingsResolution; + } + } else { + return new TypeResolution("Invalid option [" + optionName + "], expected one of [" + String.join(", ", ALLOWED_OPTIONS.keySet()) + "]"); + } } + return TypeResolution.TYPE_RESOLVED; + } + + private TypeResolution validateChunkingSettings(MapExpression chunkingSettingsMap) { + // Basic validation - just ensure all keys are literals and all values are literals + // The actual validation will be done by ChunkingSettingsBuilder.fromMap() at evaluation time + for (EntryExpression entry : chunkingSettingsMap.entryExpressions()) { + if (entry.key() instanceof Literal == false || ((Literal) entry.key()).foldable() == false) { + return new TypeResolution("chunking_settings keys must be constants"); + } + if (entry.value() instanceof Literal == false || ((Literal) entry.value()).foldable() == false) { + return new TypeResolution("chunking_settings values must be constants"); + } + } + return TypeResolution.TYPE_RESOLVED; } @Override @@ -219,15 +259,33 @@ public int hashCode() { @Override public EvalOperator.ExpressionEvaluator.Factory toEvaluator(ToEvaluator toEvaluator) { + int numChunks = DEFAULT_NUM_CHUNKS; + int chunkSize = DEFAULT_CHUNK_SIZE; - Map optionsMap = new HashMap<>(); if (options() != null) { - Options.populateMap(((MapExpression) options), optionsMap, source(), SECOND, ALLOWED_OPTIONS); + MapExpression mapExpr = (MapExpression) options(); + for (EntryExpression entry : mapExpr.entryExpressions()) { + Object keyValue = ((Literal) entry.key()).value(); + String optionName = keyValue instanceof BytesRef br ? br.utf8ToString() : keyValue.toString(); + + if (NUM_CHUNKS.equals(optionName)) { + numChunks = (Integer) ((Literal) entry.value()).value(); + } else if (CHUNKING_SETTINGS.equals(optionName)) { + // For now, we'll just extract max_chunk_size from the nested map + // In the future, this should fully support ChunkingSettings + MapExpression chunkingSettingsExpr = (MapExpression) entry.value(); + for (EntryExpression csEntry : chunkingSettingsExpr.entryExpressions()) { + Object csKeyValue = ((Literal) csEntry.key()).value(); + String csKey = csKeyValue instanceof BytesRef br ? br.utf8ToString() : csKeyValue.toString(); + if ("max_chunk_size".equals(csKey)) { + chunkSize = (Integer) ((Literal) csEntry.value()).value(); + break; + } + } + } + } } - int numChunks = (Integer) optionsMap.getOrDefault(NUM_CHUNKS, DEFAULT_NUM_CHUNKS); - int chunkSize = (Integer) optionsMap.getOrDefault(CHUNK_SIZE, DEFAULT_CHUNK_SIZE); - return new ChunkBytesRefEvaluator.Factory( source(), toEvaluator.apply(field), From 5cc0a5d2225a91029d59820db30465f2038ed401 Mon Sep 17 00:00:00 2001 From: Kathleen DeRusso Date: Fri, 14 Nov 2025 11:18:16 -0500 Subject: [PATCH 02/15] Update --- .../function/scalar/string/Chunk.java | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java index 45bb1edacd412..42ac33d01a093 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java @@ -17,8 +17,8 @@ import org.elasticsearch.inference.ChunkingSettings; import org.elasticsearch.xpack.core.inference.chunking.Chunker; import org.elasticsearch.xpack.core.inference.chunking.ChunkerBuilder; +import org.elasticsearch.xpack.core.inference.chunking.ChunkingSettingsBuilder; import org.elasticsearch.xpack.core.inference.chunking.SentenceBoundaryChunkingSettings; -import org.elasticsearch.xpack.esql.core.InvalidArgumentException; import org.elasticsearch.xpack.esql.core.expression.EntryExpression; import org.elasticsearch.xpack.esql.core.expression.Expression; import org.elasticsearch.xpack.esql.core.expression.Literal; @@ -30,19 +30,16 @@ import org.elasticsearch.xpack.esql.expression.function.FunctionInfo; import org.elasticsearch.xpack.esql.expression.function.MapParam; import org.elasticsearch.xpack.esql.expression.function.OptionalArgument; -import org.elasticsearch.xpack.esql.expression.function.Options; import org.elasticsearch.xpack.esql.expression.function.Param; import org.elasticsearch.xpack.esql.expression.function.scalar.EsqlScalarFunction; import org.elasticsearch.xpack.esql.io.stream.PlanStreamInput; import java.io.IOException; -import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Objects; import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.ParamOrdinal.FIRST; -import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.ParamOrdinal.SECOND; import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.isString; public class Chunk extends EsqlScalarFunction implements OptionalArgument { @@ -160,7 +157,8 @@ protected TypeResolution resolveType() { } } else if (CHUNKING_SETTINGS.equals(optionName)) { if (entry.value() instanceof MapExpression == false) { - return new TypeResolution("[" + CHUNKING_SETTINGS + "] must be a map, found [" + entry.value().getClass().getSimpleName() + "]"); + return new TypeResolution( + "[" + CHUNKING_SETTINGS + "] must be a map, found [" + entry.value().getClass().getSimpleName() + "]"); } // Validate the nested map has valid keys/values TypeResolution chunkingSettingsResolution = validateChunkingSettings((MapExpression) entry.value()); @@ -168,7 +166,8 @@ protected TypeResolution resolveType() { return chunkingSettingsResolution; } } else { - return new TypeResolution("Invalid option [" + optionName + "], expected one of [" + String.join(", ", ALLOWED_OPTIONS.keySet()) + "]"); + return new TypeResolution( + "Invalid option [" + optionName + "], expected one of [" + String.join(", ", ALLOWED_OPTIONS.keySet()) + "]"); } } @@ -217,11 +216,11 @@ Expression options() { } @Evaluator(extraName = "BytesRef") - static void process(BytesRefBlock.Builder builder, BytesRef str, int numChunks, int chunkSize) { + static void process(BytesRefBlock.Builder builder, BytesRef str, int numChunks, Map chunkingSettingsMap) { String content = str.utf8ToString(); - ChunkingSettings settings = new SentenceBoundaryChunkingSettings(chunkSize, 0); - List chunks = chunkText(content, settings, numChunks); + ChunkingSettings chunkingSettings = ChunkingSettingsBuilder.fromMap(chunkingSettingsMap); + List chunks = chunkText(content, chunkingSettings, numChunks); boolean multivalued = chunks.size() > 1; if (multivalued) { From deaddf80d824a9506d33574c60ebda411a279261 Mon Sep 17 00:00:00 2001 From: Kathleen DeRusso Date: Fri, 14 Nov 2025 11:22:31 -0500 Subject: [PATCH 03/15] test --- .../function/scalar/string/ChunkTests.java | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkTests.java index 21592b5b95424..ab6802d51a8a7 100644 --- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkTests.java +++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkTests.java @@ -31,7 +31,7 @@ import java.util.stream.IntStream; import static org.elasticsearch.compute.data.BlockUtils.toJavaObject; -import static org.elasticsearch.xpack.esql.expression.function.scalar.string.Chunk.CHUNK_SIZE; +import static org.elasticsearch.xpack.esql.expression.function.scalar.string.Chunk.CHUNKING_SETTINGS; import static org.elasticsearch.xpack.esql.expression.function.scalar.string.Chunk.NUM_CHUNKS; import static org.hamcrest.Matchers.equalTo; @@ -102,7 +102,7 @@ public static Iterable parameters() { ); } - private static MapExpression createOptionsMap(Integer numChunks, Integer chunkSize) { + private static MapExpression createOptionsMap(Integer numChunks, ChunkingSettings chunkingSettings) { List keyValuePairs = new ArrayList<>(); if (Objects.nonNull(numChunks)) { @@ -110,9 +110,9 @@ private static MapExpression createOptionsMap(Integer numChunks, Integer chunkSi keyValuePairs.add(new Literal(Source.EMPTY, numChunks, DataType.INTEGER)); } - if (Objects.nonNull(chunkSize)) { - keyValuePairs.add(Literal.keyword(Source.EMPTY, CHUNK_SIZE)); - keyValuePairs.add(new Literal(Source.EMPTY, chunkSize, DataType.INTEGER)); + if (Objects.nonNull(chunkingSettings)) { + keyValuePairs.add(Literal.keyword(Source.EMPTY, CHUNKING_SETTINGS)); + keyValuePairs.add(new Literal(Source.EMPTY, chunkingSettings.asMap(), DataType.INTEGER)); } return new MapExpression(Source.EMPTY, keyValuePairs); @@ -156,13 +156,14 @@ private void verifyChunks(Integer numChunks, Integer chunkSize, int expectedNumC ChunkingSettings settings = new SentenceBoundaryChunkingSettings(chunkSizeOrDefault, 0); List expected = Chunk.chunkText(PARAGRAPH_INPUT, settings, numChunksOrDefault).stream().map(String::trim).toList(); - List result = process(PARAGRAPH_INPUT, numChunksOrDefault, chunkSizeOrDefault); + List result = process(PARAGRAPH_INPUT, numChunksOrDefault, settings); assertThat(result.size(), equalTo(expectedNumChunksReturned)); assertThat(result, equalTo(expected)); } - private List process(String str, Integer numChunks, Integer chunkSize) { - MapExpression optionsMap = (numChunks == null && chunkSize == null) ? null : createOptionsMap(numChunks, chunkSize); + private List process(String str, Integer numChunks, ChunkingSettings chunkingSettings) { + MapExpression optionsMap = (numChunks == null && chunkingSettings == null) ? null : + createOptionsMap(numChunks, chunkingSettings); try ( EvalOperator.ExpressionEvaluator eval = evaluator(new Chunk(Source.EMPTY, field("str", DataType.KEYWORD), optionsMap)).get( From fd8ef3deb906e8f3763093bda8c899d6345b7a90 Mon Sep 17 00:00:00 2001 From: Kathleen DeRusso Date: Fri, 14 Nov 2025 11:36:22 -0500 Subject: [PATCH 04/15] fix --- .../scalar/string/ChunkBytesRefEvaluator.java | 69 +++++++------------ .../function/scalar/string/Chunk.java | 38 ++++++---- 2 files changed, 47 insertions(+), 60 deletions(-) diff --git a/x-pack/plugin/esql/src/main/generated/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkBytesRefEvaluator.java b/x-pack/plugin/esql/src/main/generated/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkBytesRefEvaluator.java index ed3e581175987..15f26616fc715 100644 --- a/x-pack/plugin/esql/src/main/generated/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkBytesRefEvaluator.java +++ b/x-pack/plugin/esql/src/main/generated/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkBytesRefEvaluator.java @@ -19,6 +19,7 @@ import org.elasticsearch.compute.operator.EvalOperator; import org.elasticsearch.compute.operator.Warnings; import org.elasticsearch.core.Releasables; +import org.elasticsearch.inference.ChunkingSettings; import org.elasticsearch.xpack.esql.core.tree.Source; /** @@ -34,19 +35,19 @@ public final class ChunkBytesRefEvaluator implements EvalOperator.ExpressionEval private final EvalOperator.ExpressionEvaluator numChunks; - private final EvalOperator.ExpressionEvaluator chunkSize; + private final ChunkingSettings chunkingSettings; private final DriverContext driverContext; private Warnings warnings; public ChunkBytesRefEvaluator(Source source, EvalOperator.ExpressionEvaluator str, - EvalOperator.ExpressionEvaluator numChunks, EvalOperator.ExpressionEvaluator chunkSize, + EvalOperator.ExpressionEvaluator numChunks, ChunkingSettings chunkingSettings, DriverContext driverContext) { this.source = source; this.str = str; this.numChunks = numChunks; - this.chunkSize = chunkSize; + this.chunkingSettings = chunkingSettings; this.driverContext = driverContext; } @@ -54,21 +55,15 @@ public ChunkBytesRefEvaluator(Source source, EvalOperator.ExpressionEvaluator st public Block eval(Page page) { try (BytesRefBlock strBlock = (BytesRefBlock) str.eval(page)) { try (IntBlock numChunksBlock = (IntBlock) numChunks.eval(page)) { - try (IntBlock chunkSizeBlock = (IntBlock) chunkSize.eval(page)) { - BytesRefVector strVector = strBlock.asVector(); - if (strVector == null) { - return eval(page.getPositionCount(), strBlock, numChunksBlock, chunkSizeBlock); - } - IntVector numChunksVector = numChunksBlock.asVector(); - if (numChunksVector == null) { - return eval(page.getPositionCount(), strBlock, numChunksBlock, chunkSizeBlock); - } - IntVector chunkSizeVector = chunkSizeBlock.asVector(); - if (chunkSizeVector == null) { - return eval(page.getPositionCount(), strBlock, numChunksBlock, chunkSizeBlock); - } - return eval(page.getPositionCount(), strVector, numChunksVector, chunkSizeVector); + BytesRefVector strVector = strBlock.asVector(); + if (strVector == null) { + return eval(page.getPositionCount(), strBlock, numChunksBlock); } + IntVector numChunksVector = numChunksBlock.asVector(); + if (numChunksVector == null) { + return eval(page.getPositionCount(), strBlock, numChunksBlock); + } + return eval(page.getPositionCount(), strVector, numChunksVector); } } } @@ -78,12 +73,10 @@ public long baseRamBytesUsed() { long baseRamBytesUsed = BASE_RAM_BYTES_USED; baseRamBytesUsed += str.baseRamBytesUsed(); baseRamBytesUsed += numChunks.baseRamBytesUsed(); - baseRamBytesUsed += chunkSize.baseRamBytesUsed(); return baseRamBytesUsed; } - public BytesRefBlock eval(int positionCount, BytesRefBlock strBlock, IntBlock numChunksBlock, - IntBlock chunkSizeBlock) { + public BytesRefBlock eval(int positionCount, BytesRefBlock strBlock, IntBlock numChunksBlock) { try(BytesRefBlock.Builder result = driverContext.blockFactory().newBytesRefBlockBuilder(positionCount)) { BytesRef strScratch = new BytesRef(); position: for (int p = 0; p < positionCount; p++) { @@ -109,35 +102,22 @@ public BytesRefBlock eval(int positionCount, BytesRefBlock strBlock, IntBlock nu result.appendNull(); continue position; } - switch (chunkSizeBlock.getValueCount(p)) { - case 0: - result.appendNull(); - continue position; - case 1: - break; - default: - warnings().registerException(new IllegalArgumentException("single-value function encountered multi-value")); - result.appendNull(); - continue position; - } BytesRef str = strBlock.getBytesRef(strBlock.getFirstValueIndex(p), strScratch); int numChunks = numChunksBlock.getInt(numChunksBlock.getFirstValueIndex(p)); - int chunkSize = chunkSizeBlock.getInt(chunkSizeBlock.getFirstValueIndex(p)); - Chunk.process(result, str, numChunks, chunkSize); + Chunk.process(result, str, numChunks, this.chunkingSettings); } return result.build(); } } - public BytesRefBlock eval(int positionCount, BytesRefVector strVector, IntVector numChunksVector, - IntVector chunkSizeVector) { + public BytesRefBlock eval(int positionCount, BytesRefVector strVector, + IntVector numChunksVector) { try(BytesRefBlock.Builder result = driverContext.blockFactory().newBytesRefBlockBuilder(positionCount)) { BytesRef strScratch = new BytesRef(); position: for (int p = 0; p < positionCount; p++) { BytesRef str = strVector.getBytesRef(p, strScratch); int numChunks = numChunksVector.getInt(p); - int chunkSize = chunkSizeVector.getInt(p); - Chunk.process(result, str, numChunks, chunkSize); + Chunk.process(result, str, numChunks, this.chunkingSettings); } return result.build(); } @@ -145,12 +125,12 @@ public BytesRefBlock eval(int positionCount, BytesRefVector strVector, IntVector @Override public String toString() { - return "ChunkBytesRefEvaluator[" + "str=" + str + ", numChunks=" + numChunks + ", chunkSize=" + chunkSize + "]"; + return "ChunkBytesRefEvaluator[" + "str=" + str + ", numChunks=" + numChunks + ", chunkingSettings=" + chunkingSettings + "]"; } @Override public void close() { - Releasables.closeExpectNoException(str, numChunks, chunkSize); + Releasables.closeExpectNoException(str, numChunks); } private Warnings warnings() { @@ -172,25 +152,24 @@ static class Factory implements EvalOperator.ExpressionEvaluator.Factory { private final EvalOperator.ExpressionEvaluator.Factory numChunks; - private final EvalOperator.ExpressionEvaluator.Factory chunkSize; + private final ChunkingSettings chunkingSettings; public Factory(Source source, EvalOperator.ExpressionEvaluator.Factory str, - EvalOperator.ExpressionEvaluator.Factory numChunks, - EvalOperator.ExpressionEvaluator.Factory chunkSize) { + EvalOperator.ExpressionEvaluator.Factory numChunks, ChunkingSettings chunkingSettings) { this.source = source; this.str = str; this.numChunks = numChunks; - this.chunkSize = chunkSize; + this.chunkingSettings = chunkingSettings; } @Override public ChunkBytesRefEvaluator get(DriverContext context) { - return new ChunkBytesRefEvaluator(source, str.get(context), numChunks.get(context), chunkSize.get(context), context); + return new ChunkBytesRefEvaluator(source, str.get(context), numChunks.get(context), chunkingSettings, context); } @Override public String toString() { - return "ChunkBytesRefEvaluator[" + "str=" + str + ", numChunks=" + numChunks + ", chunkSize=" + chunkSize + "]"; + return "ChunkBytesRefEvaluator[" + "str=" + str + ", numChunks=" + numChunks + ", chunkingSettings=" + chunkingSettings + "]"; } } } diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java index 42ac33d01a093..cddcf9fb6f4c1 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java @@ -12,6 +12,7 @@ import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamOutput; import org.elasticsearch.compute.ann.Evaluator; +import org.elasticsearch.compute.ann.Fixed; import org.elasticsearch.compute.data.BytesRefBlock; import org.elasticsearch.compute.operator.EvalOperator; import org.elasticsearch.inference.ChunkingSettings; @@ -216,10 +217,9 @@ Expression options() { } @Evaluator(extraName = "BytesRef") - static void process(BytesRefBlock.Builder builder, BytesRef str, int numChunks, Map chunkingSettingsMap) { + static void process(BytesRefBlock.Builder builder, BytesRef str, int numChunks, @Fixed ChunkingSettings chunkingSettings) { String content = str.utf8ToString(); - ChunkingSettings chunkingSettings = ChunkingSettingsBuilder.fromMap(chunkingSettingsMap); List chunks = chunkText(content, chunkingSettings, numChunks); boolean multivalued = chunks.size() > 1; @@ -259,7 +259,7 @@ public int hashCode() { @Override public EvalOperator.ExpressionEvaluator.Factory toEvaluator(ToEvaluator toEvaluator) { int numChunks = DEFAULT_NUM_CHUNKS; - int chunkSize = DEFAULT_CHUNK_SIZE; + ChunkingSettings chunkingSettings = DEFAULT_CHUNKING_SETTINGS; if (options() != null) { MapExpression mapExpr = (MapExpression) options(); @@ -270,17 +270,9 @@ public EvalOperator.ExpressionEvaluator.Factory toEvaluator(ToEvaluator toEvalua if (NUM_CHUNKS.equals(optionName)) { numChunks = (Integer) ((Literal) entry.value()).value(); } else if (CHUNKING_SETTINGS.equals(optionName)) { - // For now, we'll just extract max_chunk_size from the nested map - // In the future, this should fully support ChunkingSettings - MapExpression chunkingSettingsExpr = (MapExpression) entry.value(); - for (EntryExpression csEntry : chunkingSettingsExpr.entryExpressions()) { - Object csKeyValue = ((Literal) csEntry.key()).value(); - String csKey = csKeyValue instanceof BytesRef br ? br.utf8ToString() : csKeyValue.toString(); - if ("max_chunk_size".equals(csKey)) { - chunkSize = (Integer) ((Literal) csEntry.value()).value(); - break; - } - } + // Convert the nested MapExpression to Map and build ChunkingSettings + Map chunkingSettingsMap = mapExpressionToMap((MapExpression) entry.value()); + chunkingSettings = ChunkingSettingsBuilder.fromMap(chunkingSettingsMap); } } } @@ -289,7 +281,23 @@ public EvalOperator.ExpressionEvaluator.Factory toEvaluator(ToEvaluator toEvalua source(), toEvaluator.apply(field), toEvaluator.apply(new Literal(source(), numChunks, DataType.INTEGER)), - toEvaluator.apply(new Literal(source(), chunkSize, DataType.INTEGER)) + chunkingSettings ); } + + private static Map mapExpressionToMap(MapExpression mapExpr) { + Map result = new java.util.HashMap<>(); + for (EntryExpression entry : mapExpr.entryExpressions()) { + Object keyValue = ((Literal) entry.key()).value(); + String key = keyValue instanceof BytesRef br ? br.utf8ToString() : keyValue.toString(); + + Object value = ((Literal) entry.value()).value(); + // Convert BytesRef to String for proper handling by ChunkingSettingsBuilder + if (value instanceof BytesRef br) { + value = br.utf8ToString(); + } + result.put(key, value); + } + return result; + } } From f1e2cf6ad716866c01c20bc5708fcc7670b466ea Mon Sep 17 00:00:00 2001 From: Kathleen DeRusso Date: Fri, 14 Nov 2025 16:11:54 -0500 Subject: [PATCH 05/15] iter --- .../expression/function/scalar/string/Chunk.java | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java index cddcf9fb6f4c1..a840827bafc55 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java @@ -130,14 +130,14 @@ protected TypeResolution resolveType() { return TypeResolution.TYPE_RESOLVED; } - // Custom validation for options since we need to handle nested MapExpression for chunking_settings + // TODO - Options#resolve should play nicely with nested MapExpressions, doing a hacky manual evaluation for now if (options instanceof MapExpression == false) { return new TypeResolution("second argument of [" + sourceText() + "] must be a map"); } MapExpression mapExpr = (MapExpression) options; for (EntryExpression entry : mapExpr.entryExpressions()) { - if (entry.key() instanceof Literal == false || ((Literal) entry.key()).foldable() == false) { + if (entry.key() instanceof Literal == false || entry.key().foldable() == false) { return new TypeResolution("option names must be constants in [" + sourceText() + "]"); } @@ -161,7 +161,6 @@ protected TypeResolution resolveType() { return new TypeResolution( "[" + CHUNKING_SETTINGS + "] must be a map, found [" + entry.value().getClass().getSimpleName() + "]"); } - // Validate the nested map has valid keys/values TypeResolution chunkingSettingsResolution = validateChunkingSettings((MapExpression) entry.value()); if (chunkingSettingsResolution.unresolved()) { return chunkingSettingsResolution; @@ -176,13 +175,12 @@ protected TypeResolution resolveType() { } private TypeResolution validateChunkingSettings(MapExpression chunkingSettingsMap) { - // Basic validation - just ensure all keys are literals and all values are literals - // The actual validation will be done by ChunkingSettingsBuilder.fromMap() at evaluation time + // Just ensure all keys and values are literals - defer valid chunking settings for validation later for (EntryExpression entry : chunkingSettingsMap.entryExpressions()) { - if (entry.key() instanceof Literal == false || ((Literal) entry.key()).foldable() == false) { + if (entry.key() instanceof Literal == false || (entry.key()).foldable() == false) { return new TypeResolution("chunking_settings keys must be constants"); } - if (entry.value() instanceof Literal == false || ((Literal) entry.value()).foldable() == false) { + if (entry.value() instanceof Literal == false || (entry.value()).foldable() == false) { return new TypeResolution("chunking_settings values must be constants"); } } From 67f78f340da887e0263de74d1b85663786833d21 Mon Sep 17 00:00:00 2001 From: Kathleen DeRusso Date: Fri, 14 Nov 2025 16:23:15 -0500 Subject: [PATCH 06/15] iter --- .../expression/function/scalar/string/Chunk.java | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java index a840827bafc55..afce6ef49e614 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java @@ -159,12 +159,9 @@ protected TypeResolution resolveType() { } else if (CHUNKING_SETTINGS.equals(optionName)) { if (entry.value() instanceof MapExpression == false) { return new TypeResolution( - "[" + CHUNKING_SETTINGS + "] must be a map, found [" + entry.value().getClass().getSimpleName() + "]"); - } - TypeResolution chunkingSettingsResolution = validateChunkingSettings((MapExpression) entry.value()); - if (chunkingSettingsResolution.unresolved()) { - return chunkingSettingsResolution; + "[" + CHUNKING_SETTINGS + "] must be a map, found [" + entry.value().dataType() + "]"); } + return validateChunkingSettings(entry.value()); } else { return new TypeResolution( "Invalid option [" + optionName + "], expected one of [" + String.join(", ", ALLOWED_OPTIONS.keySet()) + "]"); @@ -174,8 +171,10 @@ protected TypeResolution resolveType() { return TypeResolution.TYPE_RESOLVED; } - private TypeResolution validateChunkingSettings(MapExpression chunkingSettingsMap) { + private TypeResolution validateChunkingSettings(Expression chunkingSettings) { // Just ensure all keys and values are literals - defer valid chunking settings for validation later + assert chunkingSettings instanceof MapExpression; + MapExpression chunkingSettingsMap = (MapExpression) chunkingSettings; for (EntryExpression entry : chunkingSettingsMap.entryExpressions()) { if (entry.key() instanceof Literal == false || (entry.key()).foldable() == false) { return new TypeResolution("chunking_settings keys must be constants"); @@ -269,7 +268,7 @@ public EvalOperator.ExpressionEvaluator.Factory toEvaluator(ToEvaluator toEvalua numChunks = (Integer) ((Literal) entry.value()).value(); } else if (CHUNKING_SETTINGS.equals(optionName)) { // Convert the nested MapExpression to Map and build ChunkingSettings - Map chunkingSettingsMap = mapExpressionToMap((MapExpression) entry.value()); + Map chunkingSettingsMap = toMap((MapExpression) entry.value()); chunkingSettings = ChunkingSettingsBuilder.fromMap(chunkingSettingsMap); } } @@ -283,14 +282,13 @@ public EvalOperator.ExpressionEvaluator.Factory toEvaluator(ToEvaluator toEvalua ); } - private static Map mapExpressionToMap(MapExpression mapExpr) { + private static Map toMap(MapExpression mapExpr) { Map result = new java.util.HashMap<>(); for (EntryExpression entry : mapExpr.entryExpressions()) { Object keyValue = ((Literal) entry.key()).value(); String key = keyValue instanceof BytesRef br ? br.utf8ToString() : keyValue.toString(); Object value = ((Literal) entry.value()).value(); - // Convert BytesRef to String for proper handling by ChunkingSettingsBuilder if (value instanceof BytesRef br) { value = br.utf8ToString(); } From abbc7b3f8368ff0ba6a8356762c1b5bae6c025c0 Mon Sep 17 00:00:00 2001 From: Kathleen DeRusso Date: Fri, 14 Nov 2025 16:44:22 -0500 Subject: [PATCH 07/15] tests --- .../functions/functionNamedParams/chunk.md | 6 +- .../function/scalar/string/Chunk.java | 6 +- .../function/scalar/string/ChunkTests.java | 58 ++++++++++--------- 3 files changed, 36 insertions(+), 34 deletions(-) diff --git a/docs/reference/query-languages/esql/_snippets/functions/functionNamedParams/chunk.md b/docs/reference/query-languages/esql/_snippets/functions/functionNamedParams/chunk.md index 265551c8bee8a..96c46bf834b8c 100644 --- a/docs/reference/query-languages/esql/_snippets/functions/functionNamedParams/chunk.md +++ b/docs/reference/query-languages/esql/_snippets/functions/functionNamedParams/chunk.md @@ -2,9 +2,9 @@ **Supported function named parameters** +`chunking_settings` +: (object) The chunking settings with which to apply to the field. If no chunking settings are specified, defaults to sentence-based chunks of size 300 + `num_chunks` : (integer) The number of chunks to return. Defaults to return all chunks. -`chunk_size` -: (integer) The size of sentence-based chunks to use. Defaults to 300 - diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java index afce6ef49e614..bb03c9a2a4c53 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java @@ -126,10 +126,10 @@ protected TypeResolution resolveType() { return fieldResolution; } - if (options == null) { - return TypeResolution.TYPE_RESOLVED; - } + return options == null ? TypeResolution.TYPE_RESOLVED : validateOptions(); + } + private TypeResolution validateOptions() { // TODO - Options#resolve should play nicely with nested MapExpressions, doing a hacky manual evaluation for now if (options instanceof MapExpression == false) { return new TypeResolution("second argument of [" + sourceText() + "] must be a map"); diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkTests.java index ab6802d51a8a7..6bdd2358f6e5a 100644 --- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkTests.java +++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkTests.java @@ -73,9 +73,7 @@ public static Iterable parameters() { List.of(new TestCaseSupplier.TypedData(new BytesRef(text), DataType.KEYWORD, "str")), "ChunkBytesRefEvaluator[str=Attribute[channel=0], numChunks=LiteralsEvaluator[lit=" + Chunk.DEFAULT_NUM_CHUNKS - + "], chunkSize=LiteralsEvaluator[lit=" - + Chunk.DEFAULT_CHUNK_SIZE - + "]]", + + "], chunkingSettings={\"strategy\":\"sentence\",\"max_chunk_size\":300,\"sentence_overlap\":0}]", DataType.KEYWORD, equalTo(expectedResult) ); @@ -92,9 +90,7 @@ public static Iterable parameters() { List.of(new TestCaseSupplier.TypedData(new BytesRef(text), DataType.TEXT, "str")), "ChunkBytesRefEvaluator[str=Attribute[channel=0], numChunks=LiteralsEvaluator[lit=" + Chunk.DEFAULT_NUM_CHUNKS - + "], chunkSize=LiteralsEvaluator[lit=" - + Chunk.DEFAULT_CHUNK_SIZE - + "]]", + + "], chunkingSettings={\"strategy\":\"sentence\",\"max_chunk_size\":300,\"sentence_overlap\":0}]", DataType.KEYWORD, equalTo(expectedResult) ); @@ -102,30 +98,31 @@ public static Iterable parameters() { ); } - private static MapExpression createOptionsMap(Integer numChunks, ChunkingSettings chunkingSettings) { - List keyValuePairs = new ArrayList<>(); + private static MapExpression createOptions(Integer numChunks, ChunkingSettings chunkingSettings) { + List options = new ArrayList<>(); if (Objects.nonNull(numChunks)) { - keyValuePairs.add(Literal.keyword(Source.EMPTY, NUM_CHUNKS)); - keyValuePairs.add(new Literal(Source.EMPTY, numChunks, DataType.INTEGER)); + options.add(Literal.keyword(Source.EMPTY, NUM_CHUNKS)); + options.add(new Literal(Source.EMPTY, numChunks, DataType.INTEGER)); } if (Objects.nonNull(chunkingSettings)) { - keyValuePairs.add(Literal.keyword(Source.EMPTY, CHUNKING_SETTINGS)); - keyValuePairs.add(new Literal(Source.EMPTY, chunkingSettings.asMap(), DataType.INTEGER)); + options.add(Literal.keyword(Source.EMPTY, CHUNKING_SETTINGS)); + List chunkingSettingsMap = new ArrayList<>(); + chunkingSettings.asMap().forEach((key, value) -> { + chunkingSettingsMap.add(Literal.keyword(Source.EMPTY, key)); + chunkingSettingsMap.add(new Literal(Source.EMPTY, value, DataType.INTEGER)); + }); + options.add(new MapExpression(Source.EMPTY, chunkingSettingsMap)); } - return new MapExpression(Source.EMPTY, keyValuePairs); + return new MapExpression(Source.EMPTY, options); } @Override protected Expression build(Source source, List args) { // With MapParam, args contains: field, options_map Expression options = args.size() < 2 ? null : args.get(1); - // TODO needed? - if (options instanceof Literal lit && lit.value() == null) { - options = null; - } return new Chunk(source, args.get(0), options); } @@ -135,11 +132,11 @@ public void testDefaults() { } public void testDefaultNumChunks() { - int chunkSize = 20; - verifyChunks(null, chunkSize, 8); + ChunkingSettings chunkingSettings = new SentenceBoundaryChunkingSettings(20, 0); + verifyChunks(null, chunkingSettings, 8); } - public void testDefaultChunkSize() { + public void testDefaultChunkingSettings() { int numChunks = 1; // Default of 300 is huge, only one chunk returned in this case verifyChunks(numChunks, null, numChunks); } @@ -147,23 +144,28 @@ public void testDefaultChunkSize() { public void testSpecifiedOptions() { int numChunks = randomIntBetween(2, 4); int chunkSize = randomIntBetween(20, 30); - verifyChunks(numChunks, chunkSize, numChunks); + ChunkingSettings chunkingSettings = new SentenceBoundaryChunkingSettings(chunkSize, randomIntBetween(0,1)); + verifyChunks(numChunks, chunkingSettings, numChunks); } - private void verifyChunks(Integer numChunks, Integer chunkSize, int expectedNumChunksReturned) { + private void verifyChunks(Integer numChunks, ChunkingSettings chunkingSettings, int expectedNumChunksReturned) { int numChunksOrDefault = numChunks != null ? numChunks : Chunk.DEFAULT_NUM_CHUNKS; - int chunkSizeOrDefault = chunkSize != null ? chunkSize : Chunk.DEFAULT_CHUNK_SIZE; - ChunkingSettings settings = new SentenceBoundaryChunkingSettings(chunkSizeOrDefault, 0); - List expected = Chunk.chunkText(PARAGRAPH_INPUT, settings, numChunksOrDefault).stream().map(String::trim).toList(); - - List result = process(PARAGRAPH_INPUT, numChunksOrDefault, settings); + ChunkingSettings chunkingSettingsOrDefault = chunkingSettings != null + ? chunkingSettings + : Chunk.DEFAULT_CHUNKING_SETTINGS; + List expected = Chunk.chunkText(PARAGRAPH_INPUT, chunkingSettingsOrDefault, numChunksOrDefault) + .stream() + .map(String::trim) + .toList(); + + List result = process(PARAGRAPH_INPUT, numChunksOrDefault, chunkingSettingsOrDefault); assertThat(result.size(), equalTo(expectedNumChunksReturned)); assertThat(result, equalTo(expected)); } private List process(String str, Integer numChunks, ChunkingSettings chunkingSettings) { MapExpression optionsMap = (numChunks == null && chunkingSettings == null) ? null : - createOptionsMap(numChunks, chunkingSettings); + createOptions(numChunks, chunkingSettings); try ( EvalOperator.ExpressionEvaluator eval = evaluator(new Chunk(Source.EMPTY, field("str", DataType.KEYWORD), optionsMap)).get( From 180d086cac6056d5e39b0466e1f16816785d62e9 Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Fri, 14 Nov 2025 21:54:35 +0000 Subject: [PATCH 08/15] [CI] Auto commit changes from spotless --- .../esql/expression/function/scalar/string/Chunk.java | 11 ++++++----- .../expression/function/scalar/string/ChunkTests.java | 9 +++------ 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java index bb03c9a2a4c53..0a416e9e2faea 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java @@ -78,8 +78,9 @@ public Chunk( @MapParam.MapParamEntry( name = "chunking_settings", type = "object", - description = "The chunking settings with which to apply to the field. " + - "If no chunking settings are specified, defaults to sentence-based chunks of size " + DEFAULT_CHUNK_SIZE + description = "The chunking settings with which to apply to the field. " + + "If no chunking settings are specified, defaults to sentence-based chunks of size " + + DEFAULT_CHUNK_SIZE ), }, description = "Options to customize chunking behavior.", optional = true @@ -158,13 +159,13 @@ private TypeResolution validateOptions() { } } else if (CHUNKING_SETTINGS.equals(optionName)) { if (entry.value() instanceof MapExpression == false) { - return new TypeResolution( - "[" + CHUNKING_SETTINGS + "] must be a map, found [" + entry.value().dataType() + "]"); + return new TypeResolution("[" + CHUNKING_SETTINGS + "] must be a map, found [" + entry.value().dataType() + "]"); } return validateChunkingSettings(entry.value()); } else { return new TypeResolution( - "Invalid option [" + optionName + "], expected one of [" + String.join(", ", ALLOWED_OPTIONS.keySet()) + "]"); + "Invalid option [" + optionName + "], expected one of [" + String.join(", ", ALLOWED_OPTIONS.keySet()) + "]" + ); } } diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkTests.java index 6bdd2358f6e5a..74a9856a0fd89 100644 --- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkTests.java +++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkTests.java @@ -144,15 +144,13 @@ public void testDefaultChunkingSettings() { public void testSpecifiedOptions() { int numChunks = randomIntBetween(2, 4); int chunkSize = randomIntBetween(20, 30); - ChunkingSettings chunkingSettings = new SentenceBoundaryChunkingSettings(chunkSize, randomIntBetween(0,1)); + ChunkingSettings chunkingSettings = new SentenceBoundaryChunkingSettings(chunkSize, randomIntBetween(0, 1)); verifyChunks(numChunks, chunkingSettings, numChunks); } private void verifyChunks(Integer numChunks, ChunkingSettings chunkingSettings, int expectedNumChunksReturned) { int numChunksOrDefault = numChunks != null ? numChunks : Chunk.DEFAULT_NUM_CHUNKS; - ChunkingSettings chunkingSettingsOrDefault = chunkingSettings != null - ? chunkingSettings - : Chunk.DEFAULT_CHUNKING_SETTINGS; + ChunkingSettings chunkingSettingsOrDefault = chunkingSettings != null ? chunkingSettings : Chunk.DEFAULT_CHUNKING_SETTINGS; List expected = Chunk.chunkText(PARAGRAPH_INPUT, chunkingSettingsOrDefault, numChunksOrDefault) .stream() .map(String::trim) @@ -164,8 +162,7 @@ private void verifyChunks(Integer numChunks, ChunkingSettings chunkingSettings, } private List process(String str, Integer numChunks, ChunkingSettings chunkingSettings) { - MapExpression optionsMap = (numChunks == null && chunkingSettings == null) ? null : - createOptions(numChunks, chunkingSettings); + MapExpression optionsMap = (numChunks == null && chunkingSettings == null) ? null : createOptions(numChunks, chunkingSettings); try ( EvalOperator.ExpressionEvaluator eval = evaluator(new Chunk(Source.EMPTY, field("str", DataType.KEYWORD), optionsMap)).get( From cbf96323ce9f18a45f98e7fc0dda91399ced4c49 Mon Sep 17 00:00:00 2001 From: Kathleen DeRusso Date: Wed, 19 Nov 2025 16:05:48 -0500 Subject: [PATCH 09/15] Remove num chunks, change options to chunking settings map --- .../_snippets/functions/examples/chunk.md | 31 ++++- .../functions/functionNamedParams/chunk.md | 6 - .../_snippets/functions/parameters/chunk.md | 4 +- .../esql/_snippets/functions/types/chunk.md | 2 +- .../esql/images/functions/chunk.svg | 2 +- .../kibana/definition/functions/chunk.json | 3 +- .../esql/kibana/docs/functions/chunk.md | 2 +- .../src/main/resources/chunk.csv-spec | 107 ++++++++++----- .../scalar/string/ChunkBytesRefEvaluator.java | 58 ++------ .../function/scalar/string/Chunk.java | 126 ++++-------------- .../xpack/esql/analysis/VerifierTests.java | 30 +---- .../function/scalar/string/ChunkTests.java | 69 +++++----- 12 files changed, 180 insertions(+), 260 deletions(-) diff --git a/docs/reference/query-languages/esql/_snippets/functions/examples/chunk.md b/docs/reference/query-languages/esql/_snippets/functions/examples/chunk.md index 4f875b1214fab..ec291cb115e3f 100644 --- a/docs/reference/query-languages/esql/_snippets/functions/examples/chunk.md +++ b/docs/reference/query-languages/esql/_snippets/functions/examples/chunk.md @@ -1,6 +1,6 @@ % This is generated by ESQL's AbstractFunctionTestCase. Do not edit it. See ../README.md for how to regenerate it. -**Example** +**Examples** ```{applies_to} stack: preview 9.3.0 @@ -8,15 +8,32 @@ stack: preview 9.3.0 ```esql FROM books -| EVAL chunks = CHUNK(description, {"num_chunks":1, "chunk_size":20}) +| EVAL chunks = CHUNK(description) ``` | book_no:keyword | title:text | chunks:keyword | | --- | --- | --- | -| 1211 | The brothers Karamazov | In 1880 Dostoevsky completed The Brothers Karamazov, the literary effort for which he had been preparing all his life. | -| 1463 | Realms of Tolkien: Images of Middle-earth | Twenty new and familiar Tolkien artists are represented in this fabulous volume, breathing an extraordinary variety of life into 58 | -| 1502 | Selected Passages from Correspondence with Friends | Nikolai Gogol wrote some letters to his friends, none of which were a nose of high rank. | -| 1937 | The Best Short Stories of Dostoevsky (Modern Library) | This collection, unique to the Modern Library, gathers seven of Dostoevsky's key works and shows him to be equally adept | -| 1985 | Brothers Karamazov | Four brothers reunite in their hometown in Russia. | +| 1211 | The brothers Karamazov | In 1880 Dostoevsky completed The Brothers Karamazov, the literary effort for which he had been preparing all his life. Compelling, profound, complex, it is the story of a patricide and of the four sons who each had a motive for murder: Dmitry, the sensualist, Ivan, the intellectual, Alyosha, the mystic, and twisted, cunning Smerdyakov, the bastard child. Frequently lurid, nightmarish, always brilliant, the novel plunges the reader into a sordid love triangle, a pathological obsession, and a gripping courtroom drama. But throughout the whole, Dostoevsky searhes for the truth--about man, about life, about the existence of God. A terrifying answer to man's eternal questions, this monumental work remains the crowning achievement of perhaps the finest novelist of all time. From the Paperback edition. | +| 1463 | Realms of Tolkien: Images of Middle-earth | Twenty new and familiar Tolkien artists are represented in this fabulous volume, breathing an extraordinary variety of life into 58 different scenes, each of which is accompanied by appropriate passage from The Hobbit and The Lord of the Rings and The Silmarillion | +| 1502 | Selected Passages from Correspondence with Friends | Nikolai Gogol wrote some letters to his friends, none of which were a nose of high rank. Many are reproduced here (the letters, not noses). | +| 1937 | The Best Short Stories of Dostoevsky (Modern Library) | This collection, unique to the Modern Library, gathers seven of Dostoevsky's key works and shows him to be equally adept at the short story as with the novel. Exploring many of the same themes as in his longer works, these small masterpieces move from the tender and romantic White Nights, an archetypal nineteenth-century morality tale of pathos and loss, to the famous Notes from the Underground, a story of guilt, ineffectiveness, and uncompromising cynicism, and the first major work of existential literature. Among Dostoevsky's prototypical characters is Yemelyan in The Honest Thief, whose tragedy turns on an inability to resist crime. Presented in chronological order, in David Magarshack's celebrated translation, this is the definitive edition of Dostoevsky's best stories. | +| 1985 | Brothers Karamazov | Four brothers reunite in their hometown in Russia. The murder of their father forces the brothers to question their beliefs about each other, religion, and morality. | + +```{applies_to} +stack: preview 9.3.0 +``` + +```esql +FROM books +| EVAL chunks = CHUNK(description, {"strategy": "sentence", "max_chunk_size": 20, "sentence_overlap": 0}) +``` + +| book_no:keyword | title:text | chunks:keyword | +| --- | --- | --- | +| 1211 | The brothers Karamazov | [In 1880 Dostoevsky completed The Brothers Karamazov, the literary effort for which he had been preparing all his life., Compelling, profound, complex, it is the story of a patricide and of the four sons who each had a motive, of the four sons who each had a motive for murder: Dmitry, the sensualist, Ivan, the intellectual, Alyosha, the mystic, : Dmitry, the sensualist, Ivan, the intellectual, Alyosha, the mystic, and twisted, cunning Smerdyakov, the bastard child., Frequently lurid, nightmarish, always brilliant, the novel plunges the reader into a sordid love triangle, a pathological obsession, and a, a sordid love triangle, a pathological obsession, and a gripping courtroom drama., But throughout the whole, Dostoevsky searhes for the truth--about man, about life, about the existence of God., A terrifying answer to man's eternal questions, this monumental work remains the crowning achievement of perhaps the finest novelist of, the crowning achievement of perhaps the finest novelist of all time. From the Paperback edition.] | +| 1463 | Realms of Tolkien: Images of Middle-earth | [Twenty new and familiar Tolkien artists are represented in this fabulous volume, breathing an extraordinary variety of life into 58, volume, breathing an extraordinary variety of life into 58 different scenes, each of which is accompanied by appropriate passage from, , each of which is accompanied by appropriate passage from The Hobbit and The Lord of the Rings and The Silmarillion] | +| 1502 | Selected Passages from Correspondence with Friends | [Nikolai Gogol wrote some letters to his friends, none of which were a nose of high rank., Many are reproduced here (the letters, not noses).] | +| 1937 | The Best Short Stories of Dostoevsky (Modern Library) | [This collection, unique to the Modern Library, gathers seven of Dostoevsky's key works and shows him to be equally adept, key works and shows him to be equally adept at the short story as with the novel., Exploring many of the same themes as in his longer works, these small masterpieces move from the tender and romantic, , these small masterpieces move from the tender and romantic White Nights, an archetypal nineteenth-century morality tale of pathos and, , an archetypal nineteenth-century morality tale of pathos and loss, to the famous Notes from the Underground, a story of, the famous Notes from the Underground, a story of guilt, ineffectiveness, and uncompromising cynicism, and the first major work of, , and uncompromising cynicism, and the first major work of existential literature., Among Dostoevsky's prototypical characters is Yemelyan in The Honest Thief, whose tragedy turns on an inability to resist crime., Presented in chronological order, in David Magarshack's celebrated translation, this is the definitive edition of Dostoevsky's best stories.] | +| 1985 | Brothers Karamazov | [Four brothers reunite in their hometown in Russia., The murder of their father forces the brothers to question their beliefs about each other, religion, and morality.] | diff --git a/docs/reference/query-languages/esql/_snippets/functions/functionNamedParams/chunk.md b/docs/reference/query-languages/esql/_snippets/functions/functionNamedParams/chunk.md index 96c46bf834b8c..6a7405397851d 100644 --- a/docs/reference/query-languages/esql/_snippets/functions/functionNamedParams/chunk.md +++ b/docs/reference/query-languages/esql/_snippets/functions/functionNamedParams/chunk.md @@ -2,9 +2,3 @@ **Supported function named parameters** -`chunking_settings` -: (object) The chunking settings with which to apply to the field. If no chunking settings are specified, defaults to sentence-based chunks of size 300 - -`num_chunks` -: (integer) The number of chunks to return. Defaults to return all chunks. - diff --git a/docs/reference/query-languages/esql/_snippets/functions/parameters/chunk.md b/docs/reference/query-languages/esql/_snippets/functions/parameters/chunk.md index f287627d571ee..3c4d856262f1a 100644 --- a/docs/reference/query-languages/esql/_snippets/functions/parameters/chunk.md +++ b/docs/reference/query-languages/esql/_snippets/functions/parameters/chunk.md @@ -5,6 +5,6 @@ `field` : The input to chunk. -`options` -: Options to customize chunking behavior. +`chunking_settings` +: Options to customize chunking behavior. Refer to the [Inference API documentation](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put#operation-inference-put-body-application-json-chunking_settings) for valid values for `chunking_settings`. diff --git a/docs/reference/query-languages/esql/_snippets/functions/types/chunk.md b/docs/reference/query-languages/esql/_snippets/functions/types/chunk.md index 8ebe22b61286c..19f035575cf25 100644 --- a/docs/reference/query-languages/esql/_snippets/functions/types/chunk.md +++ b/docs/reference/query-languages/esql/_snippets/functions/types/chunk.md @@ -2,7 +2,7 @@ **Supported types** -| field | options | result | +| field | chunking_settings | result | | --- | --- | --- | | keyword | | keyword | | text | | keyword | diff --git a/docs/reference/query-languages/esql/images/functions/chunk.svg b/docs/reference/query-languages/esql/images/functions/chunk.svg index 56003f305a080..0031ba125d06c 100644 --- a/docs/reference/query-languages/esql/images/functions/chunk.svg +++ b/docs/reference/query-languages/esql/images/functions/chunk.svg @@ -1 +1 @@ -CHUNK(field,options) \ No newline at end of file +CHUNK(field,chunking_settings) \ No newline at end of file diff --git a/docs/reference/query-languages/esql/kibana/definition/functions/chunk.json b/docs/reference/query-languages/esql/kibana/definition/functions/chunk.json index 2be5b9665c320..9347b38f28575 100644 --- a/docs/reference/query-languages/esql/kibana/definition/functions/chunk.json +++ b/docs/reference/query-languages/esql/kibana/definition/functions/chunk.json @@ -30,7 +30,8 @@ } ], "examples" : [ - "FROM books\n| EVAL chunks = CHUNK(description, {\"num_chunks\":1, \"chunk_size\":20})" + "FROM books\n| EVAL chunks = CHUNK(description)", + "FROM books\n| EVAL chunks = CHUNK(description, {\"strategy\": \"sentence\", \"max_chunk_size\": 20, \"sentence_overlap\": 0})" ], "preview" : true, "snapshot_only" : true diff --git a/docs/reference/query-languages/esql/kibana/docs/functions/chunk.md b/docs/reference/query-languages/esql/kibana/docs/functions/chunk.md index 2af9e41799859..c5f426e32cdda 100644 --- a/docs/reference/query-languages/esql/kibana/docs/functions/chunk.md +++ b/docs/reference/query-languages/esql/kibana/docs/functions/chunk.md @@ -5,5 +5,5 @@ Use `CHUNK` to split a text field into smaller chunks. ```esql FROM books -| EVAL chunks = CHUNK(description, {"num_chunks":1, "chunk_size":20}) +| EVAL chunks = CHUNK(description) ``` diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/chunk.csv-spec b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/chunk.csv-spec index 1bf73acb2999b..39220912f42a9 100644 --- a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/chunk.csv-spec +++ b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/chunk.csv-spec @@ -2,12 +2,12 @@ # Tests for Chunk function # -chunkExample +chunkDefaults required_capability: chunk_function // tag::chunk-with-field[] FROM books -| EVAL chunks = CHUNK(description, {"num_chunks":1, "chunk_size":20}) +| EVAL chunks = CHUNK(description) // end::chunk-with-field[] | KEEP book_no, title, chunks | SORT book_no @@ -16,30 +16,34 @@ FROM books // tag::chunk-with-field-result[] book_no:keyword | title:text | chunks:keyword -1211 | The brothers Karamazov | In 1880 Dostoevsky completed The Brothers Karamazov, the literary effort for which he had been preparing all his life. -1463 | Realms of Tolkien: Images of Middle-earth | Twenty new and familiar Tolkien artists are represented in this fabulous volume, breathing an extraordinary variety of life into 58 -1502 | Selected Passages from Correspondence with Friends | Nikolai Gogol wrote some letters to his friends, none of which were a nose of high rank. -1937 | The Best Short Stories of Dostoevsky (Modern Library) | This collection, unique to the Modern Library, gathers seven of Dostoevsky's key works and shows him to be equally adept -1985 | Brothers Karamazov | Four brothers reunite in their hometown in Russia. +1211 | The brothers Karamazov | In 1880 Dostoevsky completed The Brothers Karamazov, the literary effort for which he had been preparing all his life. Compelling, profound, complex, it is the story of a patricide and of the four sons who each had a motive for murder: Dmitry, the sensualist, Ivan, the intellectual, Alyosha, the mystic, and twisted, cunning Smerdyakov, the bastard child. Frequently lurid, nightmarish, always brilliant, the novel plunges the reader into a sordid love triangle, a pathological obsession, and a gripping courtroom drama. But throughout the whole, Dostoevsky searhes for the truth--about man, about life, about the existence of God. A terrifying answer to man's eternal questions, this monumental work remains the crowning achievement of perhaps the finest novelist of all time. From the Paperback edition. +1463 | Realms of Tolkien: Images of Middle-earth | Twenty new and familiar Tolkien artists are represented in this fabulous volume, breathing an extraordinary variety of life into 58 different scenes, each of which is accompanied by appropriate passage from The Hobbit and The Lord of the Rings and The Silmarillion +1502 | Selected Passages from Correspondence with Friends | Nikolai Gogol wrote some letters to his friends, none of which were a nose of high rank. Many are reproduced here (the letters, not noses). +1937 | The Best Short Stories of Dostoevsky (Modern Library) | This collection, unique to the Modern Library, gathers seven of Dostoevsky's key works and shows him to be equally adept at the short story as with the novel. Exploring many of the same themes as in his longer works, these small masterpieces move from the tender and romantic White Nights, an archetypal nineteenth-century morality tale of pathos and loss, to the famous Notes from the Underground, a story of guilt, ineffectiveness, and uncompromising cynicism, and the first major work of existential literature. Among Dostoevsky's prototypical characters is Yemelyan in The Honest Thief, whose tragedy turns on an inability to resist crime. Presented in chronological order, in David Magarshack's celebrated translation, this is the definitive edition of Dostoevsky's best stories. +1985 | Brothers Karamazov | Four brothers reunite in their hometown in Russia. The murder of their father forces the brothers to question their beliefs about each other, religion, and morality. // end::chunk-with-field-result[] ; -chunkDefaults +chunkWithChunkingSettings required_capability: chunk_function +// tag::chunk-with-chunking-settings[] FROM books -| EVAL chunks = CHUNK(description) +| EVAL chunks = CHUNK(description, {"strategy": "sentence", "max_chunk_size": 20, "sentence_overlap": 0}) +// end::chunk-with-chunking-settings[] | KEEP book_no, title, chunks | SORT book_no | LIMIT 5 ; +// tag::chunk-with-chunking-settings-result[] book_no:keyword | title:text | chunks:keyword -1211 | The brothers Karamazov | In 1880 Dostoevsky completed The Brothers Karamazov, the literary effort for which he had been preparing all his life. Compelling, profound, complex, it is the story of a patricide and of the four sons who each had a motive for murder: Dmitry, the sensualist, Ivan, the intellectual, Alyosha, the mystic, and twisted, cunning Smerdyakov, the bastard child. Frequently lurid, nightmarish, always brilliant, the novel plunges the reader into a sordid love triangle, a pathological obsession, and a gripping courtroom drama. But throughout the whole, Dostoevsky searhes for the truth--about man, about life, about the existence of God. A terrifying answer to man's eternal questions, this monumental work remains the crowning achievement of perhaps the finest novelist of all time. From the Paperback edition. -1463 | Realms of Tolkien: Images of Middle-earth | Twenty new and familiar Tolkien artists are represented in this fabulous volume, breathing an extraordinary variety of life into 58 different scenes, each of which is accompanied by appropriate passage from The Hobbit and The Lord of the Rings and The Silmarillion -1502 | Selected Passages from Correspondence with Friends | Nikolai Gogol wrote some letters to his friends, none of which were a nose of high rank. Many are reproduced here (the letters, not noses). -1937 | The Best Short Stories of Dostoevsky (Modern Library) | This collection, unique to the Modern Library, gathers seven of Dostoevsky's key works and shows him to be equally adept at the short story as with the novel. Exploring many of the same themes as in his longer works, these small masterpieces move from the tender and romantic White Nights, an archetypal nineteenth-century morality tale of pathos and loss, to the famous Notes from the Underground, a story of guilt, ineffectiveness, and uncompromising cynicism, and the first major work of existential literature. Among Dostoevsky's prototypical characters is Yemelyan in The Honest Thief, whose tragedy turns on an inability to resist crime. Presented in chronological order, in David Magarshack's celebrated translation, this is the definitive edition of Dostoevsky's best stories. -1985 | Brothers Karamazov | Four brothers reunite in their hometown in Russia. The murder of their father forces the brothers to question their beliefs about each other, religion, and morality. +1211 | The brothers Karamazov | [In 1880 Dostoevsky completed The Brothers Karamazov, the literary effort for which he had been preparing all his life., Compelling, profound, complex, it is the story of a patricide and of the four sons who each had a motive, of the four sons who each had a motive for murder: Dmitry, the sensualist, Ivan, the intellectual, Alyosha, the mystic, : Dmitry, the sensualist, Ivan, the intellectual, Alyosha, the mystic, and twisted, cunning Smerdyakov, the bastard child., Frequently lurid, nightmarish, always brilliant, the novel plunges the reader into a sordid love triangle, a pathological obsession, and a, a sordid love triangle, a pathological obsession, and a gripping courtroom drama., But throughout the whole, Dostoevsky searhes for the truth--about man, about life, about the existence of God., A terrifying answer to man's eternal questions, this monumental work remains the crowning achievement of perhaps the finest novelist of, the crowning achievement of perhaps the finest novelist of all time. From the Paperback edition.] +1463 | Realms of Tolkien: Images of Middle-earth | [Twenty new and familiar Tolkien artists are represented in this fabulous volume, breathing an extraordinary variety of life into 58, volume, breathing an extraordinary variety of life into 58 different scenes, each of which is accompanied by appropriate passage from, , each of which is accompanied by appropriate passage from The Hobbit and The Lord of the Rings and The Silmarillion] +1502 | Selected Passages from Correspondence with Friends | [Nikolai Gogol wrote some letters to his friends, none of which were a nose of high rank., Many are reproduced here (the letters, not noses).] +1937 | The Best Short Stories of Dostoevsky (Modern Library) | [This collection, unique to the Modern Library, gathers seven of Dostoevsky's key works and shows him to be equally adept, key works and shows him to be equally adept at the short story as with the novel., Exploring many of the same themes as in his longer works, these small masterpieces move from the tender and romantic, , these small masterpieces move from the tender and romantic White Nights, an archetypal nineteenth-century morality tale of pathos and, , an archetypal nineteenth-century morality tale of pathos and loss, to the famous Notes from the Underground, a story of, the famous Notes from the Underground, a story of guilt, ineffectiveness, and uncompromising cynicism, and the first major work of, , and uncompromising cynicism, and the first major work of existential literature., Among Dostoevsky's prototypical characters is Yemelyan in The Honest Thief, whose tragedy turns on an inability to resist crime., Presented in chronological order, in David Magarshack's celebrated translation, this is the definitive edition of Dostoevsky's best stories.] +1985 | Brothers Karamazov | [Four brothers reunite in their hometown in Russia., The murder of their father forces the brothers to question their beliefs about each other, religion, and morality.] +// end::chunk-with-chunking-settings-result[] ; chunkTextWithMatch @@ -47,65 +51,102 @@ required_capability: chunk_function FROM books | WHERE MATCH(title, "Return") -| EVAL chunks = CHUNK(description, {"num_chunks":1, "chunk_size":20}) +| EVAL chunks = CHUNK(description, {"strategy": "sentence", "max_chunk_size": 20, "sentence_overlap": 0}) | KEEP book_no, title, chunks; ignoreOrder:true book_no:keyword | title:text | chunks:keyword -2714 | Return of the King Being the Third Part of The Lord of the Rings | Concluding the story begun in The Hobbit, this is the final part of Tolkien s epic masterpiece, The Lord of -7350 | Return of the Shadow | In this sixth volume of The History of Middle-earth the story reaches The Lord of the Rings. +2714 | Return of the King Being the Third Part of The Lord of the Rings | [Concluding the story begun in The Hobbit, this is the final part of Tolkien s epic masterpiece, The Lord of, part of Tolkien s epic masterpiece, The Lord of the Rings, featuring an exclusive cover image from the film, the, , featuring an exclusive cover image from the film, the definitive text, and a detailed map of Middle-earth., The armies of the Dark Lord Sauron are massing as his evil shadow spreads ever wider., Men, Dwarves, Elves and Ents unite forces to do battle agains the Dark., Meanwhile, Frodo and Sam struggle further into Mordor in their heroic quest to destroy the One Ring., The devastating conclusion of J.R.R., Tolkien s classic tale of magic and adventure, begun in The Fellowship of the Ring and The Two Towers, features, Fellowship of the Ring and The Two Towers, features the definitive edition of the text and includes the Appendices and, edition of the text and includes the Appendices and a revised Index in full., To celebrate the release of the first of Peter Jackson s two-part film adaptation of The Hobbit, THE HOBBIT, two-part film adaptation of The Hobbit, THE HOBBIT: AN UNEXPECTED JOURNEY, this third part of The Lord of the, JOURNEY, this third part of The Lord of the Rings is available for a limited time with an exclusive cover, available for a limited time with an exclusive cover image from Peter Jackson s award-winning trilogy.] +7350 | Return of the Shadow | [In this sixth volume of The History of Middle-earth the story reaches The Lord of the Rings., In The Return of the Shadow (an abandoned title for the first volume) Christopher Tolkien describes, with full citation of, first volume) Christopher Tolkien describes, with full citation of the earliest notes, outline plans, and narrative drafts, the intricate evolution, notes, outline plans, and narrative drafts, the intricate evolution of The Fellowship of the Ring and the gradual emergence of, Fellowship of the Ring and the gradual emergence of the conceptions that transformed what J.R.R., Tolkien for long believed would be a far shorter book, 'a sequel to The Hobbit'., The enlargement of Bilbo's 'magic ring' into the supremely potent and dangerous Ruling Ring of the Dark Lord is traced, dangerous Ruling Ring of the Dark Lord is traced and the precise moment is seen when, in an astonishing and, precise moment is seen when, in an astonishing and unforeseen leap in the earliest narrative, a Black Rider first rode, in the earliest narrative, a Black Rider first rode into the Shire, his significance still unknown., The character of the hobbit called Trotter (afterwards Strider or Aragorn) is developed while his indentity remains an absolute puzzle, ) is developed while his indentity remains an absolute puzzle, and the suspicion only very slowly becomes certainty that he must, suspicion only very slowly becomes certainty that he must after all be a Man., The hobbits, Frodo's companions, undergo intricate permutations of name and personality, and other major figures appear in strange modes: a, , and other major figures appear in strange modes: a sinister Treebeard, in league with the Enemy, a ferocious and malevolent, , in league with the Enemy, a ferocious and malevolent Farmer Maggot., The story in this book ends at the point where J.R.R., Tolkien halted in the story for a long time, as the Company of the Ring, still lacking Legolas and Gimli, Company of the Ring, still lacking Legolas and Gimli, stood before the tomb of Balin in the Mines of Moria., The Return of the Shadow is illustrated with reproductions of the first maps and notable pages from the earliest manuscripts.] ; -chunkTextWithMatchMultipleChunks +chunkTextWithMatchMultipleChunksMvExpand required_capability: chunk_function FROM books | WHERE MATCH(title, "Return") -| EVAL chunks = CHUNK(description, {"num_chunks":3, "chunk_size":20}) +| EVAL chunks = CHUNK(description, {"strategy": "sentence", "max_chunk_size": 20, "sentence_overlap": 0}) +| MV_EXPAND chunks | KEEP book_no, title, chunks; ignoreOrder:true book_no:keyword | title:text | chunks:keyword -2714 | Return of the King Being the Third Part of The Lord of the Rings | [Concluding the story begun in The Hobbit, this is the final part of Tolkien s epic masterpiece, The Lord of, part of Tolkien s epic masterpiece, The Lord of the Rings, featuring an exclusive cover image from the film, the, , featuring an exclusive cover image from the film, the definitive text, and a detailed map of Middle-earth.] -7350 | Return of the Shadow | [In this sixth volume of The History of Middle-earth the story reaches The Lord of the Rings., In The Return of the Shadow (an abandoned title for the first volume) Christopher Tolkien describes, with full citation of, first volume) Christopher Tolkien describes, with full citation of the earliest notes, outline plans, and narrative drafts, the intricate evolution] +2714 | Return of the King Being the Third Part of The Lord of the Rings | , featuring an exclusive cover image from the film, the definitive text, and a detailed map of Middle-earth. +2714 | Return of the King Being the Third Part of The Lord of the Rings | Concluding the story begun in The Hobbit, this is the final part of Tolkien s epic masterpiece, The Lord of +2714 | Return of the King Being the Third Part of The Lord of the Rings | Fellowship of the Ring and The Two Towers, features the definitive edition of the text and includes the Appendices and +2714 | Return of the King Being the Third Part of The Lord of the Rings | JOURNEY, this third part of The Lord of the Rings is available for a limited time with an exclusive cover +2714 | Return of the King Being the Third Part of The Lord of the Rings | Meanwhile, Frodo and Sam struggle further into Mordor in their heroic quest to destroy the One Ring. +2714 | Return of the King Being the Third Part of The Lord of the Rings | Men, Dwarves, Elves and Ents unite forces to do battle agains the Dark. +2714 | Return of the King Being the Third Part of The Lord of the Rings | The armies of the Dark Lord Sauron are massing as his evil shadow spreads ever wider. +2714 | Return of the King Being the Third Part of The Lord of the Rings | The devastating conclusion of J.R.R. +2714 | Return of the King Being the Third Part of The Lord of the Rings | To celebrate the release of the first of Peter Jackson s two-part film adaptation of The Hobbit, THE HOBBIT +2714 | Return of the King Being the Third Part of The Lord of the Rings | Tolkien s classic tale of magic and adventure, begun in The Fellowship of the Ring and The Two Towers, features +2714 | Return of the King Being the Third Part of The Lord of the Rings | available for a limited time with an exclusive cover image from Peter Jackson s award-winning trilogy. +2714 | Return of the King Being the Third Part of The Lord of the Rings | edition of the text and includes the Appendices and a revised Index in full. +2714 | Return of the King Being the Third Part of The Lord of the Rings | part of Tolkien s epic masterpiece, The Lord of the Rings, featuring an exclusive cover image from the film, the +2714 | Return of the King Being the Third Part of The Lord of the Rings | two-part film adaptation of The Hobbit, THE HOBBIT: AN UNEXPECTED JOURNEY, this third part of The Lord of the +7350 | Return of the Shadow | ) is developed while his indentity remains an absolute puzzle, and the suspicion only very slowly becomes certainty that he must +7350 | Return of the Shadow | , and other major figures appear in strange modes: a sinister Treebeard, in league with the Enemy, a ferocious and malevolent +7350 | Return of the Shadow | , in league with the Enemy, a ferocious and malevolent Farmer Maggot. +7350 | Return of the Shadow | Company of the Ring, still lacking Legolas and Gimli, stood before the tomb of Balin in the Mines of Moria. +7350 | Return of the Shadow | Fellowship of the Ring and the gradual emergence of the conceptions that transformed what J.R.R. +7350 | Return of the Shadow | In The Return of the Shadow (an abandoned title for the first volume) Christopher Tolkien describes, with full citation of +7350 | Return of the Shadow | In this sixth volume of The History of Middle-earth the story reaches The Lord of the Rings. +7350 | Return of the Shadow | The Return of the Shadow is illustrated with reproductions of the first maps and notable pages from the earliest manuscripts. +7350 | Return of the Shadow | The character of the hobbit called Trotter (afterwards Strider or Aragorn) is developed while his indentity remains an absolute puzzle +7350 | Return of the Shadow | The enlargement of Bilbo's 'magic ring' into the supremely potent and dangerous Ruling Ring of the Dark Lord is traced +7350 | Return of the Shadow | The hobbits, Frodo's companions, undergo intricate permutations of name and personality, and other major figures appear in strange modes: a +7350 | Return of the Shadow | The story in this book ends at the point where J.R.R. +7350 | Return of the Shadow | Tolkien for long believed would be a far shorter book, 'a sequel to The Hobbit'. +7350 | Return of the Shadow | Tolkien halted in the story for a long time, as the Company of the Ring, still lacking Legolas and Gimli +7350 | Return of the Shadow | dangerous Ruling Ring of the Dark Lord is traced and the precise moment is seen when, in an astonishing and +7350 | Return of the Shadow | first volume) Christopher Tolkien describes, with full citation of the earliest notes, outline plans, and narrative drafts, the intricate evolution +7350 | Return of the Shadow | in the earliest narrative, a Black Rider first rode into the Shire, his significance still unknown. +7350 | Return of the Shadow | notes, outline plans, and narrative drafts, the intricate evolution of The Fellowship of the Ring and the gradual emergence of +7350 | Return of the Shadow | precise moment is seen when, in an astonishing and unforeseen leap in the earliest narrative, a Black Rider first rode +7350 | Return of the Shadow | suspicion only very slowly becomes certainty that he must after all be a Man. ; -chunkTextWithMatchMultipleChunksMvExpand +chunkTextWithMatchMultipleChunksMvSliceMvExpand required_capability: chunk_function FROM books | WHERE MATCH(title, "Return") -| EVAL chunks = CHUNK(description, {"num_chunks":3, "chunk_size":20}) -| MV_EXPAND chunks -| KEEP book_no, title, chunks; +| EVAL chunks = CHUNK(description, {"strategy": "sentence", "max_chunk_size": 20, "sentence_overlap": 0}) +| EVAL truncated = MV_SLICE(chunks, 0, 3) +| MV_EXPAND truncated +| KEEP book_no, title, truncated; ignoreOrder:true -book_no:keyword | title:text | chunks:keyword +book_no:keyword | title:text | truncated:keyword 2714 | Return of the King Being the Third Part of The Lord of the Rings | , featuring an exclusive cover image from the film, the definitive text, and a detailed map of Middle-earth. 2714 | Return of the King Being the Third Part of The Lord of the Rings | Concluding the story begun in The Hobbit, this is the final part of Tolkien s epic masterpiece, The Lord of +2714 | Return of the King Being the Third Part of The Lord of the Rings | The armies of the Dark Lord Sauron are massing as his evil shadow spreads ever wider. 2714 | Return of the King Being the Third Part of The Lord of the Rings | part of Tolkien s epic masterpiece, The Lord of the Rings, featuring an exclusive cover image from the film, the 7350 | Return of the Shadow | In The Return of the Shadow (an abandoned title for the first volume) Christopher Tolkien describes, with full citation of 7350 | Return of the Shadow | In this sixth volume of The History of Middle-earth the story reaches The Lord of the Rings. 7350 | Return of the Shadow | first volume) Christopher Tolkien describes, with full citation of the earliest notes, outline plans, and narrative drafts, the intricate evolution +7350 | Return of the Shadow | notes, outline plans, and narrative drafts, the intricate evolution of The Fellowship of the Ring and the gradual emergence of ; + chunkTextWithConcatenatedField required_capability: chunk_function FROM books -| EVAL title_description = CONCAT(title, description) -| EVAL chunks = CHUNK(title_description, {"num_chunks":1, "chunk_size":20}) +| EVAL title_description = CONCAT(title, " ", description) +| EVAL chunks = CHUNK(title_description, {"strategy": "sentence", "max_chunk_size": 20, "sentence_overlap": 0}) | KEEP book_no, title, chunks | SORT book_no | LIMIT 5 ; book_no:keyword | title:text | chunks:keyword -1211 | The brothers Karamazov | The brothers KaramazovIn 1880 Dostoevsky completed The Brothers Karamazov, the literary effort for which he had been preparing all his -1463 | Realms of Tolkien: Images of Middle-earth | Realms of Tolkien: Images of Middle-earthTwenty new and familiar Tolkien artists are represented in this fabulous volume, breathing an -1502 | Selected Passages from Correspondence with Friends | Selected Passages from Correspondence with FriendsNikolai Gogol wrote some letters to his friends, none of which were a nose of -1937 | The Best Short Stories of Dostoevsky (Modern Library) | The Best Short Stories of Dostoevsky (Modern Library)This collection, unique to the Modern Library, gathers seven of Dostoevsky's key -1985 | Brothers Karamazov | Brothers KaramazovFour brothers reunite in their hometown in Russia. +1211 | The brothers Karamazov | [The brothers Karamazov In 1880 Dostoevsky completed The Brothers Karamazov, the literary effort for which he had been preparing all, literary effort for which he had been preparing all his life., Compelling, profound, complex, it is the story of a patricide and of the four sons who each had a motive, of the four sons who each had a motive for murder: Dmitry, the sensualist, Ivan, the intellectual, Alyosha, the mystic, : Dmitry, the sensualist, Ivan, the intellectual, Alyosha, the mystic, and twisted, cunning Smerdyakov, the bastard child., Frequently lurid, nightmarish, always brilliant, the novel plunges the reader into a sordid love triangle, a pathological obsession, and a, a sordid love triangle, a pathological obsession, and a gripping courtroom drama., But throughout the whole, Dostoevsky searhes for the truth--about man, about life, about the existence of God., A terrifying answer to man's eternal questions, this monumental work remains the crowning achievement of perhaps the finest novelist of, the crowning achievement of perhaps the finest novelist of all time. From the Paperback edition.] +1463 | Realms of Tolkien: Images of Middle-earth | [Realms of Tolkien: Images of Middle-earth Twenty new and familiar Tolkien artists are represented in this fabulous volume, breathing, Tolkien artists are represented in this fabulous volume, breathing an extraordinary variety of life into 58 different scenes, each of, variety of life into 58 different scenes, each of which is accompanied by appropriate passage from The Hobbit and The, accompanied by appropriate passage from The Hobbit and The Lord of the Rings and The Silmarillion] +1502 | Selected Passages from Correspondence with Friends | [Selected Passages from Correspondence with Friends Nikolai Gogol wrote some letters to his friends, none of which were a nose, to his friends, none of which were a nose of high rank. Many are reproduced here (the letters, not noses).] +1937 | The Best Short Stories of Dostoevsky (Modern Library) | [The Best Short Stories of Dostoevsky (Modern Library) This collection, unique to the Modern Library, gathers seven of Dostoevsky's key, to the Modern Library, gathers seven of Dostoevsky's key works and shows him to be equally adept at the short, shows him to be equally adept at the short story as with the novel., Exploring many of the same themes as in his longer works, these small masterpieces move from the tender and romantic, , these small masterpieces move from the tender and romantic White Nights, an archetypal nineteenth-century morality tale of pathos and, , an archetypal nineteenth-century morality tale of pathos and loss, to the famous Notes from the Underground, a story of, the famous Notes from the Underground, a story of guilt, ineffectiveness, and uncompromising cynicism, and the first major work of, , and uncompromising cynicism, and the first major work of existential literature., Among Dostoevsky's prototypical characters is Yemelyan in The Honest Thief, whose tragedy turns on an inability to resist crime., Presented in chronological order, in David Magarshack's celebrated translation, this is the definitive edition of Dostoevsky's best stories.] +1985 | Brothers Karamazov | [Brothers Karamazov Four brothers reunite in their hometown in Russia., The murder of their father forces the brothers to question their beliefs about each other, religion, and morality.] ; chunkTextWithMultivaluedField diff --git a/x-pack/plugin/esql/src/main/generated/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkBytesRefEvaluator.java b/x-pack/plugin/esql/src/main/generated/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkBytesRefEvaluator.java index 15f26616fc715..2d393f4008b2f 100644 --- a/x-pack/plugin/esql/src/main/generated/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkBytesRefEvaluator.java +++ b/x-pack/plugin/esql/src/main/generated/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkBytesRefEvaluator.java @@ -12,8 +12,6 @@ import org.elasticsearch.compute.data.Block; import org.elasticsearch.compute.data.BytesRefBlock; import org.elasticsearch.compute.data.BytesRefVector; -import org.elasticsearch.compute.data.IntBlock; -import org.elasticsearch.compute.data.IntVector; import org.elasticsearch.compute.data.Page; import org.elasticsearch.compute.operator.DriverContext; import org.elasticsearch.compute.operator.EvalOperator; @@ -33,8 +31,6 @@ public final class ChunkBytesRefEvaluator implements EvalOperator.ExpressionEval private final EvalOperator.ExpressionEvaluator str; - private final EvalOperator.ExpressionEvaluator numChunks; - private final ChunkingSettings chunkingSettings; private final DriverContext driverContext; @@ -42,11 +38,9 @@ public final class ChunkBytesRefEvaluator implements EvalOperator.ExpressionEval private Warnings warnings; public ChunkBytesRefEvaluator(Source source, EvalOperator.ExpressionEvaluator str, - EvalOperator.ExpressionEvaluator numChunks, ChunkingSettings chunkingSettings, - DriverContext driverContext) { + ChunkingSettings chunkingSettings, DriverContext driverContext) { this.source = source; this.str = str; - this.numChunks = numChunks; this.chunkingSettings = chunkingSettings; this.driverContext = driverContext; } @@ -54,17 +48,11 @@ public ChunkBytesRefEvaluator(Source source, EvalOperator.ExpressionEvaluator st @Override public Block eval(Page page) { try (BytesRefBlock strBlock = (BytesRefBlock) str.eval(page)) { - try (IntBlock numChunksBlock = (IntBlock) numChunks.eval(page)) { - BytesRefVector strVector = strBlock.asVector(); - if (strVector == null) { - return eval(page.getPositionCount(), strBlock, numChunksBlock); - } - IntVector numChunksVector = numChunksBlock.asVector(); - if (numChunksVector == null) { - return eval(page.getPositionCount(), strBlock, numChunksBlock); - } - return eval(page.getPositionCount(), strVector, numChunksVector); + BytesRefVector strVector = strBlock.asVector(); + if (strVector == null) { + return eval(page.getPositionCount(), strBlock); } + return eval(page.getPositionCount(), strVector); } } @@ -72,11 +60,10 @@ public Block eval(Page page) { public long baseRamBytesUsed() { long baseRamBytesUsed = BASE_RAM_BYTES_USED; baseRamBytesUsed += str.baseRamBytesUsed(); - baseRamBytesUsed += numChunks.baseRamBytesUsed(); return baseRamBytesUsed; } - public BytesRefBlock eval(int positionCount, BytesRefBlock strBlock, IntBlock numChunksBlock) { + public BytesRefBlock eval(int positionCount, BytesRefBlock strBlock) { try(BytesRefBlock.Builder result = driverContext.blockFactory().newBytesRefBlockBuilder(positionCount)) { BytesRef strScratch = new BytesRef(); position: for (int p = 0; p < positionCount; p++) { @@ -91,33 +78,19 @@ public BytesRefBlock eval(int positionCount, BytesRefBlock strBlock, IntBlock nu result.appendNull(); continue position; } - switch (numChunksBlock.getValueCount(p)) { - case 0: - result.appendNull(); - continue position; - case 1: - break; - default: - warnings().registerException(new IllegalArgumentException("single-value function encountered multi-value")); - result.appendNull(); - continue position; - } BytesRef str = strBlock.getBytesRef(strBlock.getFirstValueIndex(p), strScratch); - int numChunks = numChunksBlock.getInt(numChunksBlock.getFirstValueIndex(p)); - Chunk.process(result, str, numChunks, this.chunkingSettings); + Chunk.process(result, str, this.chunkingSettings); } return result.build(); } } - public BytesRefBlock eval(int positionCount, BytesRefVector strVector, - IntVector numChunksVector) { + public BytesRefBlock eval(int positionCount, BytesRefVector strVector) { try(BytesRefBlock.Builder result = driverContext.blockFactory().newBytesRefBlockBuilder(positionCount)) { BytesRef strScratch = new BytesRef(); position: for (int p = 0; p < positionCount; p++) { BytesRef str = strVector.getBytesRef(p, strScratch); - int numChunks = numChunksVector.getInt(p); - Chunk.process(result, str, numChunks, this.chunkingSettings); + Chunk.process(result, str, this.chunkingSettings); } return result.build(); } @@ -125,12 +98,12 @@ public BytesRefBlock eval(int positionCount, BytesRefVector strVector, @Override public String toString() { - return "ChunkBytesRefEvaluator[" + "str=" + str + ", numChunks=" + numChunks + ", chunkingSettings=" + chunkingSettings + "]"; + return "ChunkBytesRefEvaluator[" + "str=" + str + ", chunkingSettings=" + chunkingSettings + "]"; } @Override public void close() { - Releasables.closeExpectNoException(str, numChunks); + Releasables.closeExpectNoException(str); } private Warnings warnings() { @@ -150,26 +123,23 @@ static class Factory implements EvalOperator.ExpressionEvaluator.Factory { private final EvalOperator.ExpressionEvaluator.Factory str; - private final EvalOperator.ExpressionEvaluator.Factory numChunks; - private final ChunkingSettings chunkingSettings; public Factory(Source source, EvalOperator.ExpressionEvaluator.Factory str, - EvalOperator.ExpressionEvaluator.Factory numChunks, ChunkingSettings chunkingSettings) { + ChunkingSettings chunkingSettings) { this.source = source; this.str = str; - this.numChunks = numChunks; this.chunkingSettings = chunkingSettings; } @Override public ChunkBytesRefEvaluator get(DriverContext context) { - return new ChunkBytesRefEvaluator(source, str.get(context), numChunks.get(context), chunkingSettings, context); + return new ChunkBytesRefEvaluator(source, str.get(context), chunkingSettings, context); } @Override public String toString() { - return "ChunkBytesRefEvaluator[" + "str=" + str + ", numChunks=" + numChunks + ", chunkingSettings=" + chunkingSettings + "]"; + return "ChunkBytesRefEvaluator[" + "str=" + str + ", chunkingSettings=" + chunkingSettings + "]"; } } } diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java index 0a416e9e2faea..4993f93e02192 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java @@ -47,48 +47,32 @@ public class Chunk extends EsqlScalarFunction implements OptionalArgument { public static final NamedWriteableRegistry.Entry ENTRY = new NamedWriteableRegistry.Entry(Expression.class, "Chunk", Chunk::new); - public static final int DEFAULT_NUM_CHUNKS = Integer.MAX_VALUE; static final int DEFAULT_CHUNK_SIZE = 300; public static final ChunkingSettings DEFAULT_CHUNKING_SETTINGS = new SentenceBoundaryChunkingSettings(DEFAULT_CHUNK_SIZE, 0); - private final Expression field, options; - - static final String NUM_CHUNKS = "num_chunks"; - static final String CHUNKING_SETTINGS = "chunking_settings"; - - public static final Map ALLOWED_OPTIONS = Map.of(NUM_CHUNKS, DataType.INTEGER, CHUNKING_SETTINGS, DataType.OBJECT); + private final Expression field, chunkingSettings; @FunctionInfo(returnType = "keyword", preview = true, description = """ Use `CHUNK` to split a text field into smaller chunks.""", detailedDescription = """ Chunk can be used on fields from the text famiy like <> and <>. Chunk will split a text field into smaller chunks, using a sentence-based chunking strategy. The number of chunks returned, and the length of the sentences used to create the chunks can be specified. - """, examples = { @Example(file = "chunk", tag = "chunk-with-field", applies_to = "stack: preview 9.3.0") }) + """, examples = { @Example(file = "chunk", tag = "chunk-with-field", applies_to = "stack: preview 9.3.0"), + @Example(file = "chunk", tag = "chunk-with-chunking-settings", applies_to = "stack: preview 9.3.0") }) public Chunk( Source source, @Param(name = "field", type = { "keyword", "text" }, description = "The input to chunk.") Expression field, @MapParam( - name = "options", - params = { - @MapParam.MapParamEntry( - name = "num_chunks", - type = "integer", - description = "The number of chunks to return. Defaults to return all chunks." - ), - @MapParam.MapParamEntry( - name = "chunking_settings", - type = "object", - description = "The chunking settings with which to apply to the field. " - + "If no chunking settings are specified, defaults to sentence-based chunks of size " - + DEFAULT_CHUNK_SIZE - ), }, - description = "Options to customize chunking behavior.", + name = "chunking_settings", + description = "Options to customize chunking behavior. Refer to the " + + "[Inference API documentation](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put" + + "#operation-inference-put-body-application-json-chunking_settings) for valid values for `chunking_settings`.", optional = true - ) Expression options + ) Expression chunkingSettings ) { - super(source, options == null ? List.of(field) : List.of(field, options)); + super(source, chunkingSettings == null ? List.of(field) : List.of(field, chunkingSettings)); this.field = field; - this.options = options; + this.chunkingSettings = chunkingSettings; } public Chunk(StreamInput in) throws IOException { @@ -103,7 +87,7 @@ public Chunk(StreamInput in) throws IOException { public void writeTo(StreamOutput out) throws IOException { source().writeTo(out); out.writeNamedWriteable(field); - out.writeOptionalNamedWriteable(options); + out.writeOptionalNamedWriteable(chunkingSettings); } @Override @@ -127,54 +111,14 @@ protected TypeResolution resolveType() { return fieldResolution; } - return options == null ? TypeResolution.TYPE_RESOLVED : validateOptions(); - } - - private TypeResolution validateOptions() { - // TODO - Options#resolve should play nicely with nested MapExpressions, doing a hacky manual evaluation for now - if (options instanceof MapExpression == false) { - return new TypeResolution("second argument of [" + sourceText() + "] must be a map"); - } - - MapExpression mapExpr = (MapExpression) options; - for (EntryExpression entry : mapExpr.entryExpressions()) { - if (entry.key() instanceof Literal == false || entry.key().foldable() == false) { - return new TypeResolution("option names must be constants in [" + sourceText() + "]"); - } - - Object keyValue = ((Literal) entry.key()).value(); - String optionName = keyValue instanceof BytesRef br ? br.utf8ToString() : keyValue.toString(); - - if (NUM_CHUNKS.equals(optionName)) { - if (entry.value() instanceof Literal == false) { - return new TypeResolution("[" + NUM_CHUNKS + "] must be a constant"); - } - Literal value = (Literal) entry.value(); - if (value.dataType() != DataType.INTEGER) { - return new TypeResolution("[" + NUM_CHUNKS + "] must be an integer, found [" + value.dataType() + "]"); - } - Integer numChunks = (Integer) value.value(); - if (numChunks != null && numChunks < 0) { - return new TypeResolution("[" + NUM_CHUNKS + "] cannot be negative, found [" + numChunks + "]"); - } - } else if (CHUNKING_SETTINGS.equals(optionName)) { - if (entry.value() instanceof MapExpression == false) { - return new TypeResolution("[" + CHUNKING_SETTINGS + "] must be a map, found [" + entry.value().dataType() + "]"); - } - return validateChunkingSettings(entry.value()); - } else { - return new TypeResolution( - "Invalid option [" + optionName + "], expected one of [" + String.join(", ", ALLOWED_OPTIONS.keySet()) + "]" - ); - } - } - - return TypeResolution.TYPE_RESOLVED; + return chunkingSettings == null ? TypeResolution.TYPE_RESOLVED : validateChunkingSettings(); } - private TypeResolution validateChunkingSettings(Expression chunkingSettings) { + private TypeResolution validateChunkingSettings() { // Just ensure all keys and values are literals - defer valid chunking settings for validation later - assert chunkingSettings instanceof MapExpression; + if (chunkingSettings instanceof MapExpression == false) { + return new TypeResolution("chunking_settings must be a map"); + } MapExpression chunkingSettingsMap = (MapExpression) chunkingSettings; for (EntryExpression entry : chunkingSettingsMap.entryExpressions()) { if (entry.key() instanceof Literal == false || (entry.key()).foldable() == false) { @@ -189,7 +133,7 @@ private TypeResolution validateChunkingSettings(Expression chunkingSettings) { @Override public boolean foldable() { - return field().foldable() && (options() == null || options().foldable()); + return field().foldable() && (chunkingSettings() == null || chunkingSettings().foldable()); } @Override @@ -203,22 +147,22 @@ public Expression replaceChildren(List newChildren) { @Override protected NodeInfo info() { - return NodeInfo.create(this, Chunk::new, field, options); + return NodeInfo.create(this, Chunk::new, field, chunkingSettings); } Expression field() { return field; } - Expression options() { - return options; + Expression chunkingSettings() { + return chunkingSettings; } @Evaluator(extraName = "BytesRef") - static void process(BytesRefBlock.Builder builder, BytesRef str, int numChunks, @Fixed ChunkingSettings chunkingSettings) { + static void process(BytesRefBlock.Builder builder, BytesRef str, @Fixed ChunkingSettings chunkingSettings) { String content = str.utf8ToString(); - List chunks = chunkText(content, chunkingSettings, numChunks); + List chunks = chunkText(content, chunkingSettings); boolean multivalued = chunks.size() > 1; if (multivalued) { @@ -233,12 +177,11 @@ static void process(BytesRefBlock.Builder builder, BytesRef str, int numChunks, } } - public static List chunkText(String content, ChunkingSettings chunkingSettings, int numChunks) { + public static List chunkText(String content, ChunkingSettings chunkingSettings) { Chunker chunker = ChunkerBuilder.fromChunkingStrategy(chunkingSettings.getChunkingStrategy()); return chunker.chunk(content, chunkingSettings) .stream() .map(offset -> content.substring(offset.start(), offset.end())) - .limit(numChunks > 0 ? numChunks : DEFAULT_NUM_CHUNKS) .toList(); } @@ -246,39 +189,26 @@ public static List chunkText(String content, ChunkingSettings chunkingSe public boolean equals(Object o) { if (o == null || getClass() != o.getClass()) return false; Chunk chunk = (Chunk) o; - return Objects.equals(field(), chunk.field()) && Objects.equals(options(), chunk.options()); + return Objects.equals(field(), chunk.field()) && Objects.equals(chunkingSettings(), chunk.chunkingSettings()); } @Override public int hashCode() { - return Objects.hash(field(), options()); + return Objects.hash(field(), chunkingSettings()); } @Override public EvalOperator.ExpressionEvaluator.Factory toEvaluator(ToEvaluator toEvaluator) { - int numChunks = DEFAULT_NUM_CHUNKS; ChunkingSettings chunkingSettings = DEFAULT_CHUNKING_SETTINGS; - if (options() != null) { - MapExpression mapExpr = (MapExpression) options(); - for (EntryExpression entry : mapExpr.entryExpressions()) { - Object keyValue = ((Literal) entry.key()).value(); - String optionName = keyValue instanceof BytesRef br ? br.utf8ToString() : keyValue.toString(); - - if (NUM_CHUNKS.equals(optionName)) { - numChunks = (Integer) ((Literal) entry.value()).value(); - } else if (CHUNKING_SETTINGS.equals(optionName)) { - // Convert the nested MapExpression to Map and build ChunkingSettings - Map chunkingSettingsMap = toMap((MapExpression) entry.value()); - chunkingSettings = ChunkingSettingsBuilder.fromMap(chunkingSettingsMap); - } - } + if (chunkingSettings() != null) { + Map chunkingSettingsMap = toMap((MapExpression) chunkingSettings()); + chunkingSettings = ChunkingSettingsBuilder.fromMap(chunkingSettingsMap); } return new ChunkBytesRefEvaluator.Factory( source(), toEvaluator.apply(field), - toEvaluator.apply(new Literal(source(), numChunks, DataType.INTEGER)), chunkingSettings ); } diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java index 1de00620b898b..923ad07c418b2 100644 --- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java +++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java @@ -3322,35 +3322,11 @@ public void testChunkFunctionInvalidInputs() { if (EsqlCapabilities.Cap.CHUNK_FUNCTION.isEnabled()) { assertThat( error( - "from test | EVAL chunks = CHUNK(body, {\"num_chunks\": null, \"chunk_size\": 20})", + "from test | EVAL chunks = CHUNK(body, null)", fullTextAnalyzer, - ParsingException.class + VerificationException.class ), - equalTo("1:39: Invalid named parameter [\"num_chunks\":null], NULL is not supported") - ); - assertThat( - error( - "from test | EVAL chunks = CHUNK(body, {\"num_chunks\": 3, \"chunk_size\": null})", - fullTextAnalyzer, - ParsingException.class - ), - equalTo("1:39: Invalid named parameter [\"chunk_size\":null], NULL is not supported") - ); - assertThat( - error("from test | EVAL chunks = CHUNK(body, {\"num_chunks\":\"foo\"})", fullTextAnalyzer), - equalTo("1:27: Invalid option [num_chunks] in [CHUNK(body, {\"num_chunks\":\"foo\"})], cannot cast [foo] to [integer]") - ); - assertThat( - error("from test | EVAL chunks = CHUNK(body, {\"chunk_size\":\"foo\"})", fullTextAnalyzer), - equalTo("1:27: Invalid option [chunk_size] in [CHUNK(body, {\"chunk_size\":\"foo\"})], cannot cast [foo] to [integer]") - ); - assertThat( - error("from test | EVAL chunks = CHUNK(body, {\"num_chunks\":-1})", fullTextAnalyzer), - equalTo("1:27: [num_chunks] cannot be negative, found [-1]") - ); - assertThat( - error("from test | EVAL chunks = CHUNK(body, {\"chunk_size\":-1})", fullTextAnalyzer), - equalTo("1:27: [chunk_size] cannot be negative, found [-1]") + equalTo("1:27: chunking_settings must be a map") ); } } diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkTests.java index 74a9856a0fd89..c0599e49237d9 100644 --- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkTests.java +++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkTests.java @@ -31,8 +31,7 @@ import java.util.stream.IntStream; import static org.elasticsearch.compute.data.BlockUtils.toJavaObject; -import static org.elasticsearch.xpack.esql.expression.function.scalar.string.Chunk.CHUNKING_SETTINGS; -import static org.elasticsearch.xpack.esql.expression.function.scalar.string.Chunk.NUM_CHUNKS; +import static org.elasticsearch.xpack.core.inference.chunking.ChunkingSettingsTests.createRandomChunkingSettings; import static org.hamcrest.Matchers.equalTo; public class ChunkTests extends AbstractScalarFunctionTestCase { @@ -64,16 +63,15 @@ public static Iterable parameters() { String text = randomWordsBetween(25, 50); ChunkingSettings chunkingSettings = new SentenceBoundaryChunkingSettings(Chunk.DEFAULT_CHUNK_SIZE, 0); - List chunks = Chunk.chunkText(text, chunkingSettings, Chunk.DEFAULT_NUM_CHUNKS); + List chunks = Chunk.chunkText(text, chunkingSettings); Object expectedResult = chunks.size() == 1 ? new BytesRef(chunks.get(0).trim()) : chunks.stream().map(s -> new BytesRef(s.trim())).toList(); return new TestCaseSupplier.TestCase( List.of(new TestCaseSupplier.TypedData(new BytesRef(text), DataType.KEYWORD, "str")), - "ChunkBytesRefEvaluator[str=Attribute[channel=0], numChunks=LiteralsEvaluator[lit=" - + Chunk.DEFAULT_NUM_CHUNKS - + "], chunkingSettings={\"strategy\":\"sentence\",\"max_chunk_size\":300,\"sentence_overlap\":0}]", + "ChunkBytesRefEvaluator[str=Attribute[channel=0], " + + "chunkingSettings={\"strategy\":\"sentence\",\"max_chunk_size\":300,\"sentence_overlap\":0}]", DataType.KEYWORD, equalTo(expectedResult) ); @@ -81,16 +79,15 @@ public static Iterable parameters() { String text = randomWordsBetween(25, 50); ChunkingSettings chunkingSettings = new SentenceBoundaryChunkingSettings(Chunk.DEFAULT_CHUNK_SIZE, 0); - List chunks = Chunk.chunkText(text, chunkingSettings, Chunk.DEFAULT_NUM_CHUNKS); + List chunks = Chunk.chunkText(text, chunkingSettings); Object expectedResult = chunks.size() == 1 ? new BytesRef(chunks.get(0).trim()) : chunks.stream().map(s -> new BytesRef(s.trim())).toList(); return new TestCaseSupplier.TestCase( List.of(new TestCaseSupplier.TypedData(new BytesRef(text), DataType.TEXT, "str")), - "ChunkBytesRefEvaluator[str=Attribute[channel=0], numChunks=LiteralsEvaluator[lit=" - + Chunk.DEFAULT_NUM_CHUNKS - + "], chunkingSettings={\"strategy\":\"sentence\",\"max_chunk_size\":300,\"sentence_overlap\":0}]", + "ChunkBytesRefEvaluator[str=Attribute[channel=0], " + + "chunkingSettings={\"strategy\":\"sentence\",\"max_chunk_size\":300,\"sentence_overlap\":0}]", DataType.KEYWORD, equalTo(expectedResult) ); @@ -98,25 +95,17 @@ public static Iterable parameters() { ); } - private static MapExpression createOptions(Integer numChunks, ChunkingSettings chunkingSettings) { - List options = new ArrayList<>(); - - if (Objects.nonNull(numChunks)) { - options.add(Literal.keyword(Source.EMPTY, NUM_CHUNKS)); - options.add(new Literal(Source.EMPTY, numChunks, DataType.INTEGER)); - } + private static MapExpression createChunkingSettings(ChunkingSettings chunkingSettings) { + List chunkingSettingsMap = new ArrayList<>(); if (Objects.nonNull(chunkingSettings)) { - options.add(Literal.keyword(Source.EMPTY, CHUNKING_SETTINGS)); - List chunkingSettingsMap = new ArrayList<>(); chunkingSettings.asMap().forEach((key, value) -> { chunkingSettingsMap.add(Literal.keyword(Source.EMPTY, key)); chunkingSettingsMap.add(new Literal(Source.EMPTY, value, DataType.INTEGER)); }); - options.add(new MapExpression(Source.EMPTY, chunkingSettingsMap)); } - return new MapExpression(Source.EMPTY, options); + return new MapExpression(Source.EMPTY, chunkingSettingsMap); } @Override @@ -128,41 +117,43 @@ protected Expression build(Source source, List args) { public void testDefaults() { // Default of 300 is huge, only one chunk returned in this case - verifyChunks(null, null, 1); + verifyChunks(null, 1); } - public void testDefaultNumChunks() { - ChunkingSettings chunkingSettings = new SentenceBoundaryChunkingSettings(20, 0); - verifyChunks(null, chunkingSettings, 8); + public void testDefaultChunkingSettings() { + verifyChunks(null, 1); } - public void testDefaultChunkingSettings() { - int numChunks = 1; // Default of 300 is huge, only one chunk returned in this case - verifyChunks(numChunks, null, numChunks); + public void testSpecifiedChunkingSettings() { + // We can't randomize here, because we're testing on specifically specified chunk size that's variable. + int chunkSize = 25; + int expectedNumChunks = 6; + ChunkingSettings chunkingSettings = new SentenceBoundaryChunkingSettings(chunkSize, 0); + verifyChunks(chunkingSettings, expectedNumChunks); } - public void testSpecifiedOptions() { - int numChunks = randomIntBetween(2, 4); - int chunkSize = randomIntBetween(20, 30); - ChunkingSettings chunkingSettings = new SentenceBoundaryChunkingSettings(chunkSize, randomIntBetween(0, 1)); - verifyChunks(numChunks, chunkingSettings, numChunks); + public void testRandomChunkingSettings() { + ChunkingSettings chunkingSettings = createRandomChunkingSettings(); + List result = process(PARAGRAPH_INPUT, chunkingSettings); + assertNotNull(result); + assertFalse(result.isEmpty()); + // Actual results depend on chunking settings passed in } - private void verifyChunks(Integer numChunks, ChunkingSettings chunkingSettings, int expectedNumChunksReturned) { - int numChunksOrDefault = numChunks != null ? numChunks : Chunk.DEFAULT_NUM_CHUNKS; + private void verifyChunks(ChunkingSettings chunkingSettings, int expectedNumChunksReturned) { ChunkingSettings chunkingSettingsOrDefault = chunkingSettings != null ? chunkingSettings : Chunk.DEFAULT_CHUNKING_SETTINGS; - List expected = Chunk.chunkText(PARAGRAPH_INPUT, chunkingSettingsOrDefault, numChunksOrDefault) + List expected = Chunk.chunkText(PARAGRAPH_INPUT, chunkingSettingsOrDefault) .stream() .map(String::trim) .toList(); - List result = process(PARAGRAPH_INPUT, numChunksOrDefault, chunkingSettingsOrDefault); + List result = process(PARAGRAPH_INPUT, chunkingSettingsOrDefault); assertThat(result.size(), equalTo(expectedNumChunksReturned)); assertThat(result, equalTo(expected)); } - private List process(String str, Integer numChunks, ChunkingSettings chunkingSettings) { - MapExpression optionsMap = (numChunks == null && chunkingSettings == null) ? null : createOptions(numChunks, chunkingSettings); + private List process(String str,ChunkingSettings chunkingSettings) { + MapExpression optionsMap = chunkingSettings == null ? null : createChunkingSettings(chunkingSettings); try ( EvalOperator.ExpressionEvaluator eval = evaluator(new Chunk(Source.EMPTY, field("str", DataType.KEYWORD), optionsMap)).get( From c332086ae334521f97981d3af273c8ebe8e836fb Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Wed, 19 Nov 2025 21:13:37 +0000 Subject: [PATCH 10/15] [CI] Auto commit changes from spotless --- .../function/scalar/string/Chunk.java | 40 +++++++++---------- .../xpack/esql/analysis/VerifierTests.java | 6 +-- .../function/scalar/string/ChunkTests.java | 9 ++--- 3 files changed, 24 insertions(+), 31 deletions(-) diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java index 4993f93e02192..1a0b8a5f75f66 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java @@ -52,21 +52,28 @@ public class Chunk extends EsqlScalarFunction implements OptionalArgument { private final Expression field, chunkingSettings; - @FunctionInfo(returnType = "keyword", preview = true, description = """ - Use `CHUNK` to split a text field into smaller chunks.""", detailedDescription = """ - Chunk can be used on fields from the text famiy like <> and <>. - Chunk will split a text field into smaller chunks, using a sentence-based chunking strategy. - The number of chunks returned, and the length of the sentences used to create the chunks can be specified. - """, examples = { @Example(file = "chunk", tag = "chunk-with-field", applies_to = "stack: preview 9.3.0"), - @Example(file = "chunk", tag = "chunk-with-chunking-settings", applies_to = "stack: preview 9.3.0") }) + @FunctionInfo( + returnType = "keyword", + preview = true, + description = """ + Use `CHUNK` to split a text field into smaller chunks.""", + detailedDescription = """ + Chunk can be used on fields from the text famiy like <> and <>. + Chunk will split a text field into smaller chunks, using a sentence-based chunking strategy. + The number of chunks returned, and the length of the sentences used to create the chunks can be specified. + """, + examples = { + @Example(file = "chunk", tag = "chunk-with-field", applies_to = "stack: preview 9.3.0"), + @Example(file = "chunk", tag = "chunk-with-chunking-settings", applies_to = "stack: preview 9.3.0") } + ) public Chunk( Source source, @Param(name = "field", type = { "keyword", "text" }, description = "The input to chunk.") Expression field, @MapParam( name = "chunking_settings", - description = "Options to customize chunking behavior. Refer to the " + - "[Inference API documentation](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put" + - "#operation-inference-put-body-application-json-chunking_settings) for valid values for `chunking_settings`.", + description = "Options to customize chunking behavior. Refer to the " + + "[Inference API documentation](https://www.elastic.co/docs/api/doc/elasticsearch/operation/operation-inference-put" + + "#operation-inference-put-body-application-json-chunking_settings) for valid values for `chunking_settings`.", optional = true ) Expression chunkingSettings ) { @@ -116,7 +123,7 @@ protected TypeResolution resolveType() { private TypeResolution validateChunkingSettings() { // Just ensure all keys and values are literals - defer valid chunking settings for validation later - if (chunkingSettings instanceof MapExpression == false) { + if (chunkingSettings instanceof MapExpression == false) { return new TypeResolution("chunking_settings must be a map"); } MapExpression chunkingSettingsMap = (MapExpression) chunkingSettings; @@ -179,10 +186,7 @@ static void process(BytesRefBlock.Builder builder, BytesRef str, @Fixed Chunking public static List chunkText(String content, ChunkingSettings chunkingSettings) { Chunker chunker = ChunkerBuilder.fromChunkingStrategy(chunkingSettings.getChunkingStrategy()); - return chunker.chunk(content, chunkingSettings) - .stream() - .map(offset -> content.substring(offset.start(), offset.end())) - .toList(); + return chunker.chunk(content, chunkingSettings).stream().map(offset -> content.substring(offset.start(), offset.end())).toList(); } @Override @@ -206,11 +210,7 @@ public EvalOperator.ExpressionEvaluator.Factory toEvaluator(ToEvaluator toEvalua chunkingSettings = ChunkingSettingsBuilder.fromMap(chunkingSettingsMap); } - return new ChunkBytesRefEvaluator.Factory( - source(), - toEvaluator.apply(field), - chunkingSettings - ); + return new ChunkBytesRefEvaluator.Factory(source(), toEvaluator.apply(field), chunkingSettings); } private static Map toMap(MapExpression mapExpr) { diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java index 923ad07c418b2..4317a040398a6 100644 --- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java +++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java @@ -3321,11 +3321,7 @@ public void testSubqueryInFromWithLookupJoinOnFullTextFunction() { public void testChunkFunctionInvalidInputs() { if (EsqlCapabilities.Cap.CHUNK_FUNCTION.isEnabled()) { assertThat( - error( - "from test | EVAL chunks = CHUNK(body, null)", - fullTextAnalyzer, - VerificationException.class - ), + error("from test | EVAL chunks = CHUNK(body, null)", fullTextAnalyzer, VerificationException.class), equalTo("1:27: chunking_settings must be a map") ); } diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkTests.java index c0599e49237d9..c72cf83da258c 100644 --- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkTests.java +++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/ChunkTests.java @@ -71,7 +71,7 @@ public static Iterable parameters() { return new TestCaseSupplier.TestCase( List.of(new TestCaseSupplier.TypedData(new BytesRef(text), DataType.KEYWORD, "str")), "ChunkBytesRefEvaluator[str=Attribute[channel=0], " - + "chunkingSettings={\"strategy\":\"sentence\",\"max_chunk_size\":300,\"sentence_overlap\":0}]", + + "chunkingSettings={\"strategy\":\"sentence\",\"max_chunk_size\":300,\"sentence_overlap\":0}]", DataType.KEYWORD, equalTo(expectedResult) ); @@ -142,17 +142,14 @@ public void testRandomChunkingSettings() { private void verifyChunks(ChunkingSettings chunkingSettings, int expectedNumChunksReturned) { ChunkingSettings chunkingSettingsOrDefault = chunkingSettings != null ? chunkingSettings : Chunk.DEFAULT_CHUNKING_SETTINGS; - List expected = Chunk.chunkText(PARAGRAPH_INPUT, chunkingSettingsOrDefault) - .stream() - .map(String::trim) - .toList(); + List expected = Chunk.chunkText(PARAGRAPH_INPUT, chunkingSettingsOrDefault).stream().map(String::trim).toList(); List result = process(PARAGRAPH_INPUT, chunkingSettingsOrDefault); assertThat(result.size(), equalTo(expectedNumChunksReturned)); assertThat(result, equalTo(expected)); } - private List process(String str,ChunkingSettings chunkingSettings) { + private List process(String str, ChunkingSettings chunkingSettings) { MapExpression optionsMap = chunkingSettings == null ? null : createChunkingSettings(chunkingSettings); try ( From 7b153b666f191c2db2edce35077871704221978a Mon Sep 17 00:00:00 2001 From: Kathleen DeRusso Date: Wed, 19 Nov 2025 16:22:19 -0500 Subject: [PATCH 11/15] Verifier tests --- .../src/main/resources/chunk.csv-spec | 1 - .../function/scalar/string/Chunk.java | 10 ++++-- .../xpack/esql/analysis/VerifierTests.java | 35 +++++++++++++++++++ 3 files changed, 43 insertions(+), 3 deletions(-) diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/chunk.csv-spec b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/chunk.csv-spec index 39220912f42a9..e8070ad40cf2f 100644 --- a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/chunk.csv-spec +++ b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/chunk.csv-spec @@ -173,4 +173,3 @@ emp_no:integer | first_name:keyword | last_name:keyword | chunks:keyword - diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java index 1a0b8a5f75f66..c5fe1f9af98f4 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java @@ -122,8 +122,7 @@ protected TypeResolution resolveType() { } private TypeResolution validateChunkingSettings() { - // Just ensure all keys and values are literals - defer valid chunking settings for validation later - if (chunkingSettings instanceof MapExpression == false) { + if (chunkingSettings instanceof MapExpression == false) { return new TypeResolution("chunking_settings must be a map"); } MapExpression chunkingSettingsMap = (MapExpression) chunkingSettings; @@ -135,6 +134,13 @@ private TypeResolution validateChunkingSettings() { return new TypeResolution("chunking_settings values must be constants"); } } + + try { + ChunkingSettingsBuilder.fromMap(toMap(chunkingSettingsMap)); + } catch (IllegalArgumentException e) { + return new TypeResolution(e.getMessage()); + } + return TypeResolution.TYPE_RESOLVED; } diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java index 4317a040398a6..1389f9070d72f 100644 --- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java +++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java @@ -3324,6 +3324,41 @@ public void testChunkFunctionInvalidInputs() { error("from test | EVAL chunks = CHUNK(body, null)", fullTextAnalyzer, VerificationException.class), equalTo("1:27: chunking_settings must be a map") ); + assertThat( + error( + "from test | EVAL chunks = CHUNK(body, {\"strategy\": \"invalid\"})", + fullTextAnalyzer, + VerificationException.class + ), + equalTo("1:27: Invalid chunkingStrategy invalid") + ); + assertThat( + error( + "from test | EVAL chunks = CHUNK(body, {\"strategy\": \"sentence\", \"max_chunk_size\": 5, \"sentence_overlap\": 1})", + fullTextAnalyzer, + VerificationException.class + ), + equalTo("1:27: Validation Failed: 1: [chunking_settings] Invalid value [5.0]. " + + "[max_chunk_size] must be a greater than or equal to [20.0];") + ); + assertThat( + error( + "from test | EVAL chunks = CHUNK(body, {\"strategy\": \"sentence\", \"max_chunk_size\": 5, \"sentence_overlap\": 5})", + fullTextAnalyzer, + VerificationException.class + ), + equalTo("1:27: Validation Failed: 1: [chunking_settings] Invalid value [5.0]. " + + "[max_chunk_size] must be a greater than or equal to [20.0];2: sentence_overlap[5] must be either 0 or 1;") + ); + assertThat( + error( + "from test | EVAL chunks = CHUNK(body, {\"strategy\": \"sentence\", \"max_chunk_size\": 20, " + + "\"sentence_overlap\": 1, \"extra_value\": \"foo\"})", + fullTextAnalyzer, + VerificationException.class + ), + equalTo("1:27: Validation Failed: 1: Sentence based chunking settings can not have the following settings: [extra_value];") + ); } } From 38cc69b5b69a708a27cde9da25cf19fe0afcb9a8 Mon Sep 17 00:00:00 2001 From: Kathleen DeRusso Date: Wed, 19 Nov 2025 16:24:56 -0500 Subject: [PATCH 12/15] Update docs/changelog/138123.yaml --- docs/changelog/138123.yaml | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 docs/changelog/138123.yaml diff --git a/docs/changelog/138123.yaml b/docs/changelog/138123.yaml new file mode 100644 index 0000000000000..c68e47500ffe0 --- /dev/null +++ b/docs/changelog/138123.yaml @@ -0,0 +1,5 @@ +pr: 138123 +summary: ES|QL Update CHUNK to support `chunking_settings` as optional argument +area: ES|QL +type: enhancement +issues: [] From bcb72258f5960b0a232dfdbbf193015cc68bb35f Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Wed, 19 Nov 2025 21:33:07 +0000 Subject: [PATCH 13/15] [CI] Auto commit changes from spotless --- .../function/scalar/string/Chunk.java | 2 +- .../xpack/esql/analysis/VerifierTests.java | 22 +++++++++---------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java index c5fe1f9af98f4..fd20628651d33 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java @@ -122,7 +122,7 @@ protected TypeResolution resolveType() { } private TypeResolution validateChunkingSettings() { - if (chunkingSettings instanceof MapExpression == false) { + if (chunkingSettings instanceof MapExpression == false) { return new TypeResolution("chunking_settings must be a map"); } MapExpression chunkingSettingsMap = (MapExpression) chunkingSettings; diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java index 1389f9070d72f..95b1bca86855b 100644 --- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java +++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java @@ -3325,11 +3325,7 @@ public void testChunkFunctionInvalidInputs() { equalTo("1:27: chunking_settings must be a map") ); assertThat( - error( - "from test | EVAL chunks = CHUNK(body, {\"strategy\": \"invalid\"})", - fullTextAnalyzer, - VerificationException.class - ), + error("from test | EVAL chunks = CHUNK(body, {\"strategy\": \"invalid\"})", fullTextAnalyzer, VerificationException.class), equalTo("1:27: Invalid chunkingStrategy invalid") ); assertThat( @@ -3338,8 +3334,10 @@ public void testChunkFunctionInvalidInputs() { fullTextAnalyzer, VerificationException.class ), - equalTo("1:27: Validation Failed: 1: [chunking_settings] Invalid value [5.0]. " + - "[max_chunk_size] must be a greater than or equal to [20.0];") + equalTo( + "1:27: Validation Failed: 1: [chunking_settings] Invalid value [5.0]. " + + "[max_chunk_size] must be a greater than or equal to [20.0];" + ) ); assertThat( error( @@ -3347,13 +3345,15 @@ public void testChunkFunctionInvalidInputs() { fullTextAnalyzer, VerificationException.class ), - equalTo("1:27: Validation Failed: 1: [chunking_settings] Invalid value [5.0]. " + - "[max_chunk_size] must be a greater than or equal to [20.0];2: sentence_overlap[5] must be either 0 or 1;") + equalTo( + "1:27: Validation Failed: 1: [chunking_settings] Invalid value [5.0]. " + + "[max_chunk_size] must be a greater than or equal to [20.0];2: sentence_overlap[5] must be either 0 or 1;" + ) ); assertThat( error( - "from test | EVAL chunks = CHUNK(body, {\"strategy\": \"sentence\", \"max_chunk_size\": 20, " + - "\"sentence_overlap\": 1, \"extra_value\": \"foo\"})", + "from test | EVAL chunks = CHUNK(body, {\"strategy\": \"sentence\", \"max_chunk_size\": 20, " + + "\"sentence_overlap\": 1, \"extra_value\": \"foo\"})", fullTextAnalyzer, VerificationException.class ), From fbf8ffdd79a86e61eee9c5cd1eaa4206e34536fa Mon Sep 17 00:00:00 2001 From: Kathleen DeRusso Date: Thu, 20 Nov 2025 15:27:48 -0500 Subject: [PATCH 14/15] PR Feedback --- .../esql/_snippets/functions/layout/chunk.md | 4 ++ .../src/main/resources/chunk.csv-spec | 14 ++--- .../xpack/esql/action/EsqlCapabilities.java | 2 +- .../function/scalar/string/Chunk.java | 60 +++++++++---------- .../elasticsearch/xpack/esql/CsvTests.java | 2 +- .../xpack/esql/analysis/VerifierTests.java | 2 +- 6 files changed, 44 insertions(+), 40 deletions(-) diff --git a/docs/reference/query-languages/esql/_snippets/functions/layout/chunk.md b/docs/reference/query-languages/esql/_snippets/functions/layout/chunk.md index a3e67be49499a..174db24b5949b 100644 --- a/docs/reference/query-languages/esql/_snippets/functions/layout/chunk.md +++ b/docs/reference/query-languages/esql/_snippets/functions/layout/chunk.md @@ -1,6 +1,10 @@ % This is generated by ESQL's AbstractFunctionTestCase. Do not edit it. See ../README.md for how to regenerate it. ## `CHUNK` [esql-chunk] +```{applies_to} +stack: preview 9.3.0 +serverless: preview +``` **Syntax** diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/chunk.csv-spec b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/chunk.csv-spec index e8070ad40cf2f..2ae6ce51555c0 100644 --- a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/chunk.csv-spec +++ b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/chunk.csv-spec @@ -3,7 +3,7 @@ # chunkDefaults -required_capability: chunk_function +required_capability: chunk_function_v2 // tag::chunk-with-field[] FROM books @@ -25,7 +25,7 @@ book_no:keyword | title:text | chunks ; chunkWithChunkingSettings -required_capability: chunk_function +required_capability: chunk_function_v2 // tag::chunk-with-chunking-settings[] FROM books @@ -47,7 +47,7 @@ book_no:keyword | title:text | chunks ; chunkTextWithMatch -required_capability: chunk_function +required_capability: chunk_function_v2 FROM books | WHERE MATCH(title, "Return") @@ -61,7 +61,7 @@ book_no:keyword | title:text ; chunkTextWithMatchMultipleChunksMvExpand -required_capability: chunk_function +required_capability: chunk_function_v2 FROM books | WHERE MATCH(title, "Return") @@ -108,7 +108,7 @@ book_no:keyword | title:text ; chunkTextWithMatchMultipleChunksMvSliceMvExpand -required_capability: chunk_function +required_capability: chunk_function_v2 FROM books | WHERE MATCH(title, "Return") @@ -131,7 +131,7 @@ book_no:keyword | title:text chunkTextWithConcatenatedField -required_capability: chunk_function +required_capability: chunk_function_v2 FROM books | EVAL title_description = CONCAT(title, " ", description) @@ -150,7 +150,7 @@ book_no:keyword | title:text | chunks ; chunkTextWithMultivaluedField -required_capability: chunk_function +required_capability: chunk_function_v2 FROM employees | EVAL chunks = CHUNK(job_positions) diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java index 6497ddfc6afbf..7f051e8c7d9df 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java @@ -1658,7 +1658,7 @@ public enum Cap { /** * Chunk function. */ - CHUNK_FUNCTION(Build.current().isSnapshot()), + CHUNK_FUNCTION_V2(Build.current().isSnapshot()), /** * Support for vector similarity functtions pushdown diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java index fd20628651d33..d171c85236278 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java @@ -20,14 +20,16 @@ import org.elasticsearch.xpack.core.inference.chunking.ChunkerBuilder; import org.elasticsearch.xpack.core.inference.chunking.ChunkingSettingsBuilder; import org.elasticsearch.xpack.core.inference.chunking.SentenceBoundaryChunkingSettings; -import org.elasticsearch.xpack.esql.core.expression.EntryExpression; import org.elasticsearch.xpack.esql.core.expression.Expression; +import org.elasticsearch.xpack.esql.core.expression.FoldContext; import org.elasticsearch.xpack.esql.core.expression.Literal; import org.elasticsearch.xpack.esql.core.expression.MapExpression; import org.elasticsearch.xpack.esql.core.tree.NodeInfo; import org.elasticsearch.xpack.esql.core.tree.Source; import org.elasticsearch.xpack.esql.core.type.DataType; import org.elasticsearch.xpack.esql.expression.function.Example; +import org.elasticsearch.xpack.esql.expression.function.FunctionAppliesTo; +import org.elasticsearch.xpack.esql.expression.function.FunctionAppliesToLifecycle; import org.elasticsearch.xpack.esql.expression.function.FunctionInfo; import org.elasticsearch.xpack.esql.expression.function.MapParam; import org.elasticsearch.xpack.esql.expression.function.OptionalArgument; @@ -39,6 +41,7 @@ import java.util.List; import java.util.Map; import java.util.Objects; +import java.util.stream.Collectors; import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.ParamOrdinal.FIRST; import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.isString; @@ -53,6 +56,7 @@ public class Chunk extends EsqlScalarFunction implements OptionalArgument { private final Expression field, chunkingSettings; @FunctionInfo( + appliesTo = { @FunctionAppliesTo(lifeCycle = FunctionAppliesToLifecycle.PREVIEW, version = "9.3.0") }, returnType = "keyword", preview = true, description = """ @@ -113,30 +117,31 @@ protected TypeResolution resolveType() { return new TypeResolution("Unresolved children"); } - TypeResolution fieldResolution = isString(field(), sourceText(), FIRST); - if (fieldResolution.unresolved()) { - return fieldResolution; - } - - return chunkingSettings == null ? TypeResolution.TYPE_RESOLVED : validateChunkingSettings(); + return isString(field(), sourceText(), FIRST).and(this::validateChunkingSettings); } private TypeResolution validateChunkingSettings() { + if (chunkingSettings == null) { + return TypeResolution.TYPE_RESOLVED; + } if (chunkingSettings instanceof MapExpression == false) { return new TypeResolution("chunking_settings must be a map"); } MapExpression chunkingSettingsMap = (MapExpression) chunkingSettings; - for (EntryExpression entry : chunkingSettingsMap.entryExpressions()) { - if (entry.key() instanceof Literal == false || (entry.key()).foldable() == false) { - return new TypeResolution("chunking_settings keys must be constants"); - } - if (entry.value() instanceof Literal == false || (entry.value()).foldable() == false) { - return new TypeResolution("chunking_settings values must be constants"); - } + var errors = chunkingSettingsMap.keyFoldedMap() + .entrySet() + .stream() + .filter(e -> e.getValue() instanceof Literal == false) + .map(e -> "invalid option for [" + e.getKey() + "], expected a constant, found [" + + e.getValue().dataType() + "]") + .toList(); + + if (errors.isEmpty() == false) { + return new TypeResolution(String.join("; ", errors)); } try { - ChunkingSettingsBuilder.fromMap(toMap(chunkingSettingsMap)); + toChunkingSettings(chunkingSettingsMap); } catch (IllegalArgumentException e) { return new TypeResolution(e.getMessage()); } @@ -212,25 +217,20 @@ public EvalOperator.ExpressionEvaluator.Factory toEvaluator(ToEvaluator toEvalua ChunkingSettings chunkingSettings = DEFAULT_CHUNKING_SETTINGS; if (chunkingSettings() != null) { - Map chunkingSettingsMap = toMap((MapExpression) chunkingSettings()); - chunkingSettings = ChunkingSettingsBuilder.fromMap(chunkingSettingsMap); + chunkingSettings = toChunkingSettings((MapExpression) chunkingSettings()); } return new ChunkBytesRefEvaluator.Factory(source(), toEvaluator.apply(field), chunkingSettings); } - private static Map toMap(MapExpression mapExpr) { - Map result = new java.util.HashMap<>(); - for (EntryExpression entry : mapExpr.entryExpressions()) { - Object keyValue = ((Literal) entry.key()).value(); - String key = keyValue instanceof BytesRef br ? br.utf8ToString() : keyValue.toString(); - - Object value = ((Literal) entry.value()).value(); - if (value instanceof BytesRef br) { - value = br.utf8ToString(); - } - result.put(key, value); - } - return result; + private static ChunkingSettings toChunkingSettings(MapExpression map) { + Map chunkingSettingsMap = map.keyFoldedMap() + .entrySet() + .stream() + .collect(Collectors.toMap(Map.Entry::getKey, e -> { + Object value = e.getValue().fold(FoldContext.small()); + return value instanceof BytesRef ? ((BytesRef) value).utf8ToString() : value; + })); + return ChunkingSettingsBuilder.fromMap(chunkingSettingsMap); } } diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/CsvTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/CsvTests.java index fd7cbfb6fa723..0c0d05fc13119 100644 --- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/CsvTests.java +++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/CsvTests.java @@ -357,7 +357,7 @@ public final void test() throws Throwable { ); assumeFalse( "CSV tests cannot currently handle CHUNK function", - testCase.requiredCapabilities.contains(EsqlCapabilities.Cap.CHUNK_FUNCTION.capabilityName()) + testCase.requiredCapabilities.contains(EsqlCapabilities.Cap.CHUNK_FUNCTION_V2.capabilityName()) ); if (Build.current().isSnapshot()) { diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java index 95b1bca86855b..bd9fd2e2df90c 100644 --- a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java +++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/analysis/VerifierTests.java @@ -3319,7 +3319,7 @@ public void testSubqueryInFromWithLookupJoinOnFullTextFunction() { } public void testChunkFunctionInvalidInputs() { - if (EsqlCapabilities.Cap.CHUNK_FUNCTION.isEnabled()) { + if (EsqlCapabilities.Cap.CHUNK_FUNCTION_V2.isEnabled()) { assertThat( error("from test | EVAL chunks = CHUNK(body, null)", fullTextAnalyzer, VerificationException.class), equalTo("1:27: chunking_settings must be a map") From 06fedb7d3d0453949a25b0887b24928991c216df Mon Sep 17 00:00:00 2001 From: elasticsearchmachine Date: Thu, 20 Nov 2025 20:35:37 +0000 Subject: [PATCH 15/15] [CI] Auto commit changes from spotless --- .../expression/function/scalar/string/Chunk.java | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java index d171c85236278..36cc3a10baa27 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/Chunk.java @@ -132,8 +132,7 @@ private TypeResolution validateChunkingSettings() { .entrySet() .stream() .filter(e -> e.getValue() instanceof Literal == false) - .map(e -> "invalid option for [" + e.getKey() + "], expected a constant, found [" + - e.getValue().dataType() + "]") + .map(e -> "invalid option for [" + e.getKey() + "], expected a constant, found [" + e.getValue().dataType() + "]") .toList(); if (errors.isEmpty() == false) { @@ -224,13 +223,10 @@ public EvalOperator.ExpressionEvaluator.Factory toEvaluator(ToEvaluator toEvalua } private static ChunkingSettings toChunkingSettings(MapExpression map) { - Map chunkingSettingsMap = map.keyFoldedMap() - .entrySet() - .stream() - .collect(Collectors.toMap(Map.Entry::getKey, e -> { - Object value = e.getValue().fold(FoldContext.small()); - return value instanceof BytesRef ? ((BytesRef) value).utf8ToString() : value; - })); + Map chunkingSettingsMap = map.keyFoldedMap().entrySet().stream().collect(Collectors.toMap(Map.Entry::getKey, e -> { + Object value = e.getValue().fold(FoldContext.small()); + return value instanceof BytesRef ? ((BytesRef) value).utf8ToString() : value; + })); return ChunkingSettingsBuilder.fromMap(chunkingSettingsMap); } }