diff --git a/docs/reference/query-languages/esql/_snippets/functions/description/to_ascii.md b/docs/reference/query-languages/esql/_snippets/functions/description/to_ascii.md new file mode 100644 index 0000000000000..07890b8de106a --- /dev/null +++ b/docs/reference/query-languages/esql/_snippets/functions/description/to_ascii.md @@ -0,0 +1,6 @@ +% This is generated by ESQL's AbstractFunctionTestCase. Do not edit it. See ../README.md for how to regenerate it. + +**Description** + +Escape non ASCII characters. + diff --git a/docs/reference/query-languages/esql/_snippets/functions/examples/to_ascii.md b/docs/reference/query-languages/esql/_snippets/functions/examples/to_ascii.md new file mode 100644 index 0000000000000..e801695cb4421 --- /dev/null +++ b/docs/reference/query-languages/esql/_snippets/functions/examples/to_ascii.md @@ -0,0 +1,13 @@ +% This is generated by ESQL's AbstractFunctionTestCase. Do not edit it. See ../README.md for how to regenerate it. + +**Example** + +```esql +ROW a = "Hello\\n\\t 世界! 🌍 Café naïve résumé こんにちは 🎉 中文测试 αβγδε 日本語テスト 🚀🔥💧🪨" | EVAL x = TO_ASCII(a) | KEEP x; +``` + +| x:keyword | +| --- | +| Hello\\n\\t \\u4e16\\u754c! \\U0001f30d Caf\\xe9 na\\xefve r\\xe9sum\\xe9 \\u3053\\u3093\\u306b\\u3061\\u306f \\U0001f389 \\u4e2d\\u6587\\u6d4b\\u8bd5 \\u03b1\\u03b2\\u03b3\\u03b4\\u03b5 \\u65e5\\u672c\\u8a9e\\u30c6\\u30b9\\u30c8 \\U0001f680\\U0001f525\\U0001f4a7\\U0001faa8 | + + diff --git a/docs/reference/query-languages/esql/_snippets/functions/layout/to_ascii.md b/docs/reference/query-languages/esql/_snippets/functions/layout/to_ascii.md new file mode 100644 index 0000000000000..aa08b9f17dc23 --- /dev/null +++ b/docs/reference/query-languages/esql/_snippets/functions/layout/to_ascii.md @@ -0,0 +1,26 @@ +% This is generated by ESQL's AbstractFunctionTestCase. Do not edit it. See ../README.md for how to regenerate it. 
+ +## `TO_ASCII` [esql-to_ascii] +```{applies_to} +stack: ga 9.2.0 +``` + +**Syntax** + +:::{image} ../../../images/functions/to_ascii.svg +:alt: Embedded +:class: text-center +::: + + +:::{include} ../parameters/to_ascii.md +::: + +:::{include} ../description/to_ascii.md +::: + +:::{include} ../types/to_ascii.md +::: + +:::{include} ../examples/to_ascii.md +::: diff --git a/docs/reference/query-languages/esql/_snippets/functions/parameters/to_ascii.md b/docs/reference/query-languages/esql/_snippets/functions/parameters/to_ascii.md new file mode 100644 index 0000000000000..c0b0fc4761d7a --- /dev/null +++ b/docs/reference/query-languages/esql/_snippets/functions/parameters/to_ascii.md @@ -0,0 +1,7 @@ +% This is generated by ESQL's AbstractFunctionTestCase. Do not edit it. See ../README.md for how to regenerate it. + +**Parameters** + +`string` +: String expression. If `null`, the function returns `null`. + diff --git a/docs/reference/query-languages/esql/_snippets/functions/types/to_ascii.md b/docs/reference/query-languages/esql/_snippets/functions/types/to_ascii.md new file mode 100644 index 0000000000000..7221b9139e2b8 --- /dev/null +++ b/docs/reference/query-languages/esql/_snippets/functions/types/to_ascii.md @@ -0,0 +1,9 @@ +% This is generated by ESQL's AbstractFunctionTestCase. Do not edit it. See ../README.md for how to regenerate it. 
+ +**Supported types** + +| string | result | +| --- | --- | +| keyword | keyword | +| text | keyword | + diff --git a/docs/reference/query-languages/esql/_snippets/lists/string-functions.md b/docs/reference/query-languages/esql/_snippets/lists/string-functions.md index ac5f0a327fadb..7bec3b758f9f4 100644 --- a/docs/reference/query-languages/esql/_snippets/lists/string-functions.md +++ b/docs/reference/query-languages/esql/_snippets/lists/string-functions.md @@ -1,3 +1,4 @@ +* [`TO_ASCII`](../../functions-operators/string-functions.md#esql-to_ascii) * [`BIT_LENGTH`](../../functions-operators/string-functions.md#esql-bit_length) * [`BYTE_LENGTH`](../../functions-operators/string-functions.md#esql-byte_length) * [`CONCAT`](../../functions-operators/string-functions.md#esql-concat) diff --git a/docs/reference/query-languages/esql/functions-operators/string-functions.md b/docs/reference/query-languages/esql/functions-operators/string-functions.md index 6b887bd13212d..309eacb50e30b 100644 --- a/docs/reference/query-languages/esql/functions-operators/string-functions.md +++ b/docs/reference/query-languages/esql/functions-operators/string-functions.md @@ -14,6 +14,8 @@ mapped_pages: :::{include} ../_snippets/lists/string-functions.md ::: +:::{include} ../_snippets/functions/layout/to_ascii.md +::: :::{include} ../_snippets/functions/layout/bit_length.md ::: diff --git a/docs/reference/query-languages/esql/images/functions/to_ascii.svg b/docs/reference/query-languages/esql/images/functions/to_ascii.svg new file mode 100644 index 0000000000000..c7f20de3c671f --- /dev/null +++ b/docs/reference/query-languages/esql/images/functions/to_ascii.svg @@ -0,0 +1 @@ +TO_ASCII(string) \ No newline at end of file diff --git a/docs/reference/query-languages/esql/kibana/definition/functions/to_ascii.json b/docs/reference/query-languages/esql/kibana/definition/functions/to_ascii.json new file mode 100644 index 0000000000000..c1140fd00b68c --- /dev/null +++ 
b/docs/reference/query-languages/esql/kibana/definition/functions/to_ascii.json @@ -0,0 +1,37 @@ +{ + "comment" : "This is generated by ESQL's AbstractFunctionTestCase. Do not edit it. See ../README.md for how to regenerate it.", + "type" : "scalar", + "name" : "to_ascii", + "description" : "Escape non ASCII characters.", + "signatures" : [ + { + "params" : [ + { + "name" : "string", + "type" : "keyword", + "optional" : false, + "description" : "String expression. If `null`, the function returns `null`." + } + ], + "variadic" : false, + "returnType" : "keyword" + }, + { + "params" : [ + { + "name" : "string", + "type" : "text", + "optional" : false, + "description" : "String expression. If `null`, the function returns `null`." + } + ], + "variadic" : false, + "returnType" : "keyword" + } + ], + "examples" : [ + "ROW a = \"Hello\\\\n\\\\t 世界! \uD83C\uDF0D Café naïve résumé こんにちは \uD83C\uDF89 中文测试 αβγδε 日本語テスト \uD83D\uDE80\uD83D\uDD25\uD83D\uDCA7\uD83E\uDEA8\" | EVAL x = TO_ASCII(a) | KEEP x;" + ], + "preview" : false, + "snapshot_only" : false +} diff --git a/docs/reference/query-languages/esql/kibana/docs/functions/to_ascii.md b/docs/reference/query-languages/esql/kibana/docs/functions/to_ascii.md new file mode 100644 index 0000000000000..a12aa975ba128 --- /dev/null +++ b/docs/reference/query-languages/esql/kibana/docs/functions/to_ascii.md @@ -0,0 +1,8 @@ +% This is generated by ESQL's AbstractFunctionTestCase. Do not edit it. See ../README.md for how to regenerate it. + +### TO ASCII +Escape non ASCII characters. + +```esql +ROW a = "Hello\\n\\t 世界! 
🌍 Café naïve résumé こんにちは 🎉 中文测试 αβγδε 日本語テスト 🚀🔥💧🪨" | EVAL x = TO_ASCII(a) | KEEP x; +``` diff --git a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/string.csv-spec b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/string.csv-spec index 4d9ddb83ae301..239f6648e6242 100644 --- a/x-pack/plugin/esql/qa/testFixtures/src/main/resources/string.csv-spec +++ b/x-pack/plugin/esql/qa/testFixtures/src/main/resources/string.csv-spec @@ -2748,3 +2748,14 @@ book_no:keyword | author_encoded:keyword | title_encoded:keyword 1463 | J.%20R.%20R.%20Tolkien | Realms%20of%20Tolkien%3A%20Images%20of%20Middle-earth ; +to_ascii +required_capability: to_ascii +// tag::to_ascii[] +ROW a = "Hello\n\t 世界! 🌍 Café naïve résumé こんにちは 🎉 中文测试 αβγδε 日本語テスト 🚀🔥💧🪨" | EVAL x = TO_ASCII(a) | KEEP x; +// end::to_ascii[] + +// tag::to_ascii-result[] +x:keyword +Hello\\n\\t \\u4e16\\u754c! \\U0001f30d Caf\\xe9 na\\xefve r\\xe9sum\\xe9 \\u3053\\u3093\\u306b\\u3061\\u306f \\U0001f389 \\u4e2d\\u6587\\u6d4b\\u8bd5 \\u03b1\\u03b2\\u03b3\\u03b4\\u03b5 \\u65e5\\u672c\\u8a9e\\u30c6\\u30b9\\u30c8 \\U0001f680\\U0001f525\\U0001f4a7\\U0001faa8 +// end::to_ascii-result[] +; diff --git a/x-pack/plugin/esql/src/main/generated/org/elasticsearch/xpack/esql/expression/function/scalar/string/ToAsciiEvaluator.java b/x-pack/plugin/esql/src/main/generated/org/elasticsearch/xpack/esql/expression/function/scalar/string/ToAsciiEvaluator.java new file mode 100644 index 0000000000000..1a89a0ab38bcb --- /dev/null +++ b/x-pack/plugin/esql/src/main/generated/org/elasticsearch/xpack/esql/expression/function/scalar/string/ToAsciiEvaluator.java @@ -0,0 +1,172 @@ +// Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one +// or more contributor license agreements. Licensed under the Elastic License +// 2.0; you may not use this file except in compliance with the Elastic License +// 2.0. 
+package org.elasticsearch.xpack.esql.expression.function.scalar.string; + +import java.lang.Override; +import java.lang.String; +import java.util.function.Function; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.UnicodeUtil; +import org.elasticsearch.compute.data.Block; +import org.elasticsearch.compute.data.BytesRefBlock; +import org.elasticsearch.compute.data.BytesRefVector; +import org.elasticsearch.compute.data.IntVector; +import org.elasticsearch.compute.data.OrdinalBytesRefVector; +import org.elasticsearch.compute.data.Vector; +import org.elasticsearch.compute.operator.BreakingBytesRefBuilder; +import org.elasticsearch.compute.operator.DriverContext; +import org.elasticsearch.compute.operator.EvalOperator; +import org.elasticsearch.core.Releasables; +import org.elasticsearch.xpack.esql.core.tree.Source; +import org.elasticsearch.xpack.esql.expression.function.scalar.convert.AbstractConvertFunction; + +/** + * {@link EvalOperator.ExpressionEvaluator} implementation for {@link ToAscii}. + * This class is generated. Edit {@code ConvertEvaluatorImplementer} instead. 
+ */
+public final class ToAsciiEvaluator extends AbstractConvertFunction.AbstractEvaluator {
+  private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(ToAsciiEvaluator.class);
+
+  // Evaluator that produces the input values; exposed through next() and closed in close().
+  private final EvalOperator.ExpressionEvaluator val;
+
+  // Output buffer passed to every ToAscii.process call; closed in close().
+  private final BreakingBytesRefBuilder scratch;
+
+  // Decoding holder passed to every ToAscii.process call.
+  private final UnicodeUtil.UTF8CodePoint codePoint;
+
+  public ToAsciiEvaluator(Source source, EvalOperator.ExpressionEvaluator val,
+      BreakingBytesRefBuilder scratch, UnicodeUtil.UTF8CodePoint codePoint,
+      DriverContext driverContext) {
+    super(driverContext, source);
+    this.val = val;
+    this.scratch = scratch;
+    this.codePoint = codePoint;
+  }
+
+  @Override
+  public EvalOperator.ExpressionEvaluator next() {
+    return val;
+  }
+
+  /**
+   * Converts every position of a vector. Vectors that expose ordinals are handled by
+   * {@link #evalOrdinals}; a constant vector converts only the value at index 0 and returns
+   * a constant block of the same position count.
+   */
+  @Override
+  public Block evalVector(Vector v) {
+    BytesRefVector vector = (BytesRefVector) v;
+    OrdinalBytesRefVector ordinals = vector.asOrdinals();
+    if (ordinals != null) {
+      return evalOrdinals(ordinals);
+    }
+    int positionCount = v.getPositionCount();
+    BytesRef scratchPad = new BytesRef();
+    if (vector.isConstant()) {
+      return driverContext.blockFactory().newConstantBytesRefBlockWith(evalValue(vector, 0, scratchPad), positionCount);
+    }
+    try (BytesRefBlock.Builder builder = driverContext.blockFactory().newBytesRefBlockBuilder(positionCount)) {
+      for (int p = 0; p < positionCount; p++) {
+        builder.appendBytesRef(evalValue(vector, p, scratchPad));
+      }
+      return builder.build();
+    }
+  }
+
+  // Reads the value at index from the vector and converts it with ToAscii.process.
+  private BytesRef evalValue(BytesRefVector container, int index, BytesRef scratchPad) {
+    BytesRef value = container.getBytesRef(index, scratchPad);
+    return ToAscii.process(value, this.scratch, this.codePoint);
+  }
+
+  /**
+   * Converts a block position by position. A position without values becomes null; a position
+   * with more than one value is wrapped in beginPositionEntry()/endPositionEntry().
+   */
+  @Override
+  public Block evalBlock(Block b) {
+    BytesRefBlock block = (BytesRefBlock) b;
+    int positionCount = block.getPositionCount();
+    try (BytesRefBlock.Builder builder = driverContext.blockFactory().newBytesRefBlockBuilder(positionCount)) {
+      BytesRef scratchPad = new BytesRef();
+      for (int p = 0; p < positionCount; p++) {
+        int valueCount = block.getValueCount(p);
+        int start = block.getFirstValueIndex(p);
+        int end = start + valueCount;
+        boolean positionOpened = false;
+        boolean valuesAppended = false;
+        for (int i = start; i < end; i++) {
+          BytesRef value = evalValue(block, i, scratchPad);
+          if (positionOpened == false && valueCount > 1) {
+            builder.beginPositionEntry();
+            positionOpened = true;
+          }
+          builder.appendBytesRef(value);
+          valuesAppended = true;
+        }
+        if (valuesAppended == false) {
+          builder.appendNull();
+        } else if (positionOpened) {
+          builder.endPositionEntry();
+        }
+      }
+      return builder.build();
+    }
+  }
+
+  // Reads the value at index from the block and converts it with ToAscii.process.
+  private BytesRef evalValue(BytesRefBlock container, int index, BytesRef scratchPad) {
+    BytesRef value = container.getBytesRef(index, scratchPad);
+    return ToAscii.process(value, this.scratch, this.codePoint);
+  }
+
+  /**
+   * Converts each entry of the dictionary vector once and pairs the converted dictionary with
+   * the original ordinals vector, whose reference count is incremented first.
+   */
+  private Block evalOrdinals(OrdinalBytesRefVector v) {
+    int positionCount = v.getDictionaryVector().getPositionCount();
+    BytesRef scratchPad = new BytesRef();
+    try (BytesRefVector.Builder builder = driverContext.blockFactory().newBytesRefVectorBuilder(positionCount)) {
+      for (int p = 0; p < positionCount; p++) {
+        builder.appendBytesRef(evalValue(v.getDictionaryVector(), p, scratchPad));
+      }
+      IntVector ordinals = v.getOrdinalsVector();
+      ordinals.incRef();
+      return new OrdinalBytesRefVector(ordinals, builder.build()).asBlock();
+    }
+  }
+
+  @Override
+  public String toString() {
+    return "ToAsciiEvaluator[" + "val=" + val + "]";
+  }
+
+  @Override
+  public void close() {
+    Releasables.closeExpectNoException(val, scratch);
+  }
+
+  // Shallow size of this instance plus whatever the wrapped evaluator reports.
+  @Override
+  public long baseRamBytesUsed() {
+    long baseRamBytesUsed = BASE_RAM_BYTES_USED;
+    baseRamBytesUsed += val.baseRamBytesUsed();
+    return baseRamBytesUsed;
+  }
+
+  /**
+   * Creates one {@link ToAsciiEvaluator} per {@link DriverContext}. The scratch buffer and the
+   * code point holder are obtained by applying the supplied functions to that context.
+   */
+  public static class Factory implements EvalOperator.ExpressionEvaluator.Factory {
+    private final Source source;
+
+    private final EvalOperator.ExpressionEvaluator.Factory val;
+
+    // Typed so that apply() yields the exact constructor argument types used in get().
+    private final Function<DriverContext, BreakingBytesRefBuilder> scratch;
+
+    private final Function<DriverContext, UnicodeUtil.UTF8CodePoint> codePoint;
+
+    public Factory(Source source, EvalOperator.ExpressionEvaluator.Factory val,
+        Function<DriverContext, BreakingBytesRefBuilder> scratch,
+        Function<DriverContext, UnicodeUtil.UTF8CodePoint> codePoint) {
+      this.source = source;
+      this.val = val;
+      this.scratch = scratch;
+      this.codePoint = codePoint;
+    }
+
+    @Override
+    public ToAsciiEvaluator get(DriverContext context) {
+      return new ToAsciiEvaluator(source, val.get(context), scratch.apply(context), codePoint.apply(context), context);
+    }
+
+    @Override
+    public String toString() {
+      return "ToAsciiEvaluator[" + "val=" + val + "]";
+    }
+  }
+}
diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java index 3acbb4d36899e..928cfbba97c71 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/action/EsqlCapabilities.java @@ -1529,7 +1529,12 @@ public enum Cap { /** * Fix double release in inline stats when LocalRelation is reused */ - INLINE_STATS_DOUBLE_RELEASE_FIX(INLINESTATS_V11.enabled) + INLINE_STATS_DOUBLE_RELEASE_FIX(INLINESTATS_V11.enabled), + + /** + * Support for string function TO_ASCII + */ + TO_ASCII ; diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/EsqlFunctionRegistry.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/EsqlFunctionRegistry.java index 2f4d72338b4fc..dc576ae16fafc 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/EsqlFunctionRegistry.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/EsqlFunctionRegistry.java @@ -197,6 +197,7 @@ import org.elasticsearch.xpack.esql.expression.function.scalar.string.Split; import org.elasticsearch.xpack.esql.expression.function.scalar.string.StartsWith; import
org.elasticsearch.xpack.esql.expression.function.scalar.string.Substring; +import org.elasticsearch.xpack.esql.expression.function.scalar.string.ToAscii; import org.elasticsearch.xpack.esql.expression.function.scalar.string.ToLower; import org.elasticsearch.xpack.esql.expression.function.scalar.string.ToUpper; import org.elasticsearch.xpack.esql.expression.function.scalar.string.Trim; @@ -397,6 +398,7 @@ private static FunctionDefinition[][] functions() { def(Tau.class, Tau::new, "tau") }, // string new FunctionDefinition[] { + def(ToAscii.class, ToAscii::new, "to_ascii"), def(BitLength.class, BitLength::new, "bit_length"), def(ByteLength.class, ByteLength::new, "byte_length"), def(Concat.class, Concat::new, "concat"), diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/ScalarFunctionWritables.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/ScalarFunctionWritables.java index 961d577692aa0..99faa1d372a99 100644 --- a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/ScalarFunctionWritables.java +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/ScalarFunctionWritables.java @@ -53,6 +53,7 @@ import org.elasticsearch.xpack.esql.expression.function.scalar.string.Split; import org.elasticsearch.xpack.esql.expression.function.scalar.string.StartsWith; import org.elasticsearch.xpack.esql.expression.function.scalar.string.Substring; +import org.elasticsearch.xpack.esql.expression.function.scalar.string.ToAscii; import org.elasticsearch.xpack.esql.expression.function.scalar.string.ToLower; import org.elasticsearch.xpack.esql.expression.function.scalar.string.ToUpper; import org.elasticsearch.xpack.esql.expression.predicate.logical.And; @@ -67,6 +68,7 @@ public class ScalarFunctionWritables { public static List getNamedWriteables() { List entries = new ArrayList<>(); entries.add(And.ENTRY); + 
entries.add(ToAscii.ENTRY); entries.add(Atan2.ENTRY); entries.add(BitLength.ENTRY); entries.add(Case.ENTRY); diff --git a/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/ToAscii.java b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/ToAscii.java new file mode 100644 index 0000000000000..9a0f9c8dc222a --- /dev/null +++ b/x-pack/plugin/esql/src/main/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/ToAscii.java @@ -0,0 +1,169 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.esql.expression.function.scalar.string; + +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.UnicodeUtil; +import org.elasticsearch.common.Strings; +import org.elasticsearch.common.io.stream.NamedWriteableRegistry; +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.compute.ann.ConvertEvaluator; +import org.elasticsearch.compute.ann.Fixed; +import org.elasticsearch.compute.operator.BreakingBytesRefBuilder; +import org.elasticsearch.compute.operator.EvalOperator.ExpressionEvaluator; +import org.elasticsearch.xpack.esql.core.expression.Expression; +import org.elasticsearch.xpack.esql.core.expression.TypeResolutions; +import org.elasticsearch.xpack.esql.core.tree.NodeInfo; +import org.elasticsearch.xpack.esql.core.tree.Source; +import org.elasticsearch.xpack.esql.core.type.DataType; +import org.elasticsearch.xpack.esql.expression.function.Example; +import org.elasticsearch.xpack.esql.expression.function.FunctionAppliesTo; +import org.elasticsearch.xpack.esql.expression.function.FunctionAppliesToLifecycle; +import org.elasticsearch.xpack.esql.expression.function.FunctionInfo; +import 
org.elasticsearch.xpack.esql.expression.function.Param; +import org.elasticsearch.xpack.esql.expression.function.scalar.UnaryScalarFunction; + +import java.io.IOException; +import java.util.List; + +import static org.elasticsearch.compute.ann.Fixed.Scope.THREAD_LOCAL; +import static org.elasticsearch.xpack.esql.core.expression.TypeResolutions.isString; +
+/**
+ * {@code TO_ASCII} scalar function: returns its string argument with every code point that is
+ * not a plain printable ASCII character replaced by an ASCII escape sequence. Accepts
+ * {@code keyword} and {@code text} input and always produces {@code keyword}.
+ */
+public final class ToAscii extends UnaryScalarFunction {
+    public static final NamedWriteableRegistry.Entry ENTRY = new NamedWriteableRegistry.Entry(Expression.class, "ToAscii", ToAscii::new);
+
+    @FunctionInfo(
+        returnType = { "keyword" },
+        description = "Escape non ASCII characters.",
+        examples = @Example(file = "string", tag = "to_ascii"),
+        appliesTo = { @FunctionAppliesTo(lifeCycle = FunctionAppliesToLifecycle.GA, version = "9.2.0") }
+    )
+    public ToAscii(
+        Source source,
+        @Param(
+            name = "string",
+            type = { "keyword", "text" },
+            description = "String expression. If `null`, the function returns `null`."
+        ) Expression str
+    ) {
+        super(source, str);
+    }
+
+    /** Deserialization constructor referenced by {@link #ENTRY}. */
+    private ToAscii(StreamInput in) throws IOException {
+        super(in);
+    }
+
+    @Override
+    public DataType dataType() {
+        return DataType.KEYWORD;
+    }
+
+    @Override
+    public String getWriteableName() {
+        return ENTRY.name;
+    }
+
+    /** Accepts any string-typed argument; reports unresolved children before checking the type. */
+    @Override
+    protected TypeResolution resolveType() {
+        if (childrenResolved() == false) {
+            return new TypeResolution("Unresolved children");
+        }
+
+        return isString(field, sourceText(), TypeResolutions.ParamOrdinal.DEFAULT);
+    }
+
+    /**
+     * Builds the evaluator factory. Every evaluator instance receives its own output buffer
+     * (accounted against the context's breaker under the label {@code to_ascii}) and its own
+     * {@link UnicodeUtil.UTF8CodePoint} holder.
+     */
+    @Override
+    public ExpressionEvaluator.Factory toEvaluator(ToEvaluator toEvaluator) {
+        // Named differently from the inherited "field" member to avoid shadowing it.
+        var fieldEvaluator = toEvaluator.apply(field());
+        return new ToAsciiEvaluator.Factory(
+            source(),
+            fieldEvaluator,
+            context -> new BreakingBytesRefBuilder(context.breaker(), "to_ascii"),
+            context -> new UnicodeUtil.UTF8CodePoint()
+        );
+    }
+
+    @Override
+    public Expression replaceChildren(List<Expression> newChildren) {
+        return new ToAscii(source(), newChildren.get(0));
+    }
+
+    @Override
+    protected NodeInfo<? extends Expression> info() {
+        return NodeInfo.create(this, ToAscii::new, field());
+    }
+
+    /**
+     * Escapes one UTF-8 encoded value in a single pass.
+     * <p>
+     * Bytes of code points that need no escaping are copied to {@code scratch} unchanged, one
+     * contiguous run at a time; every other code point is replaced by the text returned from
+     * {@link #escapeFor(int)}.
+     *
+     * @param val       UTF-8 bytes to escape; not modified
+     * @param scratch   reusable output buffer; cleared on entry and overwritten on every call
+     * @param codePoint reusable holder filled by {@link UnicodeUtil#codePointAt}
+     * @return a view over {@code scratch}; it is only valid until {@code scratch} is reused
+     */
+    @ConvertEvaluator
+    static BytesRef process(
+        BytesRef val,
+        @Fixed(includeInToString = false, scope = THREAD_LOCAL) BreakingBytesRefBuilder scratch,
+        @Fixed(includeInToString = false, scope = THREAD_LOCAL) UnicodeUtil.UTF8CodePoint codePoint
+    ) {
+        // Pre-reserve at least as much as the input.
+        scratch.grow(val.length);
+        scratch.clear();
+
+        final int end = val.offset + val.length;
+        // Start of the run of input bytes that still has to be copied verbatim.
+        int pendingStart = val.offset;
+        int offset = val.offset;
+        while (offset < end) {
+            codePoint = UnicodeUtil.codePointAt(val.bytes, offset, codePoint);
+            final int next = offset + codePoint.numBytes;
+            final String escaped = escapeFor(codePoint.codePoint);
+            if (escaped != null) {
+                // Flush the unescaped bytes collected so far, then emit the escape sequence.
+                if (pendingStart < offset) {
+                    scratch.append(new BytesRef(val.bytes, pendingStart, offset - pendingStart));
+                }
+                scratch.append(new BytesRef(escaped));
+                pendingStart = next;
+            }
+            offset = next;
+        }
+        if (pendingStart < end) {
+            scratch.append(new BytesRef(val.bytes, pendingStart, end - pendingStart));
+        }
+
+        return scratch.bytesRefView();
+    }
+
+    /**
+     * Returns the replacement text for a code point, or {@code null} when the code point is
+     * copied to the output unchanged.
+     * <ul>
+     * <li>newline, carriage return, tab, backspace, form feed, backslash, single quote and
+     * double quote have dedicated escapes;</li>
+     * <li>all other printable ASCII (32-126) is left alone;</li>
+     * <li>remaining code points up to 0xFF use xHH, up to 0xFFFF use uHHHH, and anything larger
+     * uses UHHHHHHHH, all with lower-case hex digits.</li>
+     * </ul>
+     * NOTE(review): at runtime every sequence below starts with two backslash characters, which
+     * is also what ToAsciiTests expects - confirm the doubling is intended.
+     */
+    private static String escapeFor(int code) {
+        final String named = switch (code) {
+            case '\n' -> "\\\\n";
+            case '\r' -> "\\\\r";
+            case '\t' -> "\\\\t";
+            case '\b' -> "\\\\b";
+            case '\f' -> "\\\\f";
+            case '\\' -> "\\\\\\\\";
+            case '\'' -> "\\\\'";
+            case '\"' -> "\\\\\"";
+            default -> null;
+        };
+        if (named != null) {
+            return named;
+        }
+
+        // Printable ASCII characters (32-126) don't need escaping
+        if (code >= 32 && code <= 126) {
+            return null;
+        }
+
+        final String template;
+        if (code <= 0xFF) {
+            // Unnamed control characters, DEL and code points 128-255
+            template = "\\\\x%02x";
+        } else if (code <= 0xFFFF) {
+            // Code points 256-65535
+            template = "\\\\u%04x";
+        } else {
+            // Code points above 65535
+            template = "\\\\U%08x";
+        }
+        return Strings.format(template, code);
+    }
+}
diff --git a/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/ToAsciiTests.java b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/ToAsciiTests.java new file mode 100644 index 0000000000000..65bf77ab846c0 --- /dev/null +++ b/x-pack/plugin/esql/src/test/java/org/elasticsearch/xpack/esql/expression/function/scalar/string/ToAsciiTests.java @@ -0,0 +1,171 @@ +/* + * Copyright Elasticsearch B.V.
and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0; you may not use this file except in compliance with the Elastic License + * 2.0. + */ + +package org.elasticsearch.xpack.esql.expression.function.scalar.string; + +import com.carrotsearch.randomizedtesting.annotations.Name; +import com.carrotsearch.randomizedtesting.annotations.ParametersFactory; + +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.xpack.esql.core.expression.Expression; +import org.elasticsearch.xpack.esql.core.tree.Source; +import org.elasticsearch.xpack.esql.core.type.DataType; +import org.elasticsearch.xpack.esql.expression.function.AbstractScalarFunctionTestCase; +import org.elasticsearch.xpack.esql.expression.function.TestCaseSupplier; + +import java.util.ArrayList; +import java.util.List; +import java.util.function.Supplier; + +import static org.hamcrest.Matchers.equalTo; +
+/**
+ * Parameterized tests for {@link ToAscii}. Every case passes a single string argument and
+ * checks the {@code keyword} result as well as the evaluator's {@code toString()}.
+ */
+public class ToAsciiTests extends AbstractScalarFunctionTestCase {
+    /** Evaluator description expected by every case in this class. */
+    private static final String EVALUATOR_TO_STRING = "ToAsciiEvaluator[val=Attribute[channel=0]]";
+
+    public ToAsciiTests(@Name("TestCase") Supplier<TestCaseSupplier.TestCase> testCaseSupplier) {
+        this.testCase = testCaseSupplier.get();
+    }
+
+    @ParametersFactory
+    public static Iterable<Object[]> parameters() {
+        List<TestCaseSupplier> cases = new ArrayList<>();
+
+        // Random letters are printable ASCII and must come back unchanged for both input types
+        cases.add(unchangedCase("ASCII printable characters keyword", DataType.KEYWORD));
+        cases.add(unchangedCase("ASCII printable characters text", DataType.TEXT));
+
+        // Code points 128-255 use the two-digit x escape
+        cases.add(
+            escapedCase("accented Latin letters", DataType.KEYWORD, "Café naïve résumé", "Caf\\\\xe9 na\\\\xefve r\\\\xe9sum\\\\xe9")
+        );
+
+        // Characters with a dedicated escape
+        cases.add(
+            escapedCase("control characters", DataType.KEYWORD, "hello\nworld\r\t\"tab", "hello\\\\nworld\\\\r\\\\t\\\\\"tab")
+        );
+        cases.add(
+            escapedCase(
+                "backspace, form feed, backslash and single quote",
+                DataType.KEYWORD,
+                "\b\f\\'",
+                "\\\\b" + "\\\\f" + "\\\\\\\\" + "\\\\'"
+            )
+        );
+
+        // ASCII outside 32-126 without a dedicated escape falls back to the x escape
+        cases.add(escapedCase("unnamed control characters", DataType.KEYWORD, "\u0001\u007f", "\\\\x01" + "\\\\x7f"));
+
+        // Code points 256-65535 use the four-digit u escape
+        cases.add(escapedCase("Chinese characters", DataType.KEYWORD, "你好世界", "\\\\u4f60\\\\u597d\\\\u4e16\\\\u754c"));
+        cases.add(
+            escapedCase("Japanese characters", DataType.KEYWORD, "こんにちは", "\\\\u3053\\\\u3093\\\\u306b\\\\u3061\\\\u306f")
+        );
+        cases.add(escapedCase("Greek letters", DataType.KEYWORD, "αβγδε", "\\\\u03b1\\\\u03b2\\\\u03b3\\\\u03b4\\\\u03b5"));
+
+        // Code points above 65535 use the eight-digit U escape
+        cases.add(
+            escapedCase("emojis", DataType.KEYWORD, "🚀🔥💧🪨", "\\\\U0001f680\\\\U0001f525\\\\U0001f4a7\\\\U0001faa8")
+        );
+
+        cases.add(escapedCase("mixed content", DataType.KEYWORD, "Hello 世界! 🌍", "Hello \\\\u4e16\\\\u754c! \\\\U0001f30d"));
+        cases.add(escapedCase("empty string", DataType.KEYWORD, "", ""));
+        cases.add(escapedCase("TEXT type", DataType.TEXT, "Café", "Caf\\\\xe9"));
+
+        return parameterSuppliersFromTypedDataWithDefaultChecks(true, cases);
+    }
+
+    /**
+     * Case whose input is a random string of letters; the expected output equals the input.
+     * The input is drawn inside the supplier so each invocation gets a fresh random value.
+     */
+    private static TestCaseSupplier unchangedCase(String name, DataType inputType) {
+        return new TestCaseSupplier(name, List.of(inputType), () -> {
+            String input = randomAlphaOfLength(between(1, 100));
+            return testCase(inputType, input, input);
+        });
+    }
+
+    /** Case with a fixed input and the exact output {@link ToAscii} must produce for it. */
+    private static TestCaseSupplier escapedCase(String name, DataType inputType, String input, String expected) {
+        return new TestCaseSupplier(name, List.of(inputType), () -> testCase(inputType, input, expected));
+    }
+
+    /** Builds the test case shared by all suppliers: one string argument, {@code keyword} result. */
+    private static TestCaseSupplier.TestCase testCase(DataType inputType, String input, String expected) {
+        return new TestCaseSupplier.TestCase(
+            List.of(new TestCaseSupplier.TypedData(new BytesRef(input), inputType, "str")),
+            EVALUATOR_TO_STRING,
+            DataType.KEYWORD,
+            equalTo(new BytesRef(expected))
+        );
+    }
+
+    @Override
+    protected Expression build(Source source, List<Expression> args) {
+        return new ToAscii(source, args.get(0));
+    }
+}