diff --git a/src/Nest/Analysis/Tokenizers/CharGroupTokenizer.cs b/src/Nest/Analysis/Tokenizers/CharGroupTokenizer.cs
index a7beb16ba0e..73f06b0aea8 100644
--- a/src/Nest/Analysis/Tokenizers/CharGroupTokenizer.cs
+++ b/src/Nest/Analysis/Tokenizers/CharGroupTokenizer.cs
@@ -1,9 +1,10 @@
-// Licensed to Elasticsearch B.V under one or more agreements.
+// Licensed to Elasticsearch B.V under one or more agreements.
 // Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
 // See the LICENSE file in the project root for more information
 
-using System.Collections.Generic;
+using System.Collections.Generic;
 using System.Runtime.Serialization;
+using Elasticsearch.Net.Utf8Json;
 
 namespace Nest
 {
@@ -20,6 +21,16 @@ public interface ICharGroupTokenizer : ITokenizer
 		/// </summary>
 		[DataMember(Name ="tokenize_on_chars")]
 		IEnumerable<string> TokenizeOnCharacters { get; set; }
+
+		/// <summary>
+		/// The maximum token length. If a token is seen that exceeds this length then
+		/// it is split at <see cref="MaxTokenLength" /> intervals. Defaults to `255`.
+		/// <para />
+		/// Valid in Elasticsearch 7.9.0+
+		/// </summary>
+		[DataMember(Name = "max_token_length")]
+		[JsonFormatter(typeof(NullableStringIntFormatter))]
+		int? MaxTokenLength { get; set; }
 	}
 
 	/// <inheritdoc cref="ICharGroupTokenizer" />
@@ -31,6 +42,9 @@ public class CharGroupTokenizer : TokenizerBase, ICharGroupTokenizer
 
 		/// <inheritdoc />
 		public IEnumerable<string> TokenizeOnCharacters { get; set; }
+
+		/// <inheritdoc />
+		public int? MaxTokenLength { get; set; }
 	}
 
 	/// <inheritdoc cref="ICharGroupTokenizer" />
@@ -40,6 +54,7 @@ public class CharGroupTokenizerDescriptor
 		protected override string Type => CharGroupTokenizer.TokenizerType;
 
 		IEnumerable<string> ICharGroupTokenizer.TokenizeOnCharacters { get; set; }
+		int? ICharGroupTokenizer.MaxTokenLength { get; set; }
 
 		/// <inheritdoc cref="ICharGroupTokenizer.TokenizeOnCharacters" />
 		public CharGroupTokenizerDescriptor TokenizeOnCharacters(params string[] characters) =>
@@ -48,5 +63,9 @@ public CharGroupTokenizerDescriptor TokenizeOnCharacters(params string[] charact
 		/// <inheritdoc cref="ICharGroupTokenizer.TokenizeOnCharacters" />
 		public CharGroupTokenizerDescriptor TokenizeOnCharacters(IEnumerable<string> characters) =>
 			Assign(characters, (a, v) => a.TokenizeOnCharacters = v);
+
+		/// <inheritdoc cref="ICharGroupTokenizer.MaxTokenLength" />
+		public CharGroupTokenizerDescriptor MaxTokenLength(int? maxTokenLength) =>
+			Assign(maxTokenLength, (a, v) => a.MaxTokenLength = v);
 	}
 }
diff --git a/tests/Tests/Analysis/Tokenizers/TokenizerTests.cs b/tests/Tests/Analysis/Tokenizers/TokenizerTests.cs
index e120630dd26..f02b372c0ce 100644
--- a/tests/Tests/Analysis/Tokenizers/TokenizerTests.cs
+++ b/tests/Tests/Analysis/Tokenizers/TokenizerTests.cs
@@ -337,6 +337,32 @@ public class CharGroupTests : TokenizerAssertionBase<CharGroupTests>
 			public override string Name => "char_group";
 		}
 
+		[SkipVersion("<7.9.0", "max_token_length introduced in 7.9.0")]
+		public class CharGroupMaxTokenLengthTests : TokenizerAssertionBase<CharGroupMaxTokenLengthTests>
+		{
+			private readonly string[] _chars = { "whitespace", "-", "\n" };
+
+			public override FuncTokenizer Fluent => (n, t) => t.CharGroup(n, e => e
+				.TokenizeOnCharacters(_chars)
+				.MaxTokenLength(255)
+			);
+
+			public override ITokenizer Initializer => new CharGroupTokenizer
+			{
+				TokenizeOnCharacters = _chars,
+				MaxTokenLength = 255
+			};
+
+			public override object Json => new
+			{
+				tokenize_on_chars = _chars,
+				type = "char_group",
+				max_token_length = 255
+			};
+
+			public override string Name => "char_group_max_token_length";
+		}
+
 		[SkipVersion("<7.7.0", "discard_punctuation introduced in 7.7.0")]
 		public class DiscardPunctuationTests : TokenizerAssertionBase<DiscardPunctuationTests>
 		{
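For context, the new property serializes to the char_group tokenizer's max_token_length setting. Below is a minimal usage sketch of the fluent API this diff adds, assuming a NEST 7.9+ IElasticClient instance named client; the index name "my-index" and tokenizer name "my_char_group" are hypothetical:

using Nest;

// Create an index whose analysis settings define a char_group tokenizer
// that splits on whitespace, hyphens and newlines, and additionally
// splits any token longer than 255 characters at 255-character intervals.
var createIndexResponse = client.Indices.Create("my-index", c => c
	.Settings(s => s
		.Analysis(a => a
			.Tokenizers(t => t
				.CharGroup("my_char_group", cg => cg
					.TokenizeOnCharacters("whitespace", "-", "\n")
					.MaxTokenLength(255) // the option added in this diff
				)
			)
		)
	)
);

The object-initializer equivalent is new CharGroupTokenizer { TokenizeOnCharacters = ..., MaxTokenLength = 255 }, mirroring the Initializer form exercised by CharGroupMaxTokenLengthTests above.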