diff --git a/src/Nest/Analysis/Tokenizers/SimplePatternSplitTokenizer.cs b/src/Nest/Analysis/Tokenizers/SimplePatternSplitTokenizer.cs
new file mode 100644
index 00000000000..3e4117ea0de
--- /dev/null
+++ b/src/Nest/Analysis/Tokenizers/SimplePatternSplitTokenizer.cs
@@ -0,0 +1,41 @@
+// Licensed to Elasticsearch B.V under one or more agreements.
+// Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
+// See the LICENSE file in the project root for more information
+
+using System.Runtime.Serialization;
+
+namespace Nest
+{
+	/// <summary>
+	/// The simple_pattern_split tokenizer uses a regular expression to split the input into terms at pattern matches.
+	/// </summary>
+	public interface ISimplePatternSplitTokenizer : ITokenizer
+	{
+		/// <summary>
+		/// Lucene regular expression, defaults to the empty string.
+		/// </summary>
+		[DataMember(Name = "pattern")]
+		string Pattern { get; set; }
+	}
+
+	/// <inheritdoc cref="ISimplePatternSplitTokenizer" />
+	public class SimplePatternSplitTokenizer : TokenizerBase, ISimplePatternSplitTokenizer
+	{
+		public SimplePatternSplitTokenizer() => Type = "simple_pattern_split";
+
+		/// <inheritdoc />
+		public string Pattern { get; set; }
+	}
+
+	/// <inheritdoc cref="ISimplePatternSplitTokenizer" />
+	public class SimplePatternSplitTokenizerDescriptor
+		: TokenizerDescriptorBase<SimplePatternSplitTokenizerDescriptor, ISimplePatternSplitTokenizer>, ISimplePatternSplitTokenizer
+	{
+		protected override string Type => "simple_pattern_split";
+
+		string ISimplePatternSplitTokenizer.Pattern { get; set; }
+
+		/// <inheritdoc cref="ISimplePatternSplitTokenizer.Pattern" />
+		public SimplePatternSplitTokenizerDescriptor Pattern(string pattern) => Assign(pattern, (a, v) => a.Pattern = v);
+	}
+}
diff --git a/src/Nest/Analysis/Tokenizers/SimplePatternTokenizer.cs b/src/Nest/Analysis/Tokenizers/SimplePatternTokenizer.cs
new file mode 100644
index 00000000000..4062d37d46f
--- /dev/null
+++ b/src/Nest/Analysis/Tokenizers/SimplePatternTokenizer.cs
@@ -0,0 +1,41 @@
+// Licensed to Elasticsearch B.V under one or more agreements.
+// Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
+// See the LICENSE file in the project root for more information
+
+using System.Runtime.Serialization;
+
+namespace Nest
+{
+	/// <summary>
+	/// The simple_pattern tokenizer uses a regular expression to capture matching text as terms.
+	/// </summary>
+	public interface ISimplePatternTokenizer : ITokenizer
+	{
+		/// <summary>
+		/// Lucene regular expression, defaults to the empty string.
+		/// </summary>
+		[DataMember(Name = "pattern")]
+		string Pattern { get; set; }
+	}
+
+	/// <inheritdoc cref="ISimplePatternTokenizer" />
+	public class SimplePatternTokenizer : TokenizerBase, ISimplePatternTokenizer
+	{
+		public SimplePatternTokenizer() => Type = "simple_pattern";
+
+		/// <inheritdoc />
+		public string Pattern { get; set; }
+	}
+
+	/// <inheritdoc cref="ISimplePatternTokenizer" />
+	public class SimplePatternTokenizerDescriptor
+		: TokenizerDescriptorBase<SimplePatternTokenizerDescriptor, ISimplePatternTokenizer>, ISimplePatternTokenizer
+	{
+		protected override string Type => "simple_pattern";
+
+		string ISimplePatternTokenizer.Pattern { get; set; }
+
+		/// <inheritdoc cref="ISimplePatternTokenizer.Pattern" />
+		public SimplePatternTokenizerDescriptor Pattern(string pattern) => Assign(pattern, (a, v) => a.Pattern = v);
+	}
+}
diff --git a/src/Nest/Analysis/Tokenizers/Tokenizers.cs b/src/Nest/Analysis/Tokenizers/Tokenizers.cs
index 01272a806a7..9f97362afdb 100644
--- a/src/Nest/Analysis/Tokenizers/Tokenizers.cs
+++ b/src/Nest/Analysis/Tokenizers/Tokenizers.cs
@@ -132,5 +132,13 @@ public TokenizersDescriptor Nori(string name, Func<NoriTokenizerDescriptor, INoriTokenizer> selector) =>
 		/// <inheritdoc cref="ICharGroupTokenizer" />
 		public TokenizersDescriptor CharGroup(string name, Func<CharGroupTokenizerDescriptor, ICharGroupTokenizer> selector) =>
 			Assign(name, selector?.Invoke(new CharGroupTokenizerDescriptor()));
+
+		/// <inheritdoc cref="ISimplePatternTokenizer" />
+		public TokenizersDescriptor SimplePattern(string name, Func<SimplePatternTokenizerDescriptor, ISimplePatternTokenizer> selector) =>
+			Assign(name, selector?.Invoke(new SimplePatternTokenizerDescriptor()));
+
+		/// <inheritdoc cref="ISimplePatternSplitTokenizer" />
+		public TokenizersDescriptor SimplePatternSplit(string name, Func<SimplePatternSplitTokenizerDescriptor, ISimplePatternSplitTokenizer> selector) =>
+			Assign(name, selector?.Invoke(new SimplePatternSplitTokenizerDescriptor()));
 	}
 }
diff --git a/src/Nest/Indices/Analyze/AnalyzeTokenizersDescriptor.cs b/src/Nest/Indices/Analyze/AnalyzeTokenizersDescriptor.cs
index 79a967f9742..99fa36cb8d3 100644
--- a/src/Nest/Indices/Analyze/AnalyzeTokenizersDescriptor.cs
+++ b/src/Nest/Indices/Analyze/AnalyzeTokenizersDescriptor.cs
@@ -104,8 +104,15 @@ public ITokenizer Nori(Func<NoriTokenizerDescriptor, INoriTokenizer> selector) =>
 			selector.Invoke(new NoriTokenizerDescriptor());
 
 		/// <inheritdoc cref="ICharGroupTokenizer" />
-		/// </summary>
 		public ITokenizer CharGroup(Func<CharGroupTokenizerDescriptor, ICharGroupTokenizer> selector) =>
 			selector?.Invoke(new CharGroupTokenizerDescriptor());
+
+		/// <inheritdoc cref="ISimplePatternTokenizer" />
+		public ITokenizer SimplePattern(Func<SimplePatternTokenizerDescriptor, ISimplePatternTokenizer> selector) =>
+			selector?.Invoke(new SimplePatternTokenizerDescriptor());
+
+		/// <inheritdoc cref="ISimplePatternSplitTokenizer" />
+		public ITokenizer SimplePatternSplit(Func<SimplePatternSplitTokenizerDescriptor, ISimplePatternSplitTokenizer> selector) =>
+			selector?.Invoke(new SimplePatternSplitTokenizerDescriptor());
 	}
 }
diff --git a/tests/Tests/Analysis/Tokenizers/TokenizerTests.cs b/tests/Tests/Analysis/Tokenizers/TokenizerTests.cs
index 6a46ddeef2d..4742b4d609f 100644
--- a/tests/Tests/Analysis/Tokenizers/TokenizerTests.cs
+++ b/tests/Tests/Analysis/Tokenizers/TokenizerTests.cs
@@ -22,18 +22,10 @@ public class EdgeNGramTests : TokenizerAssertionBase<EdgeNGramTests>
 		public override ITokenizer Initializer => new EdgeNGramTokenizer
 		{
-			MaxGram = 2,
-			MinGram = 1,
-			TokenChars = new[] { TokenChar.Digit, TokenChar.Letter }
+			MaxGram = 2, MinGram = 1, TokenChars = new[] { TokenChar.Digit, TokenChar.Letter }
 		};
 
-		public override object Json => new
-		{
-			min_gram = 1,
-			max_gram = 2,
-			token_chars = new[] { "digit", "letter" },
-			type = "edge_ngram"
-		};
+		public override object Json => new { min_gram = 1, max_gram = 2, token_chars = new[] { "digit", "letter" }, type = "edge_ngram" };
 
 		public override string Name => "endgen";
 	}
@@ -50,10 +42,7 @@ public class EdgeNGramCustomTokenCharsTests : TokenizerAssertionBase<EdgeNGramCustomTokenCharsTests>
 		public override ITokenizer Initializer => new EdgeNGramTokenizer
 		{
-			MaxGram = 2,
-			MinGram = 1,
-			TokenChars = new[] { TokenChar.Custom },
-			CustomTokenChars = "+-_"
+			MaxGram = 2, MinGram = 1, TokenChars = new[] { TokenChar.Custom }, CustomTokenChars = "+-_"
 		};
 
 		public override object Json => new
@@ -62,7 +51,7 @@ public class EdgeNGramCustomTokenCharsTests : TokenizerAssertionBase<EdgeNGramCustomTokenCharsTests>
 			max_gram = 2,
-			token_chars = new [] { "custom" },
+			token_chars = new[] { "custom" },
 			custom_token_chars = "+-_",
 			type = "edge_ngram"
 		};
 
 		public override string Name => "endgen_custom";
@@ -78,18 +67,10 @@ public class NGramTests : TokenizerAssertionBase<NGramTests>
 		public override ITokenizer Initializer => new NGramTokenizer
 		{
-			MaxGram = 2,
-			MinGram = 1,
-			TokenChars = new[] { TokenChar.Digit, TokenChar.Letter }
+			MaxGram = 2, MinGram = 1, TokenChars = new[] { TokenChar.Digit, TokenChar.Letter }
 		};
 
-		public override object Json => new
-		{
-			min_gram = 1,
-			max_gram = 2,
-			token_chars = new[] { "digit", "letter" },
-			type = "ngram"
-		};
+		public override object Json => new { min_gram = 1, max_gram = 2, token_chars = new[] { "digit", "letter" }, type = "ngram" };
 
 		public override string Name => "ng";
 	}
@@ -106,10 +87,7 @@ public class NGramCustomTokenCharsTests : TokenizerAssertionBase<NGramCustomTokenCharsTests>
 		public override ITokenizer Initializer => new NGramTokenizer
 		{
-			MaxGram = 2,
-			MinGram = 1,
-			TokenChars = new[] { TokenChar.Custom },
-			CustomTokenChars = "+-_"
+			MaxGram = 2, MinGram = 1, TokenChars = new[] { TokenChar.Custom }, CustomTokenChars = "+-_"
 		};
 
 		public override object Json => new
@@ -164,16 +142,9 @@ public class IcuTests : TokenizerAssertionBase<IcuTests>
 			.RuleFiles(RuleFiles)
 		);
 
-		public override ITokenizer Initializer => new IcuTokenizer
-		{
-			RuleFiles = RuleFiles,
-		};
+		public override ITokenizer Initializer => new IcuTokenizer { RuleFiles = RuleFiles, };
 
-		public override object Json => new
-		{
-			rule_files = RuleFiles,
-			type = "icu_tokenizer"
-		};
+		public override object Json => new { rule_files = RuleFiles, type = "icu_tokenizer" };
 
 		public override string Name => "icu";
 	}
@@ -198,7 +169,7 @@ public class KuromojiTests : TokenizerAssertionBase<KuromojiTests>
 			DiscardPunctuation = true,
 			NBestExamples = Example,
 			NBestCost = 1000,
-			UserDictionaryRules = new [] { Inline }
+			UserDictionaryRules = new[] { Inline }
 		};
 
 		public override object Json => new
@@ -208,7 +179,7 @@ public class KuromojiTests : TokenizerAssertionBase<KuromojiTests>
 			nbest_cost = 1000,
 			nbest_examples = Example,
 			type = "kuromoji_tokenizer",
-			user_dictionary_rules = new [] { Inline }
+			user_dictionary_rules = new[] { Inline }
 		};
 
 		public override string Name => "kuro";
@@ -228,18 +199,9 @@ public class KuromojiDiscardCompoundTokenTests : TokenizerAssertionBase<KuromojiDiscardCompoundTokenTests>
-		public override ITokenizer Initializer => new KuromojiTokenizer
-		{
-			Mode = KuromojiTokenizationMode.Search,
-			DiscardCompoundToken = true,
-		};
+		public override ITokenizer Initializer => new KuromojiTokenizer { Mode = KuromojiTokenizationMode.Search, DiscardCompoundToken = true, };
 
-		public override object Json => new
-		{
-			discard_compound_token = true,
-			mode = "search",
-			type = "kuromoji_tokenizer",
-		};
+		public override object Json => new { discard_compound_token = true, mode = "search", type = "kuromoji_tokenizer", };
 
 		public override string Name => "kuro_discard_compound_token";
 	}
@@ -252,11 +214,7 @@ public class UaxTests : TokenizerAssertionBase<UaxTests>
 		public override ITokenizer Initializer => new UaxEmailUrlTokenizer { MaxTokenLength = 12 };
 
-		public override object Json => new
-		{
-			max_token_length = 12,
-			type = "uax_url_email"
-		};
+		public override object Json => new { max_token_length = 12, type = "uax_url_email" };
 
 		public override string Name => "uax";
 	}
@@ -269,20 +227,9 @@ public class PatternTests : TokenizerAssertionBase<PatternTests>
 			.Pattern(@"\W+")
 		);
 
-		public override ITokenizer Initializer => new PatternTokenizer
-		{
-			Flags = "CASE_INSENSITIVE",
-			Group = 1,
-			Pattern = @"\W+"
-		};
+		public override ITokenizer Initializer => new PatternTokenizer { Flags = "CASE_INSENSITIVE", Group = 1, Pattern = @"\W+" };
 
-		public override object Json => new
-		{
-			pattern = @"\W+",
-			flags = "CASE_INSENSITIVE",
-			group = 1,
-			type = "pattern"
-		};
+		public override object Json => new { pattern = @"\W+", flags = "CASE_INSENSITIVE", group = 1, type = "pattern" };
"CASE_INSENSITIVE", group = 1, type = "pattern" }; public override string Name => "pat"; } @@ -312,10 +259,7 @@ public class NoriTests : TokenizerAssertionBase .DecompoundMode(NoriDecompoundMode.Mixed) ); - public override ITokenizer Initializer => new NoriTokenizer - { - DecompoundMode = NoriDecompoundMode.Mixed - }; + public override ITokenizer Initializer => new NoriTokenizer { DecompoundMode = NoriDecompoundMode.Mixed }; public override object Json => new { type = "nori_tokenizer", decompound_mode = "mixed" }; public override string Name => "nori"; @@ -331,16 +275,14 @@ public class NoriWithUserDictionaryTests : TokenizerAssertionBase new NoriTokenizer { - DecompoundMode = NoriDecompoundMode.Mixed, - UserDictionaryRules = new [] { "c++", "C샤프", "세종", "세종시 세종 시" } + DecompoundMode = NoriDecompoundMode.Mixed, UserDictionaryRules = new[] { "c++", "C샤프", "세종", "세종시 세종 시" } }; public override object Json => new { - type = "nori_tokenizer", - decompound_mode = "mixed", - user_dictionary_rules = new [] { "c++", "C샤프", "세종", "세종시 세종 시" } + type = "nori_tokenizer", decompound_mode = "mixed", user_dictionary_rules = new[] { "c++", "C샤프", "세종", "세종시 세종 시" } }; + public override string Name => "nori_userdictionary"; } @@ -353,16 +295,9 @@ public class CharGroupTests : TokenizerAssertionBase .TokenizeOnCharacters(_chars) ); - public override ITokenizer Initializer => new CharGroupTokenizer - { - TokenizeOnCharacters = _chars - }; + public override ITokenizer Initializer => new CharGroupTokenizer { TokenizeOnCharacters = _chars }; - public override object Json => new - { - tokenize_on_chars = _chars, - type = "char_group" - }; + public override object Json => new { tokenize_on_chars = _chars, type = "char_group" }; public override string Name => "char_group"; } @@ -377,18 +312,9 @@ public class CharGroupMaxTokenLengthTests : TokenizerAssertionBase new CharGroupTokenizer - { - TokenizeOnCharacters = _chars, - MaxTokenLength = 255 - }; + public override ITokenizer Initializer => new CharGroupTokenizer { TokenizeOnCharacters = _chars, MaxTokenLength = 255 }; - public override object Json => new - { - tokenize_on_chars = _chars, - type = "char_group", - max_token_length = 255 - }; + public override object Json => new { tokenize_on_chars = _chars, type = "char_group", max_token_length = 255 }; public override string Name => "char_group_max_token_length"; } @@ -400,13 +326,38 @@ public class DiscardPunctuationTests : TokenizerAssertionBase new NoriTokenizer - { - DiscardPunctuation = true - }; + public override ITokenizer Initializer => new NoriTokenizer { DiscardPunctuation = true }; public override object Json => new { type = "nori_tokenizer", discard_punctuation = true }; public override string Name => "nori-discard"; } + + [SkipVersion("<7.7.0", "simple_pattern experimental until 7.7.0")] + public class SimplePatternTests : TokenizerAssertionBase + { + public override FuncTokenizer Fluent => (n, t) => t.SimplePattern(n, e => e + .Pattern(@"\W+") + ); + + public override ITokenizer Initializer => new SimplePatternTokenizer { Pattern = @"\W+" }; + + public override object Json => new { pattern = @"\W+", type = "simple_pattern" }; + + public override string Name => "simple-pattern"; + } + + [SkipVersion("<7.7.0", "simple_pattern_split experimental until 7.7.0")] + public class SimplePatternSplitTests : TokenizerAssertionBase + { + public override FuncTokenizer Fluent => (n, t) => t.SimplePatternSplit(n, e => e + .Pattern(@"\W+") + ); + + public override ITokenizer Initializer => new 
+		public override ITokenizer Initializer => new SimplePatternSplitTokenizer { Pattern = @"\W+" };
+
+		public override object Json => new { pattern = @"\W+", type = "simple_pattern_split" };
+
+		public override string Name => "simple-pattern-split";
+	}
 	}
 }
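
For reviewers, a minimal usage sketch of the two new fluent methods added by this change. The client setup, index name, and tokenizer names are illustrative assumptions, not part of the diff; the sample inputs and outputs in the comments follow the Elasticsearch documentation examples for these tokenizers.

	using System;
	using System.Threading.Tasks;
	using Nest;

	public static class SimplePatternTokenizersSample
	{
		public static async Task Main()
		{
			// hypothetical local node; swap in your own connection settings
			var client = new ElasticClient(new ConnectionSettings(new Uri("http://localhost:9200")));

			var createIndexResponse = await client.Indices.CreateAsync("tokenizer-sample", c => c
				.Settings(s => s
					.Analysis(a => a
						.Tokenizers(t => t
							// simple_pattern emits the text that MATCHES the pattern as terms:
							// "fd-786-335-514-x" with [0-9]{3} -> [786, 335, 514]
							.SimplePattern("three_digit_groups", sp => sp
								.Pattern("[0-9]{3}")
							)
							// simple_pattern_split instead SPLITS the input at pattern matches:
							// "an_underscored_phrase" with _ -> [an, underscored, phrase]
							.SimplePatternSplit("split_on_underscore", sps => sps
								.Pattern("_")
							)
						)
					)
				)
			);

			Console.WriteLine(createIndexResponse.IsValid);
		}
	}

Note that both tokenizers take Lucene regular expressions, a restricted but generally faster subset of full Java regex syntax, and that the pattern defaults to the empty string, so a meaningful Pattern should always be supplied.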