diff --git a/src/Nest/Analysis/Tokenizers/SimplePatternSplitTokenizer.cs b/src/Nest/Analysis/Tokenizers/SimplePatternSplitTokenizer.cs
new file mode 100644
index 00000000000..3e4117ea0de
--- /dev/null
+++ b/src/Nest/Analysis/Tokenizers/SimplePatternSplitTokenizer.cs
@@ -0,0 +1,41 @@
+// Licensed to Elasticsearch B.V under one or more agreements.
+// Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
+// See the LICENSE file in the project root for more information
+
+using System.Runtime.Serialization;
+
+namespace Nest
+{
+ /// <summary>
+ /// The simple_pattern_split tokenizer uses a regular expression to split the input into terms at pattern matches.
+ /// </summary>
+ public interface ISimplePatternSplitTokenizer : ITokenizer
+ {
+ /// <summary>
+ /// Lucene regular expression, defaults to the empty string.
+ /// </summary>
+ [DataMember(Name = "pattern")]
+ string Pattern { get; set; }
+ }
+
+ /// <inheritdoc />
+ public class SimplePatternSplitTokenizer : TokenizerBase, ISimplePatternSplitTokenizer
+ {
+ public SimplePatternSplitTokenizer() => Type = "simple_pattern_split";
+
+ /// <inheritdoc />
+ public string Pattern { get; set; }
+ }
+
+ /// <inheritdoc cref="ISimplePatternSplitTokenizer" />
+ public class SimplePatternSplitTokenizerDescriptor
+ : TokenizerDescriptorBase<SimplePatternSplitTokenizerDescriptor, ISimplePatternSplitTokenizer>, ISimplePatternSplitTokenizer
+ {
+ protected override string Type => "simple_pattern_split";
+
+ string ISimplePatternSplitTokenizer.Pattern { get; set; }
+
+ /// <inheritdoc cref="ISimplePatternSplitTokenizer.Pattern" />
+ public SimplePatternSplitTokenizerDescriptor Pattern(string pattern) => Assign(pattern, (a, v) => a.Pattern = v);
+ }
+}
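For context, a minimal usage sketch (not part of this diff) showing the new tokenizer registered through NEST's object initializer syntax; the client setup, index name, and tokenizer key are illustrative placeholders:

    using Nest;

    var client = new ElasticClient();

    // Register the new simple_pattern_split tokenizer under a placeholder key;
    // the pattern "-" splits the input into terms at every dash.
    var request = new CreateIndexRequest("my-index")
    {
        Settings = new IndexSettings
        {
            Analysis = new Analysis
            {
                Tokenizers = new Tokenizers
                {
                    { "split-on-dash", new SimplePatternSplitTokenizer { Pattern = "-" } }
                }
            }
        }
    };
    var createIndexResponse = client.Indices.Create(request);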
diff --git a/src/Nest/Analysis/Tokenizers/SimplePatternTokenizer.cs b/src/Nest/Analysis/Tokenizers/SimplePatternTokenizer.cs
new file mode 100644
index 00000000000..4062d37d46f
--- /dev/null
+++ b/src/Nest/Analysis/Tokenizers/SimplePatternTokenizer.cs
@@ -0,0 +1,41 @@
+// Licensed to Elasticsearch B.V under one or more agreements.
+// Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
+// See the LICENSE file in the project root for more information
+
+using System.Runtime.Serialization;
+
+namespace Nest
+{
+ /// <summary>
+ /// The simple_pattern tokenizer uses a regular expression to capture matching text as terms.
+ /// </summary>
+ public interface ISimplePatternTokenizer : ITokenizer
+ {
+ /// <summary>
+ /// Lucene regular expression, defaults to the empty string.
+ /// </summary>
+ [DataMember(Name = "pattern")]
+ string Pattern { get; set; }
+ }
+
+ /// <inheritdoc />
+ public class SimplePatternTokenizer : TokenizerBase, ISimplePatternTokenizer
+ {
+ public SimplePatternTokenizer() => Type = "simple_pattern";
+
+ /// <inheritdoc />
+ public string Pattern { get; set; }
+ }
+
+ /// <inheritdoc cref="ISimplePatternTokenizer" />
+ public class SimplePatternTokenizerDescriptor
+ : TokenizerDescriptorBase<SimplePatternTokenizerDescriptor, ISimplePatternTokenizer>, ISimplePatternTokenizer
+ {
+ protected override string Type => "simple_pattern";
+
+ string ISimplePatternTokenizer.Pattern { get; set; }
+
+ /// <inheritdoc cref="ISimplePatternTokenizer.Pattern" />
+ public SimplePatternTokenizerDescriptor Pattern(string pattern) => Assign(pattern, (a, v) => a.Pattern = v);
+ }
+}
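A quick semantic contrast between the two new tokenizers may help reviewers; the input and patterns below are illustrative, not taken from the diff:

    using Nest;

    // Against an input such as "one,two,three":
    //  - simple_pattern emits the text the pattern matches, so "[^,]+"
    //    produces the terms one, two, three;
    //  - simple_pattern_split treats matches as separators, so ","
    //    produces the same terms by splitting between them.
    var capture = new SimplePatternTokenizer { Pattern = "[^,]+" };
    var split = new SimplePatternSplitTokenizer { Pattern = "," };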
diff --git a/src/Nest/Analysis/Tokenizers/Tokenizers.cs b/src/Nest/Analysis/Tokenizers/Tokenizers.cs
index 01272a806a7..9f97362afdb 100644
--- a/src/Nest/Analysis/Tokenizers/Tokenizers.cs
+++ b/src/Nest/Analysis/Tokenizers/Tokenizers.cs
@@ -132,5 +132,13 @@ public TokenizersDescriptor Nori(string name, Func<NoriTokenizerDescriptor, INoriTokenizer> selector) =>
public TokenizersDescriptor CharGroup(string name, Func<CharGroupTokenizerDescriptor, ICharGroupTokenizer> selector) =>
Assign(name, selector?.Invoke(new CharGroupTokenizerDescriptor()));
+
+ /// <inheritdoc cref="ISimplePatternTokenizer" />
+ public TokenizersDescriptor SimplePattern(string name, Func<SimplePatternTokenizerDescriptor, ISimplePatternTokenizer> selector) =>
+ Assign(name, selector?.Invoke(new SimplePatternTokenizerDescriptor()));
+
+ /// <inheritdoc cref="ISimplePatternSplitTokenizer" />
+ public TokenizersDescriptor SimplePatternSplit(string name, Func<SimplePatternSplitTokenizerDescriptor, ISimplePatternSplitTokenizer> selector) =>
+ Assign(name, selector?.Invoke(new SimplePatternSplitTokenizerDescriptor()));
}
}
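A hedged sketch of the fluent registration these TokenizersDescriptor methods enable; the index, tokenizer, and analyzer names are placeholders:

    // Register both new tokenizers in index settings and wire one of them
    // into a custom analyzer.
    var createIndexResponse = client.Indices.Create("my-index", c => c
        .Settings(s => s
            .Analysis(a => a
                .Tokenizers(t => t
                    .SimplePattern("three-digits", sp => sp.Pattern("[0-9]{3}"))
                    .SimplePatternSplit("split-on-underscore", sp => sp.Pattern("_"))
                )
                .Analyzers(an => an
                    .Custom("three-digit-analyzer", ca => ca.Tokenizer("three-digits"))
                )
            )
        )
    );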
diff --git a/src/Nest/Indices/Analyze/AnalyzeTokenizersDescriptor.cs b/src/Nest/Indices/Analyze/AnalyzeTokenizersDescriptor.cs
index 79a967f9742..99fa36cb8d3 100644
--- a/src/Nest/Indices/Analyze/AnalyzeTokenizersDescriptor.cs
+++ b/src/Nest/Indices/Analyze/AnalyzeTokenizersDescriptor.cs
@@ -104,8 +104,15 @@ public ITokenizer Nori(Func<NoriTokenizerDescriptor, INoriTokenizer> selector) =>
selector.Invoke(new NoriTokenizerDescriptor());
/// <inheritdoc cref="ICharGroupTokenizer" />
- /// >
public ITokenizer CharGroup(Func<CharGroupTokenizerDescriptor, ICharGroupTokenizer> selector) =>
selector?.Invoke(new CharGroupTokenizerDescriptor());
+
+ /// <inheritdoc cref="ISimplePatternTokenizer" />
+ public ITokenizer SimplePattern(Func<SimplePatternTokenizerDescriptor, ISimplePatternTokenizer> selector) =>
+ selector?.Invoke(new SimplePatternTokenizerDescriptor());
+
+ /// <inheritdoc cref="ISimplePatternSplitTokenizer" />
+ public ITokenizer SimplePatternSplit(Func<SimplePatternSplitTokenizerDescriptor, ISimplePatternSplitTokenizer> selector) =>
+ selector?.Invoke(new SimplePatternSplitTokenizerDescriptor());
}
}
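And the ad-hoc form these AnalyzeTokenizersDescriptor additions enable on the Analyze API; the pattern and sample text are adapted from the Elasticsearch simple_pattern docs and are otherwise illustrative:

    // Analyze text with an inline simple_pattern tokenizer, no index
    // settings required.
    var analyzeResponse = client.Indices.Analyze(a => a
        .Tokenizer(t => t.SimplePattern(sp => sp.Pattern("[0-9]{3}")))
        .Text("fd-786-335-514-x")
    );
    // Expected terms for this input: 786, 335, 514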
diff --git a/tests/Tests/Analysis/Tokenizers/TokenizerTests.cs b/tests/Tests/Analysis/Tokenizers/TokenizerTests.cs
index 6a46ddeef2d..4742b4d609f 100644
--- a/tests/Tests/Analysis/Tokenizers/TokenizerTests.cs
+++ b/tests/Tests/Analysis/Tokenizers/TokenizerTests.cs
@@ -22,18 +22,10 @@ public class EdgeNGramTests : TokenizerAssertionBase<EdgeNGramTests>
public override ITokenizer Initializer => new EdgeNGramTokenizer
{
- MaxGram = 2,
- MinGram = 1,
- TokenChars = new[] { TokenChar.Digit, TokenChar.Letter }
+ MaxGram = 2, MinGram = 1, TokenChars = new[] { TokenChar.Digit, TokenChar.Letter }
};
- public override object Json => new
- {
- min_gram = 1,
- max_gram = 2,
- token_chars = new[] { "digit", "letter" },
- type = "edge_ngram"
- };
+ public override object Json => new { min_gram = 1, max_gram = 2, token_chars = new[] { "digit", "letter" }, type = "edge_ngram" };
public override string Name => "endgen";
}
@@ -50,10 +42,7 @@ public class EdgeNGramCustomTokenCharsTests : TokenizerAssertionBase<EdgeNGramCustomTokenCharsTests>
public override ITokenizer Initializer => new EdgeNGramTokenizer
{
- MaxGram = 2,
- MinGram = 1,
- TokenChars = new[] { TokenChar.Custom },
- CustomTokenChars = "+-_"
+ MaxGram = 2, MinGram = 1, TokenChars = new[] { TokenChar.Custom }, CustomTokenChars = "+-_"
};
public override object Json => new
@@ -62,7 +51,7 @@ public class EdgeNGramCustomTokenCharsTests : TokenizerAssertionBase<EdgeNGramCustomTokenCharsTests>
public override string Name => "endgen_custom";
@@ -78,18 +67,10 @@ public class NGramTests : TokenizerAssertionBase<NGramTests>
public override ITokenizer Initializer => new NGramTokenizer
{
- MaxGram = 2,
- MinGram = 1,
- TokenChars = new[] { TokenChar.Digit, TokenChar.Letter }
+ MaxGram = 2, MinGram = 1, TokenChars = new[] { TokenChar.Digit, TokenChar.Letter }
};
- public override object Json => new
- {
- min_gram = 1,
- max_gram = 2,
- token_chars = new[] { "digit", "letter" },
- type = "ngram"
- };
+ public override object Json => new { min_gram = 1, max_gram = 2, token_chars = new[] { "digit", "letter" }, type = "ngram" };
public override string Name => "ng";
}
@@ -106,10 +87,7 @@ public class NGramCustomTokenCharsTests : TokenizerAssertionBase<NGramCustomTokenCharsTests>
public override ITokenizer Initializer => new NGramTokenizer
{
- MaxGram = 2,
- MinGram = 1,
- TokenChars = new[] { TokenChar.Custom },
- CustomTokenChars = "+-_"
+ MaxGram = 2, MinGram = 1, TokenChars = new[] { TokenChar.Custom }, CustomTokenChars = "+-_"
};
public override object Json => new
@@ -164,16 +142,9 @@ public class IcuTests : TokenizerAssertionBase<IcuTests>
.RuleFiles(RuleFiles)
);
- public override ITokenizer Initializer => new IcuTokenizer
- {
- RuleFiles = RuleFiles,
- };
+ public override ITokenizer Initializer => new IcuTokenizer { RuleFiles = RuleFiles, };
- public override object Json => new
- {
- rule_files = RuleFiles,
- type = "icu_tokenizer"
- };
+ public override object Json => new { rule_files = RuleFiles, type = "icu_tokenizer" };
public override string Name => "icu";
}
@@ -198,7 +169,7 @@ public class KuromojiTests : TokenizerAssertionBase<KuromojiTests>
DiscardPunctuation = true,
NBestExamples = Example,
NBestCost = 1000,
- UserDictionaryRules = new [] { Inline }
+ UserDictionaryRules = new[] { Inline }
};
public override object Json => new
@@ -208,7 +179,7 @@ public class KuromojiTests : TokenizerAssertionBase<KuromojiTests>
nbest_cost = 1000,
nbest_examples = Example,
type = "kuromoji_tokenizer",
- user_dictionary_rules = new [] { Inline }
+ user_dictionary_rules = new[] { Inline }
};
public override string Name => "kuro";
@@ -228,18 +199,9 @@ public class KuromojiDiscardCompoundTokenTests : TokenizerAssertionBase<KuromojiDiscardCompoundTokenTests>
- public override ITokenizer Initializer => new KuromojiTokenizer
- {
- Mode = KuromojiTokenizationMode.Search,
- DiscardCompoundToken = true,
- };
+ public override ITokenizer Initializer => new KuromojiTokenizer { Mode = KuromojiTokenizationMode.Search, DiscardCompoundToken = true, };
- public override object Json => new
- {
- discard_compound_token = true,
- mode = "search",
- type = "kuromoji_tokenizer",
- };
+ public override object Json => new { discard_compound_token = true, mode = "search", type = "kuromoji_tokenizer", };
public override string Name => "kuro_discard_compound_token";
}
@@ -252,11 +214,7 @@ public class UaxTests : TokenizerAssertionBase<UaxTests>
public override ITokenizer Initializer => new UaxEmailUrlTokenizer { MaxTokenLength = 12 };
- public override object Json => new
- {
- max_token_length = 12,
- type = "uax_url_email"
- };
+ public override object Json => new { max_token_length = 12, type = "uax_url_email" };
public override string Name => "uax";
}
@@ -269,20 +227,9 @@ public class PatternTests : TokenizerAssertionBase<PatternTests>
.Pattern(@"\W+")
);
- public override ITokenizer Initializer => new PatternTokenizer
- {
- Flags = "CASE_INSENSITIVE",
- Group = 1,
- Pattern = @"\W+"
- };
+ public override ITokenizer Initializer => new PatternTokenizer { Flags = "CASE_INSENSITIVE", Group = 1, Pattern = @"\W+" };
- public override object Json => new
- {
- pattern = @"\W+",
- flags = "CASE_INSENSITIVE",
- group = 1,
- type = "pattern"
- };
+ public override object Json => new { pattern = @"\W+", flags = "CASE_INSENSITIVE", group = 1, type = "pattern" };
public override string Name => "pat";
}
@@ -312,10 +259,7 @@ public class NoriTests : TokenizerAssertionBase<NoriTests>
.DecompoundMode(NoriDecompoundMode.Mixed)
);
- public override ITokenizer Initializer => new NoriTokenizer
- {
- DecompoundMode = NoriDecompoundMode.Mixed
- };
+ public override ITokenizer Initializer => new NoriTokenizer { DecompoundMode = NoriDecompoundMode.Mixed };
public override object Json => new { type = "nori_tokenizer", decompound_mode = "mixed" };
public override string Name => "nori";
@@ -331,16 +275,14 @@ public class NoriWithUserDictionaryTests : TokenizerAssertionBase<NoriWithUserDictionaryTests>
public override ITokenizer Initializer => new NoriTokenizer
{
- DecompoundMode = NoriDecompoundMode.Mixed,
- UserDictionaryRules = new [] { "c++", "C샤프", "세종", "세종시 세종 시" }
+ DecompoundMode = NoriDecompoundMode.Mixed, UserDictionaryRules = new[] { "c++", "C샤프", "세종", "세종시 세종 시" }
};
public override object Json => new
{
- type = "nori_tokenizer",
- decompound_mode = "mixed",
- user_dictionary_rules = new [] { "c++", "C샤프", "세종", "세종시 세종 시" }
+ type = "nori_tokenizer", decompound_mode = "mixed", user_dictionary_rules = new[] { "c++", "C샤프", "세종", "세종시 세종 시" }
};
+
public override string Name => "nori_userdictionary";
}
@@ -353,16 +295,9 @@ public class CharGroupTests : TokenizerAssertionBase<CharGroupTests>
.TokenizeOnCharacters(_chars)
);
- public override ITokenizer Initializer => new CharGroupTokenizer
- {
- TokenizeOnCharacters = _chars
- };
+ public override ITokenizer Initializer => new CharGroupTokenizer { TokenizeOnCharacters = _chars };
- public override object Json => new
- {
- tokenize_on_chars = _chars,
- type = "char_group"
- };
+ public override object Json => new { tokenize_on_chars = _chars, type = "char_group" };
public override string Name => "char_group";
}
@@ -377,18 +312,9 @@ public class CharGroupMaxTokenLengthTests : TokenizerAssertionBase<CharGroupMaxTokenLengthTests>
- public override ITokenizer Initializer => new CharGroupTokenizer
- {
- TokenizeOnCharacters = _chars,
- MaxTokenLength = 255
- };
+ public override ITokenizer Initializer => new CharGroupTokenizer { TokenizeOnCharacters = _chars, MaxTokenLength = 255 };
- public override object Json => new
- {
- tokenize_on_chars = _chars,
- type = "char_group",
- max_token_length = 255
- };
+ public override object Json => new { tokenize_on_chars = _chars, type = "char_group", max_token_length = 255 };
public override string Name => "char_group_max_token_length";
}
@@ -400,13 +326,38 @@ public class DiscardPunctuationTests : TokenizerAssertionBase<DiscardPunctuationTests>
- public override ITokenizer Initializer => new NoriTokenizer
- {
- DiscardPunctuation = true
- };
+ public override ITokenizer Initializer => new NoriTokenizer { DiscardPunctuation = true };
public override object Json => new { type = "nori_tokenizer", discard_punctuation = true };
public override string Name => "nori-discard";
}
+
+ [SkipVersion("<7.7.0", "simple_pattern experimental until 7.7.0")]
+ public class SimplePatternTests : TokenizerAssertionBase<SimplePatternTests>
+ {
+ public override FuncTokenizer Fluent => (n, t) => t.SimplePattern(n, e => e
+ .Pattern(@"\W+")
+ );
+
+ public override ITokenizer Initializer => new SimplePatternTokenizer { Pattern = @"\W+" };
+
+ public override object Json => new { pattern = @"\W+", type = "simple_pattern" };
+
+ public override string Name => "simple-pattern";
+ }
+
+ [SkipVersion("<7.7.0", "simple_pattern_split experimental until 7.7.0")]
+ public class SimplePatternSplitTests : TokenizerAssertionBase<SimplePatternSplitTests>
+ {
+ public override FuncTokenizer Fluent => (n, t) => t.SimplePatternSplit(n, e => e
+ .Pattern(@"\W+")
+ );
+
+ public override ITokenizer Initializer => new SimplePatternSplitTokenizer { Pattern = @"\W+" };
+
+ public override object Json => new { pattern = @"\W+", type = "simple_pattern_split" };
+
+ public override string Name => "simple-pattern-split";
+ }
}
}