From 5b074c30ea9af2b5c81e004c9a365becbd045ec4 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 17 Apr 2020 14:12:03 +1000 Subject: [PATCH] Add discard_punctuation to nori tokenizer (#4591) (#4618) Add discard_punctuation to nori tokenizer Co-authored-by: Stuart Cam --- src/Nest/Analysis/Analyzers/NoriAnalyzer.cs | 1 + src/Nest/Analysis/Tokenizers/NoriTokenizer.cs | 13 +++++++++++++ .../Tests/Analysis/Tokenizers/TokenizerTests.cs | 16 ++++++++++++++++ 3 files changed, 30 insertions(+) diff --git a/src/Nest/Analysis/Analyzers/NoriAnalyzer.cs b/src/Nest/Analysis/Analyzers/NoriAnalyzer.cs index 78a7043dbb2..723abe6fe2e 100644 --- a/src/Nest/Analysis/Analyzers/NoriAnalyzer.cs +++ b/src/Nest/Analysis/Analyzers/NoriAnalyzer.cs @@ -8,6 +8,7 @@ namespace Nest /// - nori_tokenizer /// - nori_part_of_speech token filter /// - nori_readingform token filter + /// - nori_number token filter /// - lowercase token filter /// public interface INoriAnalyzer : IAnalyzer diff --git a/src/Nest/Analysis/Tokenizers/NoriTokenizer.cs b/src/Nest/Analysis/Tokenizers/NoriTokenizer.cs index 287d7ed2a68..fdf45c26d67 100644 --- a/src/Nest/Analysis/Tokenizers/NoriTokenizer.cs +++ b/src/Nest/Analysis/Tokenizers/NoriTokenizer.cs @@ -30,6 +30,12 @@ public interface INoriTokenizer : ITokenizer [DataMember(Name = "decompound_mode")] NoriDecompoundMode? DecompoundMode { get; set; } + /// + /// Whether punctuation should be discarded from the output. Defaults to `true`. + /// + [DataMember(Name = "discard_punctuation")] + bool? DiscardPunctuation { get; set; } + /// /// The Nori tokenizer uses the mecab-ko-dic dictionary by default. A user_dictionary with custom nouns (NNG) may be /// appended to @@ -57,6 +63,9 @@ public class NoriTokenizer : TokenizerBase, INoriTokenizer /// public NoriDecompoundMode? DecompoundMode { get; set; } + /// + public bool? 
DiscardPunctuation { get; set; } + /// public string UserDictionary { get; set; } @@ -73,6 +82,7 @@ public class NoriTokenizerDescriptor NoriDecompoundMode? INoriTokenizer.DecompoundMode { get; set; } string INoriTokenizer.UserDictionary { get; set; } IEnumerable INoriTokenizer.UserDictionaryRules { get; set; } + bool? INoriTokenizer.DiscardPunctuation { get; set; } /// public NoriTokenizerDescriptor DecompoundMode(NoriDecompoundMode? mode) => Assign(mode, (a, v) => a.DecompoundMode = v); @@ -85,5 +95,8 @@ public class NoriTokenizerDescriptor /// public NoriTokenizerDescriptor UserDictionaryRules(IEnumerable rules) => Assign(rules, (a, v) => a.UserDictionaryRules = v); + + /// + public NoriTokenizerDescriptor DiscardPunctuation(bool? discard = true) => Assign(discard, (a, v) => a.DiscardPunctuation = v); } } diff --git a/tests/Tests/Analysis/Tokenizers/TokenizerTests.cs b/tests/Tests/Analysis/Tokenizers/TokenizerTests.cs index 7e9711a8c75..63fb2f4cc03 100644 --- a/tests/Tests/Analysis/Tokenizers/TokenizerTests.cs +++ b/tests/Tests/Analysis/Tokenizers/TokenizerTests.cs @@ -332,5 +332,21 @@ public class CharGroupTests : TokenizerAssertionBase public override string Name => "char_group"; } + + [SkipVersion("<7.7.0", "discard_punctuation introduced in 7.7.0")] + public class DiscardPunctuationTests : TokenizerAssertionBase + { + public override FuncTokenizer Fluent => (n, t) => t.Nori(n, e => e + .DiscardPunctuation() + ); + + public override ITokenizer Initializer => new NoriTokenizer + { + DiscardPunctuation = true + }; + + public override object Json => new { type = "nori_tokenizer", discard_punctuation = true }; + public override string Name => "nori"; + } } }