Skip to content

Commit

Permalink
Add discard_punctuation to nori tokenizer (#4591) (#4618)
Browse files Browse the repository at this point in the history
Add discard_punctuation to nori tokenizer

Co-authored-by: Stuart Cam <stuart.cam@elastic.co>
  • Loading branch information
github-actions[bot] and codebrain committed Apr 17, 2020
1 parent 2e7c9bc commit 5b074c3
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 0 deletions.
1 change: 1 addition & 0 deletions src/Nest/Analysis/Analyzers/NoriAnalyzer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ namespace Nest
/// <para> - nori_tokenizer</para>
/// <para> - nori_part_of_speech token filter</para>
/// <para> - nori_readingform token filter</para>
/// <para> - nori_number token filter</para>
/// <para> - lowercase token filter</para>
/// </summary>
public interface INoriAnalyzer : IAnalyzer
Expand Down
13 changes: 13 additions & 0 deletions src/Nest/Analysis/Tokenizers/NoriTokenizer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,12 @@ public interface INoriTokenizer : ITokenizer
[DataMember(Name = "decompound_mode")]
NoriDecompoundMode? DecompoundMode { get; set; }

/// <summary>
/// Whether punctuation should be discarded from the output. Defaults to `true`.
/// </summary>
[DataMember(Name = "discard_punctuation")]
bool? DiscardPunctuation { get; set; }

/// <summary>
/// The Nori tokenizer uses the mecab-ko-dic dictionary by default. A user_dictionary with custom nouns (NNG) may be
/// appended to
Expand Down Expand Up @@ -57,6 +63,9 @@ public class NoriTokenizer : TokenizerBase, INoriTokenizer
/// <inheritdoc cref="INoriTokenizer.DecompoundMode" />
public NoriDecompoundMode? DecompoundMode { get; set; }

/// <inheritdoc cref="INoriTokenizer.DiscardPunctuation" />
public bool? DiscardPunctuation { get; set; }

/// <inheritdoc cref="INoriTokenizer.UserDictionary" />
public string UserDictionary { get; set; }

Expand All @@ -73,6 +82,7 @@ public class NoriTokenizerDescriptor
NoriDecompoundMode? INoriTokenizer.DecompoundMode { get; set; }
string INoriTokenizer.UserDictionary { get; set; }
IEnumerable<string> INoriTokenizer.UserDictionaryRules { get; set; }
bool? INoriTokenizer.DiscardPunctuation { get; set; }

/// <inheritdoc cref="INoriTokenizer.DecompoundMode" />
/// <param name="mode">The value stored in <see cref="INoriTokenizer.DecompoundMode" />.</param>
public NoriTokenizerDescriptor DecompoundMode(NoriDecompoundMode? mode)
{
    return Assign(mode, (tokenizer, value) => tokenizer.DecompoundMode = value);
}
Expand All @@ -85,5 +95,8 @@ public class NoriTokenizerDescriptor

/// <inheritdoc cref="INoriTokenizer.UserDictionaryRules" />
/// <param name="rules">The value stored in <see cref="INoriTokenizer.UserDictionaryRules" />.</param>
public NoriTokenizerDescriptor UserDictionaryRules(IEnumerable<string> rules)
{
    return Assign(rules, (tokenizer, value) => tokenizer.UserDictionaryRules = value);
}

/// <inheritdoc cref="INoriTokenizer.DiscardPunctuation" />
/// <param name="discard">
/// The value stored in <see cref="INoriTokenizer.DiscardPunctuation" />; <c>true</c> when the argument is omitted.
/// </param>
public NoriTokenizerDescriptor DiscardPunctuation(bool? discard = true)
{
    return Assign(discard, (tokenizer, value) => tokenizer.DiscardPunctuation = value);
}
}
}
16 changes: 16 additions & 0 deletions tests/Tests/Analysis/Tokenizers/TokenizerTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -332,5 +332,21 @@ public class CharGroupTests : TokenizerAssertionBase<CharGroupTests>

public override string Name => "char_group";
}

/// <summary>
/// Tokenizer assertion for a nori tokenizer named "nori" with <c>discard_punctuation</c> set to <c>true</c>.
/// Supplies the fluent form, the object initializer form and the JSON they are expected to correspond to.
/// Skipped for versions matching "&lt;7.7.0", as the option was introduced in 7.7.0.
/// </summary>
[SkipVersion("<7.7.0", "discard_punctuation introduced in 7.7.0")]
public class DiscardPunctuationTests : TokenizerAssertionBase<DiscardPunctuationTests>
{
    /// <summary>Name under which the tokenizer is registered.</summary>
    public override string Name
    {
        get { return "nori"; }
    }

    /// <summary>Expected JSON shape of the tokenizer definition.</summary>
    public override object Json
    {
        get { return new { type = "nori_tokenizer", discard_punctuation = true }; }
    }

    /// <summary>Object initializer form with <c>DiscardPunctuation</c> set explicitly.</summary>
    public override ITokenizer Initializer
    {
        get
        {
            return new NoriTokenizer
            {
                DiscardPunctuation = true
            };
        }
    }

    /// <summary>Fluent form; calls <c>DiscardPunctuation()</c> without an argument, relying on its default.</summary>
    public override FuncTokenizer Fluent
    {
        get
        {
            return (name, tokenizers) => tokenizers.Nori(name, nori => nori.DiscardPunctuation());
        }
    }
}
}
}

0 comments on commit 5b074c3

Please sign in to comment.