-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* add analysis-nori plugin to writable cluster and added nori_tokenizer * add nori_part_of_speech token filter * add nori analyzer * add Nori() to AnalyzeTokenDescriptor, CodeStandard tests caught this
- Loading branch information
Showing
13 changed files
with
248 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
using System.Collections.Generic; | ||
using Newtonsoft.Json; | ||
|
||
namespace Nest | ||
{ | ||
/// <summary> | ||
/// The nori analyzer consists of the following tokenizer and token filters: | ||
/// <para> - nori_tokenizer</para> | ||
/// <para> - nori_part_of_speech token filter</para> | ||
/// <para> - nori_readingform token filter</para> | ||
/// <para> - lowercase token filter</para> | ||
/// </summary> | ||
/// <remarks> | ||
/// Each setting below reuses the documentation of the matching member on | ||
/// <see cref="INoriTokenizer"/> or <see cref="INoriPartOfSpeechTokenFilter"/>. | ||
/// </remarks> | ||
public interface INoriAnalyzer : IAnalyzer | ||
{ | ||
/// <inheritdoc cref="INoriTokenizer.DecompoundMode"/> | ||
/// <remarks> Serialized as <c>decompound_mode</c>. </remarks> | ||
[JsonProperty("decompound_mode")] | ||
NoriDecompoundMode? DecompoundMode { get; set; } | ||
|
||
/// <inheritdoc cref="INoriTokenizer.UserDictionary"/> | ||
/// <remarks> Serialized as <c>user_dictionary</c>. </remarks> | ||
[JsonProperty("user_dictionary")] | ||
string UserDictionary { get; set; } | ||
|
||
/// <inheritdoc cref="INoriPartOfSpeechTokenFilter.StopTags"/> | ||
/// <remarks> Serialized as <c>stoptags</c>. </remarks> | ||
[JsonProperty("stoptags")] | ||
IEnumerable<string> StopTags { get; set; } | ||
} | ||
|
||
/// <inheritdoc cref="INoriAnalyzer"/> | ||
public class NoriAnalyzer : AnalyzerBase, INoriAnalyzer | ||
{ | ||
/// <summary> | ||
/// Creates a nori analyzer, passing <c>"nori"</c> to the <see cref="AnalyzerBase"/> constructor | ||
/// (the same value <see cref="NoriAnalyzerDescriptor"/> reports as its type). | ||
/// </summary> | ||
public NoriAnalyzer() : base("nori") {} | ||
|
||
/// <inheritdoc cref="INoriTokenizer.DecompoundMode"/> | ||
public NoriDecompoundMode? DecompoundMode { get; set; } | ||
|
||
/// <inheritdoc cref="INoriTokenizer.UserDictionary"/> | ||
public string UserDictionary { get; set; } | ||
|
||
/// <inheritdoc cref="INoriPartOfSpeechTokenFilter.StopTags" /> | ||
public IEnumerable<string> StopTags { get; set; } | ||
} | ||
|
||
/// <inheritdoc cref="INoriAnalyzer"/> | ||
/// <remarks> Fluent counterpart of <see cref="NoriAnalyzer"/>: each setter method returns this descriptor type. </remarks> | ||
public class NoriAnalyzerDescriptor : AnalyzerDescriptorBase<NoriAnalyzerDescriptor, INoriAnalyzer>, INoriAnalyzer | ||
{ | ||
/// <summary> The analyzer type name, <c>"nori"</c>. </summary> | ||
protected override string Type => "nori"; | ||
|
||
// Implemented explicitly so the property names stay free for the fluent methods of the same name below. | ||
NoriDecompoundMode? INoriAnalyzer.DecompoundMode { get; set; } | ||
string INoriAnalyzer.UserDictionary { get; set; } | ||
IEnumerable<string> INoriAnalyzer.StopTags { get; set; } | ||
|
||
// NOTE(review): Assign is declared on the base descriptor (not visible here); presumably it applies | ||
// the lambda to this instance's INoriAnalyzer view and returns the descriptor - confirm against the base class. | ||
/// <inheritdoc cref="INoriTokenizer.DecompoundMode"/> | ||
public NoriAnalyzerDescriptor DecompoundMode(NoriDecompoundMode? mode) => Assign(a => a.DecompoundMode = mode); | ||
|
||
/// <inheritdoc cref="INoriTokenizer.UserDictionary"/> | ||
public NoriAnalyzerDescriptor UserDictionary(string path) => Assign(a => a.UserDictionary = path); | ||
|
||
/// <inheritdoc cref="INoriPartOfSpeechTokenFilter.StopTags" /> | ||
public NoriAnalyzerDescriptor StopTags(IEnumerable<string> stopTags) => Assign(a => a.StopTags = stopTags); | ||
|
||
// Convenience overload: accepts the tags as inline arguments or as an array. | ||
/// <inheritdoc cref="INoriPartOfSpeechTokenFilter.StopTags" /> | ||
public NoriAnalyzerDescriptor StopTags(params string[] stopTags) => Assign(a => a.StopTags = stopTags); | ||
|
||
} | ||
} |
38 changes: 38 additions & 0 deletions
38
src/Nest/Analysis/TokenFilters/NoriPartOfSpeechTokenFilter.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
using System.Collections.Generic; | ||
using Newtonsoft.Json; | ||
|
||
namespace Nest | ||
{ | ||
/// <summary> The nori_part_of_speech token filter removes tokens that match a set of part-of-speech tags. </summary> | ||
public interface INoriPartOfSpeechTokenFilter : ITokenFilter | ||
{ | ||
/// <summary> An array of part-of-speech tags that should be removed. </summary> | ||
/// <remarks> Serialized as <c>stoptags</c>. </remarks> | ||
[JsonProperty("stoptags")] | ||
IEnumerable<string> StopTags { get; set; } | ||
} | ||
|
||
/// <inheritdoc cref="INoriPartOfSpeechTokenFilter"/> | ||
public class NoriPartOfSpeechTokenFilter : TokenFilterBase, INoriPartOfSpeechTokenFilter | ||
{ | ||
/// <summary> | ||
/// Creates the token filter, passing <c>"nori_part_of_speech"</c> to the <see cref="TokenFilterBase"/> constructor | ||
/// (the same value <see cref="NoriPartOfSpeechTokenFilterDescriptor"/> reports as its type). | ||
/// </summary> | ||
public NoriPartOfSpeechTokenFilter() : base("nori_part_of_speech") { } | ||
|
||
/// <inheritdoc cref="INoriPartOfSpeechTokenFilter.StopTags" /> | ||
public IEnumerable<string> StopTags { get; set; } | ||
} | ||
/// <inheritdoc cref="INoriPartOfSpeechTokenFilter"/> | ||
/// <remarks> Fluent counterpart of <see cref="NoriPartOfSpeechTokenFilter"/>: each setter method returns this descriptor type. </remarks> | ||
public class NoriPartOfSpeechTokenFilterDescriptor | ||
: TokenFilterDescriptorBase<NoriPartOfSpeechTokenFilterDescriptor, INoriPartOfSpeechTokenFilter>, INoriPartOfSpeechTokenFilter | ||
{ | ||
/// <summary> The token filter type name, <c>"nori_part_of_speech"</c>. </summary> | ||
protected override string Type => "nori_part_of_speech"; | ||
|
||
// Implemented explicitly so the name StopTags stays free for the fluent methods below. | ||
IEnumerable<string> INoriPartOfSpeechTokenFilter.StopTags { get; set; } | ||
|
||
// NOTE(review): Assign is declared on the base descriptor (not visible here); presumably it applies | ||
// the lambda to this instance's interface view and returns the descriptor - confirm against the base class. | ||
/// <inheritdoc cref="INoriPartOfSpeechTokenFilter.StopTags" /> | ||
public NoriPartOfSpeechTokenFilterDescriptor StopTags(IEnumerable<string> stopTags) => Assign(a => a.StopTags = stopTags); | ||
|
||
// Convenience overload: accepts the tags as inline arguments or as an array. | ||
/// <inheritdoc cref="INoriPartOfSpeechTokenFilter.StopTags" /> | ||
public NoriPartOfSpeechTokenFilterDescriptor StopTags(params string[] stopTags) => Assign(a => a.StopTags = stopTags); | ||
|
||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
using System.Runtime.Serialization; | ||
using Newtonsoft.Json; | ||
using Newtonsoft.Json.Converters; | ||
|
||
namespace Nest | ||
{ | ||
/// <summary> The decompound mode determines how the tokenizer handles compound tokens. </summary> | ||
/// <remarks> | ||
/// Converted with <see cref="StringEnumConverter"/>; each member carries an <see cref="EnumMemberAttribute"/> | ||
/// giving its lowercase string value. | ||
/// </remarks> | ||
[JsonConverter(typeof(StringEnumConverter))] | ||
public enum NoriDecompoundMode | ||
{ | ||
/// <summary> Decomposes compounds and discards the original form (default). </summary> | ||
[EnumMember(Value="discard")] | ||
Discard, | ||
/// <summary> No decomposition for compounds. </summary> | ||
[EnumMember(Value="none")] | ||
None, | ||
/// <summary> Decomposes compounds and keeps the original form. </summary> | ||
[EnumMember(Value="mixed")] | ||
Mixed | ||
} | ||
|
||
/// <summary> Tokenizer that ships with the analysis-nori plugin. </summary> | ||
public interface INoriTokenizer : ITokenizer | ||
{ | ||
/// <summary> | ||
/// Determines how the tokenizer handles compound tokens. | ||
/// See <see cref="NoriDecompoundMode"/> for the available modes; <see cref="NoriDecompoundMode.Discard"/> is documented as the default. | ||
/// </summary> | ||
/// <remarks> Serialized as <c>decompound_mode</c>. </remarks> | ||
[JsonProperty("decompound_mode")] | ||
NoriDecompoundMode? DecompoundMode { get; set; } | ||
|
||
/// <summary> | ||
/// The Nori tokenizer uses the mecab-ko-dic dictionary by default. A user_dictionary with custom nouns (NNG) may be appended to | ||
/// the default dictionary. This property allows you to specify the path of that file on disk. | ||
/// </summary> | ||
/// <remarks> Serialized as <c>user_dictionary</c>. </remarks> | ||
[JsonProperty("user_dictionary")] | ||
string UserDictionary { get; set; } | ||
} | ||
|
||
/// <inheritdoc cref="INoriTokenizer"/> | ||
public class NoriTokenizer : TokenizerBase, INoriTokenizer | ||
{ | ||
/// <summary> Creates the tokenizer and sets its <c>Type</c> to <c>"nori_tokenizer"</c>. </summary> | ||
public NoriTokenizer() => this.Type = "nori_tokenizer"; | ||
/// <inheritdoc cref="INoriTokenizer.DecompoundMode"/> | ||
public NoriDecompoundMode? DecompoundMode { get; set; } | ||
/// <inheritdoc cref="INoriTokenizer.UserDictionary"/> | ||
public string UserDictionary { get; set; } | ||
} | ||
/// <inheritdoc cref="INoriTokenizer"/> | ||
/// <remarks> Fluent counterpart of <see cref="NoriTokenizer"/>: each setter method returns this descriptor type. </remarks> | ||
public class NoriTokenizerDescriptor | ||
: TokenizerDescriptorBase<NoriTokenizerDescriptor, INoriTokenizer>, INoriTokenizer | ||
{ | ||
/// <summary> The tokenizer type name, <c>"nori_tokenizer"</c>. </summary> | ||
protected override string Type => "nori_tokenizer"; | ||
|
||
// Implemented explicitly so the property names stay free for the fluent methods of the same name below. | ||
NoriDecompoundMode? INoriTokenizer.DecompoundMode { get; set; } | ||
string INoriTokenizer.UserDictionary { get; set; } | ||
|
||
// NOTE(review): Assign is declared on the base descriptor (not visible here); presumably it applies | ||
// the lambda to this instance's INoriTokenizer view and returns the descriptor - confirm against the base class. | ||
/// <inheritdoc cref="INoriTokenizer.DecompoundMode"/> | ||
public NoriTokenizerDescriptor DecompoundMode(NoriDecompoundMode? mode) => Assign(a => a.DecompoundMode = mode); | ||
|
||
/// <inheritdoc cref="INoriTokenizer.UserDictionary"/> | ||
public NoriTokenizerDescriptor UserDictionary(string path) => Assign(a => a.UserDictionary = path); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters