diff --git a/src/Nest/Analysis/Analysis.cs b/src/Nest/Analysis/Analysis.cs index dd2b1b4f833..93e73ad86b1 100644 --- a/src/Nest/Analysis/Analysis.cs +++ b/src/Nest/Analysis/Analysis.cs @@ -21,7 +21,6 @@ public class Analysis : IAnalysis public ICharFilters CharFilters { get; set; } public ITokenFilters TokenFilters { get; set; } public ITokenizers Tokenizers { get; set; } - } public class AnalysisDescriptor : DescriptorBase, IAnalysis @@ -44,4 +43,4 @@ public AnalysisDescriptor Tokenizers(Func a.Tokenizers = selector?.Invoke(new TokenizersDescriptor())?.Value); } -} \ No newline at end of file +} diff --git a/src/Nest/Analysis/Analyzers/AnalyzerJsonConverter.cs b/src/Nest/Analysis/Analyzers/AnalyzerJsonConverter.cs index 13e3f0819ee..215bc72b5b7 100644 --- a/src/Nest/Analysis/Analyzers/AnalyzerJsonConverter.cs +++ b/src/Nest/Analysis/Analyzers/AnalyzerJsonConverter.cs @@ -28,6 +28,7 @@ public override object ReadJson(JsonReader reader, Type objectType, object exist case "whitespace": return o.ToObject(ElasticContractResolver.Empty); case "simple": return o.ToObject(ElasticContractResolver.Empty); case "fingerprint": return o.ToObject(ElasticContractResolver.Empty); + case "kuromoji": return o.ToObject(ElasticContractResolver.Empty); default: if (o.Property("tokenizer") != null) return o.ToObject(ElasticContractResolver.Empty); diff --git a/src/Nest/Analysis/Analyzers/Analyzers.cs b/src/Nest/Analysis/Analyzers/Analyzers.cs index dfb3a9a5798..990d9882417 100644 --- a/src/Nest/Analysis/Analyzers/Analyzers.cs +++ b/src/Nest/Analysis/Analyzers/Analyzers.cs @@ -91,5 +91,11 @@ public AnalyzersDescriptor Whitespace(string name, Func selector = null) => Assign(name, selector.InvokeOrDefault(new FingerprintAnalyzerDescriptor())); + /// + /// An analyzer tailored for japanese that is bootstrapped with defaults. 
+ /// Part of the `analysis-kuromoji` plugin: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-kuromoji.html + /// + public AnalyzersDescriptor Kuromoji(string name, Func selector = null) => + Assign(name, selector.InvokeOrDefault(new KuromojiAnalyzerDescriptor())); } } diff --git a/src/Nest/Analysis/CharFilters/CharFilterJsonConverter.cs b/src/Nest/Analysis/CharFilters/CharFilterJsonConverter.cs index ec159a5995a..5d225d60f4f 100644 --- a/src/Nest/Analysis/CharFilters/CharFilterJsonConverter.cs +++ b/src/Nest/Analysis/CharFilters/CharFilterJsonConverter.cs @@ -23,6 +23,7 @@ public override object ReadJson(JsonReader reader, Type objectType, object exist case "html_strip": return o.ToObject(ElasticContractResolver.Empty); case "mapping": return o.ToObject(ElasticContractResolver.Empty); case "pattern_replace": return o.ToObject(ElasticContractResolver.Empty); + case "kuromoji_iteration_mark": return o.ToObject(ElasticContractResolver.Empty); } return null; } diff --git a/src/Nest/Analysis/CharFilters/CharFilters.cs b/src/Nest/Analysis/CharFilters/CharFilters.cs index 53616fb66e2..1d64057e01b 100644 --- a/src/Nest/Analysis/CharFilters/CharFilters.cs +++ b/src/Nest/Analysis/CharFilters/CharFilters.cs @@ -24,9 +24,9 @@ public class CharFiltersDescriptor : IsADictionaryDescriptorBase Assign(name, analyzer); - + /// - /// The pattern_replace char filter allows the use of a regex to manipulate the characters in a string before analysis. + /// The pattern_replace char filter allows the use of a regex to manipulate the characters in a string before analysis. 
/// public CharFiltersDescriptor PatternReplace(string name, Func selector) => Assign(name, selector?.Invoke(new PatternReplaceCharFilterDescriptor())); @@ -43,5 +43,12 @@ public CharFiltersDescriptor HtmlStrip(string name, Func selector) => Assign(name, selector?.Invoke(new MappingCharFilterDescriptor())); + /// + /// The kuromoji_iteration_mark normalizes Japanese horizontal iteration marks (odoriji) to their expanded form. + /// Part of the `analysis-kuromoji` plugin: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-kuromoji.html + /// + public CharFiltersDescriptor KuromojiIterationMark(string name, Func selector = null) => + Assign(name, selector.InvokeOrDefault(new KuromojiIterationMarkCharFilterDescriptor())); + } } diff --git a/src/Nest/Analysis/Plugins/Kuromoji/KuromojiAnalyzer.cs b/src/Nest/Analysis/Plugins/Kuromoji/KuromojiAnalyzer.cs new file mode 100644 index 00000000000..d99140f0ce6 --- /dev/null +++ b/src/Nest/Analysis/Plugins/Kuromoji/KuromojiAnalyzer.cs @@ -0,0 +1,43 @@ +using System.Collections.Generic; +using Newtonsoft.Json; + +namespace Nest +{ + /// + /// An analyzer tailored for japanese that is bootstrapped with defaults. + /// Part of the `analysis-kuromoji` plugin: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-kuromoji.html + /// + public interface IKuromojiAnalyzer : IAnalyzer + { + [JsonProperty("mode")] + KuromojiTokenizationMode? Mode { get; set; } + + [JsonProperty("user_dictionary")] + string UserDictionary { get; set; } + } + + /// + public class KuromojiAnalyzer : AnalyzerBase, IKuromojiAnalyzer + { + public KuromojiAnalyzer() : base("kuromoji") {} + + public KuromojiTokenizationMode? Mode { get; set; } + + public string UserDictionary { get; set; } + } + + /// + public class KuromojiAnalyzerDescriptor : + AnalyzerDescriptorBase, IKuromojiAnalyzer + { + protected override string Type => "kuromoji"; + + KuromojiTokenizationMode? 
IKuromojiAnalyzer.Mode { get; set; } + string IKuromojiAnalyzer.UserDictionary { get; set; } + + public KuromojiAnalyzerDescriptor Mode(KuromojiTokenizationMode? mode) => Assign(a => a.Mode = mode); + + public KuromojiAnalyzerDescriptor UserDictionary(string userDictionary) => Assign(a => a.UserDictionary = userDictionary); + + } +} diff --git a/src/Nest/Analysis/Plugins/Kuromoji/KuromojiIterationMarkCharFilter.cs b/src/Nest/Analysis/Plugins/Kuromoji/KuromojiIterationMarkCharFilter.cs new file mode 100644 index 00000000000..9f30fc52c79 --- /dev/null +++ b/src/Nest/Analysis/Plugins/Kuromoji/KuromojiIterationMarkCharFilter.cs @@ -0,0 +1,47 @@ +using System.Collections.Generic; +using Newtonsoft.Json; + +namespace Nest +{ + /// + /// The kuromoji_iteration_mark normalizes Japanese horizontal iteration marks (odoriji) to their expanded form. + /// Part of the `analysis-kuromoji` plugin: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-kuromoji.html + /// + public interface IKuromojiIterationMarkCharFilter : ICharFilter + { + [JsonProperty("normalize_kanji")] + bool? NormalizeKanji { get; set; } + + [JsonProperty("normalize_kana")] + bool? NormalizeKana { get; set; } + } + /// + public class KuromojiIterationMarkCharFilter : CharFilterBase, IKuromojiIterationMarkCharFilter + { + public KuromojiIterationMarkCharFilter() : base("kuromoji_iteration_mark") { } + + /// + public bool? NormalizeKanji { get; set; } + + /// + public bool? NormalizeKana { get; set; } + } + + /// + public class KuromojiIterationMarkCharFilterDescriptor + : CharFilterDescriptorBase, IKuromojiIterationMarkCharFilter + { + protected override string Type => "kuromoji_iteration_mark"; + bool? IKuromojiIterationMarkCharFilter.NormalizeKanji { get; set; } + bool? IKuromojiIterationMarkCharFilter.NormalizeKana { get; set; } + + /// + public KuromojiIterationMarkCharFilterDescriptor NormalizeKanji(bool? 
normalize = true) => + Assign(a => a.NormalizeKanji = normalize); + + /// + public KuromojiIterationMarkCharFilterDescriptor NormalizeKana(bool? normalize = true) => + Assign(a => a.NormalizeKana = normalize); + + } +} diff --git a/src/Nest/Analysis/Plugins/Kuromoji/KuromojiPartOfSpeechTokenFilter.cs b/src/Nest/Analysis/Plugins/Kuromoji/KuromojiPartOfSpeechTokenFilter.cs new file mode 100644 index 00000000000..b244f976973 --- /dev/null +++ b/src/Nest/Analysis/Plugins/Kuromoji/KuromojiPartOfSpeechTokenFilter.cs @@ -0,0 +1,45 @@ +using System.Collections.Generic; +using Newtonsoft.Json; + +namespace Nest +{ + /// + /// The kuromoji_part_of_speech token filter removes tokens that match a set of part-of-speech tags. + /// Part of the `analysis-kuromoji` plugin: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-kuromoji.html + /// + public interface IKuromojiPartOfSpeechTokenFilter : ITokenFilter + { + /// + /// An array of part-of-speech tags that should be removed. It defaults to the stoptags.txt file embedded + /// in the lucene-analyzer-kuromoji.jar. 
+ /// + [JsonProperty("stoptags")] + IEnumerable StopTags { get; set; } + } + + /// + public class KuromojiPartOfSpeechTokenFilter : TokenFilterBase, IKuromojiPartOfSpeechTokenFilter + { + public KuromojiPartOfSpeechTokenFilter() : base("kuromoji_part_of_speech") { } + + /// + public IEnumerable StopTags { get; set; } + } + + /// + public class KuromojiPartOfSpeechTokenFilterDescriptor + : TokenFilterDescriptorBase, IKuromojiPartOfSpeechTokenFilter + { + protected override string Type => "kuromoji_part_of_speech"; + + IEnumerable IKuromojiPartOfSpeechTokenFilter.StopTags { get; set; } + + /// + public KuromojiPartOfSpeechTokenFilterDescriptor StopTags(IEnumerable stopTags) => Assign(a => a.StopTags = stopTags); + + /// + public KuromojiPartOfSpeechTokenFilterDescriptor StopTags(params string[] stopTags) => Assign(a => a.StopTags = stopTags); + + } + +} diff --git a/src/Nest/Analysis/Plugins/Kuromoji/KuromojiReadingFormTokenFilter.cs b/src/Nest/Analysis/Plugins/Kuromoji/KuromojiReadingFormTokenFilter.cs new file mode 100644 index 00000000000..4a3d21f53f6 --- /dev/null +++ b/src/Nest/Analysis/Plugins/Kuromoji/KuromojiReadingFormTokenFilter.cs @@ -0,0 +1,42 @@ +using System.Collections.Generic; +using Newtonsoft.Json; + +namespace Nest +{ + /// + /// The kuromoji_readingform token filter replaces the token with its reading form in either katakana or romaji. + /// Part of the `analysis-kuromoji` plugin: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-kuromoji.html + /// + public interface IKuromojiReadingFormTokenFilter : ITokenFilter + { + /// + /// Whether romaji reading form should be output instead of katakana. Defaults to false. + /// + [JsonProperty("use_romaji")] + bool? UseRomaji { get; set; } + } + + /// + public class KuromojiReadingFormTokenFilter : TokenFilterBase, IKuromojiReadingFormTokenFilter + { + public KuromojiReadingFormTokenFilter() : base("kuromoji_readingform") { } + + /// + public bool? 
UseRomaji { get; set; } + } + + /// + public class KuromojiReadingFormTokenFilterDescriptor + : TokenFilterDescriptorBase, IKuromojiReadingFormTokenFilter + { + protected override string Type => "kuromoji_readingform"; + + bool? IKuromojiReadingFormTokenFilter.UseRomaji { get; set; } + + /// + public KuromojiReadingFormTokenFilterDescriptor UseRomaji(bool? useRomaji = true) => Assign(a => a.UseRomaji = useRomaji); + + + } + +} diff --git a/src/Nest/Analysis/Plugins/Kuromoji/KuromojiStemmerTokenFilter.cs b/src/Nest/Analysis/Plugins/Kuromoji/KuromojiStemmerTokenFilter.cs new file mode 100644 index 00000000000..503923d2724 --- /dev/null +++ b/src/Nest/Analysis/Plugins/Kuromoji/KuromojiStemmerTokenFilter.cs @@ -0,0 +1,41 @@ +using System; +using System.Collections.Generic; +using Newtonsoft.Json; + +namespace Nest +{ + /// + /// The kuromoji_stemmer token filter normalizes common katakana spelling variations ending in a + /// long sound character by removing this character (U+30FC). Only full-width katakana characters are supported. + /// Part of the `analysis-kuromoji` plugin: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-kuromoji.html + /// + public interface IKuromojiStemmerTokenFilter : ITokenFilter + { + /// + /// Katakana words shorter than the minimum length are not stemmed (default is 4). + /// + [JsonProperty("minimum_length")] + int? MinimumLength { get; set; } + } + + /// + public class KuromojiStemmerTokenFilter : TokenFilterBase, IKuromojiStemmerTokenFilter + { + public KuromojiStemmerTokenFilter() : base("kuromoji_stemmer") { } + + /// + public int? MinimumLength { get; set; } + } + + /// + public class KuromojiStemmerTokenFilterDescriptor + : TokenFilterDescriptorBase, IKuromojiStemmerTokenFilter + { + protected override string Type => "kuromoji_stemmer"; + + int? IKuromojiStemmerTokenFilter.MinimumLength { get; set; } + + /// + public KuromojiStemmerTokenFilterDescriptor MinimumLength(int? 
minimumLength) => Assign(a => a.MinimumLength = minimumLength); + } +} diff --git a/src/Nest/Analysis/Plugins/Kuromoji/KuromojiTokenizationMode.cs b/src/Nest/Analysis/Plugins/Kuromoji/KuromojiTokenizationMode.cs new file mode 100644 index 00000000000..b66b54429f2 --- /dev/null +++ b/src/Nest/Analysis/Plugins/Kuromoji/KuromojiTokenizationMode.cs @@ -0,0 +1,32 @@ +using System.Runtime.Serialization; +using Newtonsoft.Json; +using Newtonsoft.Json.Converters; + +namespace Nest +{ + /// + /// The tokenization mode determines how the tokenizer handles compound and unknown words. + /// Part of the `analysis-kuromoji` plugin: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-kuromoji.html + /// + [JsonConverter(typeof(StringEnumConverter))] + public enum KuromojiTokenizationMode + { + /// + /// Normal segmentation, no decomposition for compounds + /// + [EnumMember(Value = "normal")] + Normal, + /// + /// Segmentation geared towards search. This includes a decompounding process for long nouns, + /// also including the full compound token as a synonym. + /// + [EnumMember(Value = "search")] + Search, + /// + /// Extended mode outputs unigrams for unknown words. + /// + [EnumMember(Value = "extended")] + Extended + + } +} diff --git a/src/Nest/Analysis/Plugins/Kuromoji/KuromojiTokenizer.cs b/src/Nest/Analysis/Plugins/Kuromoji/KuromojiTokenizer.cs new file mode 100644 index 00000000000..7c1eafe9487 --- /dev/null +++ b/src/Nest/Analysis/Plugins/Kuromoji/KuromojiTokenizer.cs @@ -0,0 +1,94 @@ +using Newtonsoft.Json; + +namespace Nest +{ + /// + /// The kuromoji_tokenizer splits Japanese text into tokens; the tokenization mode controls how compound and unknown words are handled. + /// Part of the `analysis-kuromoji` plugin: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-kuromoji.html + /// + public interface IKuromojiTokenizer : ITokenizer + { + /// + /// The tokenization mode determines how the tokenizer handles compound and unknown words. 
+ /// + [JsonProperty("mode")] + KuromojiTokenizationMode? Mode { get; set; } + + /// + /// Whether punctuation should be discarded from the output. Defaults to true. + /// + [JsonProperty("discard_punctuation")] + bool? DiscardPunctuation { get; set; } + + /// + /// The Kuromoji tokenizer uses the MeCab-IPADIC dictionary by default. A user_dictionary may be + /// appended to the default dictionary. + /// + [JsonProperty("user_dictionary")] + string UserDictionary { get; set; } + + /// + /// The nbest_examples can be used to find an nbest_cost value based on examples. For example, + /// a value of /箱根山-箱根/成田空港-成田/ indicates that in the texts, 箱根山 (Mt. Hakone) and 成田空港 (Narita Airport) + /// we’d like a cost that gives us 箱根 (Hakone) and 成田 (Narita). + /// + [JsonProperty("nbest_examples")] + string NBestExamples { get; set; } + + /// + /// The nbest_cost parameter specifies an additional Viterbi cost. The KuromojiTokenizer will include all tokens in + /// Viterbi paths that are within the nbest_cost value of the best path. + /// + [JsonProperty("nbest_cost")] + int? NBestCost { get; set; } + } + + /// + public class KuromojiTokenizer : TokenizerBase, IKuromojiTokenizer + { + public KuromojiTokenizer() { Type = "kuromoji_tokenizer"; } + + /// + public KuromojiTokenizationMode? Mode { get; set; } + + /// + public bool? DiscardPunctuation { get; set; } + + /// + public string UserDictionary { get; set; } + + /// + public string NBestExamples { get; set; } + + /// + public int? NBestCost { get; set; } + } + + /// + public class KuromojiTokenizerDescriptor + : TokenizerDescriptorBase, IKuromojiTokenizer + { + protected override string Type => "kuromoji_tokenizer"; + + KuromojiTokenizationMode? IKuromojiTokenizer.Mode { get; set; } + bool? IKuromojiTokenizer.DiscardPunctuation { get; set; } + string IKuromojiTokenizer.UserDictionary { get; set; } + string IKuromojiTokenizer.NBestExamples { get; set; } + int? 
IKuromojiTokenizer.NBestCost { get; set; } + + /// + public KuromojiTokenizerDescriptor Mode(KuromojiTokenizationMode? mode) => Assign(a => a.Mode = mode); + + /// + public KuromojiTokenizerDescriptor DiscardPunctuation(bool? discard = true) => Assign(a => a.DiscardPunctuation = discard); + + /// + public KuromojiTokenizerDescriptor UserDictionary(string userDictionary) => Assign(a => a.UserDictionary = userDictionary); + + /// + public KuromojiTokenizerDescriptor NBestExamples(string examples) => Assign(a => a.NBestExamples = examples); + + /// + public KuromojiTokenizerDescriptor NBestCost(int? cost) => Assign(a => a.NBestCost = cost); + } +} diff --git a/src/Nest/Analysis/TokenFilters/Phonetic/PhoneticEncoder.cs b/src/Nest/Analysis/Plugins/Phonetic/PhoneticEncoder.cs similarity index 100% rename from src/Nest/Analysis/TokenFilters/Phonetic/PhoneticEncoder.cs rename to src/Nest/Analysis/Plugins/Phonetic/PhoneticEncoder.cs diff --git a/src/Nest/Analysis/TokenFilters/Phonetic/PhoneticTokenFilter.cs b/src/Nest/Analysis/Plugins/Phonetic/PhoneticTokenFilter.cs similarity index 100% rename from src/Nest/Analysis/TokenFilters/Phonetic/PhoneticTokenFilter.cs rename to src/Nest/Analysis/Plugins/Phonetic/PhoneticTokenFilter.cs diff --git a/src/Nest/Analysis/TokenFilters/TokenFilterJsonConverter.cs b/src/Nest/Analysis/TokenFilters/TokenFilterJsonConverter.cs index 0896a502923..64778db3a78 100644 --- a/src/Nest/Analysis/TokenFilters/TokenFilterJsonConverter.cs +++ b/src/Nest/Analysis/TokenFilters/TokenFilterJsonConverter.cs @@ -54,6 +54,9 @@ public override object ReadJson(JsonReader reader, Type objectType, object exist case "uppercase": return o.ToObject(ElasticContractResolver.Empty); case "word_delimiter": return o.ToObject(ElasticContractResolver.Empty); case "fingerprint": return o.ToObject(ElasticContractResolver.Empty); + case "kuromoji_readingform": return o.ToObject(ElasticContractResolver.Empty); + case "kuromoji_part_of_speech": return 
o.ToObject(ElasticContractResolver.Empty); + case "kuromoji_stemmer": return o.ToObject(ElasticContractResolver.Empty); } return null; } diff --git a/src/Nest/Analysis/TokenFilters/TokenFilters.cs b/src/Nest/Analysis/TokenFilters/TokenFilters.cs index f462dbed28c..867ce37807c 100644 --- a/src/Nest/Analysis/TokenFilters/TokenFilters.cs +++ b/src/Nest/Analysis/TokenFilters/TokenFilters.cs @@ -239,5 +239,26 @@ public TokenFiltersDescriptor Uppercase(string name, Func selector = null) => Assign(name, selector.InvokeOrDefault(new FingerprintTokenFilterDescriptor())); + /// + /// The kuromoji_stemmer token filter normalizes common katakana spelling variations ending in a + /// long sound character by removing this character (U+30FC). Only full-width katakana characters are supported. + /// Part of the `analysis-kuromoji` plugin: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-kuromoji.html + /// + public TokenFiltersDescriptor KuromojiStemmer(string name, Func selector = null) => + Assign(name, selector.InvokeOrDefault(new KuromojiStemmerTokenFilterDescriptor())); + + /// + /// The kuromoji_readingform token filter replaces the token with its reading form in either katakana or romaji. + /// Part of the `analysis-kuromoji` plugin: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-kuromoji.html + /// + public TokenFiltersDescriptor KuromojiReadingForm(string name, Func selector = null) => + Assign(name, selector.InvokeOrDefault(new KuromojiReadingFormTokenFilterDescriptor())); + + /// + /// The kuromoji_part_of_speech token filter removes tokens that match a set of part-of-speech tags. 
+ /// Part of the `analysis-kuromoji` plugin: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-kuromoji.html + /// + public TokenFiltersDescriptor KuromojiPartOfSpeech(string name, Func selector = null) => + Assign(name, selector.InvokeOrDefault(new KuromojiPartOfSpeechTokenFilterDescriptor())); } } diff --git a/src/Nest/Analysis/Tokenizers/PatternTokenizer.cs b/src/Nest/Analysis/Tokenizers/PatternTokenizer.cs index cc011f77242..23df200c6c5 100644 --- a/src/Nest/Analysis/Tokenizers/PatternTokenizer.cs +++ b/src/Nest/Analysis/Tokenizers/PatternTokenizer.cs @@ -3,7 +3,7 @@ namespace Nest { /// - /// A tokenizer of type pattern that can flexibly separate text into terms via a regular expression. + /// A tokenizer of type pattern that can flexibly separate text into terms via a regular expression. /// public interface IPatternTokenizer : ITokenizer { @@ -18,7 +18,7 @@ public interface IPatternTokenizer : ITokenizer /// [JsonProperty("flags")] string Flags { get; set; } - + /// /// Which group to extract into tokens. Defaults to -1 (split). /// @@ -32,17 +32,17 @@ public class PatternTokenizer : TokenizerBase, IPatternTokenizer { public PatternTokenizer() { Type = "pattern"; } - /// + /// public string Pattern { get; set; } - /// + /// public string Flags { get; set; } - + /// public int? 
Group { get; set; } } /// - public class PatternTokenizerDescriptor + public class PatternTokenizerDescriptor : TokenizerDescriptorBase, IPatternTokenizer { protected override string Type => "pattern"; @@ -61,4 +61,4 @@ public class PatternTokenizerDescriptor public PatternTokenizerDescriptor Flags(string flags) => Assign(a => a.Flags = flags); } -} \ No newline at end of file +} diff --git a/src/Nest/Analysis/Tokenizers/TokenizerJsonConverter.cs b/src/Nest/Analysis/Tokenizers/TokenizerJsonConverter.cs index 0764e5af9f2..24ab45bf308 100644 --- a/src/Nest/Analysis/Tokenizers/TokenizerJsonConverter.cs +++ b/src/Nest/Analysis/Tokenizers/TokenizerJsonConverter.cs @@ -28,6 +28,7 @@ public override object ReadJson(JsonReader reader, Type objectType, object exist case "standard": return o.ToObject(ElasticContractResolver.Empty); case "uax_url_email": return o.ToObject(ElasticContractResolver.Empty); case "whitespace": return o.ToObject(ElasticContractResolver.Empty); + case "kuromoji_tokenizer": return o.ToObject(ElasticContractResolver.Empty); } return null; } diff --git a/src/Nest/Analysis/Tokenizers/Tokenizers.cs b/src/Nest/Analysis/Tokenizers/Tokenizers.cs index a1573efd890..3cd460e77a7 100644 --- a/src/Nest/Analysis/Tokenizers/Tokenizers.cs +++ b/src/Nest/Analysis/Tokenizers/Tokenizers.cs @@ -44,14 +44,14 @@ public TokenizersDescriptor Keyword(string name, Func - /// A tokenizer of type letter that divides text at non-letters. That’s to say, it defines tokens as maximal strings of adjacent letters. + /// A tokenizer of type letter that divides text at non-letters. That’s to say, it defines tokens as maximal strings of adjacent letters. /// Note, this does a decent job for most European languages, but does a terrible job for some Asian languages, where words are not separated by spaces. 
/// public TokenizersDescriptor Letter(string name, Func selector) => Assign(name, selector?.Invoke(new LetterTokenizerDescriptor())); /// - /// A tokenizer of type lowercase that performs the function of Letter Tokenizer and Lower Case Token Filter together. + /// A tokenizer of type lowercase that performs the function of Letter Tokenizer and Lower Case Token Filter together. /// It divides text at non-letters and converts them to lower case. /// While it is functionally equivalent to the combination of Letter Tokenizer and Lower Case Token Filter, /// there is a performance advantage to doing the two tasks at once, hence this (redundant) implementation. @@ -72,13 +72,13 @@ public TokenizersDescriptor PathHierarchy(string name, Func - /// A tokenizer of type pattern that can flexibly separate text into terms via a regular expression. + /// A tokenizer of type pattern that can flexibly separate text into terms via a regular expression. /// public TokenizersDescriptor Pattern(string name, Func selector) => Assign(name, selector?.Invoke(new PatternTokenizerDescriptor())); /// - /// A tokenizer of type standard providing grammar based tokenizer that is a good tokenizer for most European language documents. + /// A tokenizer of type standard providing grammar based tokenizer that is a good tokenizer for most European language documents. /// The tokenizer implements the Unicode Text Segmentation algorithm, as specified in Unicode Standard Annex #29. /// public TokenizersDescriptor Standard(string name, Func selector = null) => @@ -95,5 +95,12 @@ public TokenizersDescriptor UaxEmailUrl(string name, Func public TokenizersDescriptor Whitespace(string name, Func selector = null) => Assign(name, selector.InvokeOrDefault(new WhitespaceTokenizerDescriptor())); + + /// + /// The kuromoji_tokenizer splits Japanese text into tokens; the tokenization mode controls how compound and unknown words are handled. 
+ /// Part of the `analysis-kuromoji` plugin: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-kuromoji.html + /// + public TokenizersDescriptor Kuromoji(string name, Func selector) => + Assign(name, selector?.Invoke(new KuromojiTokenizerDescriptor())); } } diff --git a/src/Nest/Nest.csproj b/src/Nest/Nest.csproj index 406cbbff471..2b2113dba2b 100644 --- a/src/Nest/Nest.csproj +++ b/src/Nest/Nest.csproj @@ -191,6 +191,15 @@ + + + + + + + + + @@ -214,8 +223,6 @@ - - diff --git a/src/Tests/Analysis/Analyzers/AnalyzerUsageTests.cs b/src/Tests/Analysis/Analyzers/AnalyzerUsageTests.cs index 326e5d5d9e1..5c9be365a8b 100644 --- a/src/Tests/Analysis/Analyzers/AnalyzerUsageTests.cs +++ b/src/Tests/Analysis/Analyzers/AnalyzerUsageTests.cs @@ -23,8 +23,8 @@ public class AnalyzerUsageTests : PromiseUsageTestBase> Fluent => FluentExample; + public static Func> FluentExample => s => s .Analysis(analysis => analysis .Analyzers(analyzers => analyzers @@ -103,12 +109,16 @@ public class AnalyzerUsageTests : PromiseUsageTestBase a + .Mode(KuromojiTokenizationMode.Search) + ) ) ); /** */ protected override IndexSettings Initializer => InitializerExample; + public static IndexSettings InitializerExample => new IndexSettings { @@ -116,29 +126,37 @@ public class AnalyzerUsageTests : PromiseUsageTestBaseb" }, type = "mapping" + }, + kmark = new + { + normalize_kanji = true, + normalize_kana = true, + type = "kuromoji_iteration_mark" } } } @@ -30,7 +36,7 @@ public class CharFilterUsageTests : PromiseUsageTestBase> Fluent => FluentExample; public static Func> FluentExample => s => s @@ -39,6 +45,7 @@ public class CharFilterUsageTests : PromiseUsageTestBase c.Pattern("x").Replacement("y")) .Mapping("mapped", c => c.Mappings("a=>b")) + .KuromojiIterationMark("kmark", c => c.NormalizeKana().NormalizeKanji()) ) ); @@ -54,7 +61,12 @@ public class CharFilterUsageTests : PromiseUsageTestBaseb"} } } + { "mapped", new MappingCharFilter { Mappings = new [] { "a=>b"} } }, + { 
"kmark", new KuromojiIterationMarkCharFilter() + { + NormalizeKana = true, + NormalizeKanji = true + } } } } }; diff --git a/src/Tests/Analysis/TokenFilters/TokenFilterUsageTests.cs b/src/Tests/Analysis/TokenFilters/TokenFilterUsageTests.cs index b44fc0e3704..9a72b85b83d 100644 --- a/src/Tests/Analysis/TokenFilters/TokenFilterUsageTests.cs +++ b/src/Tests/Analysis/TokenFilters/TokenFilterUsageTests.cs @@ -89,6 +89,22 @@ public class TokenFilterUsageTests : PromiseUsageTestBase> Fluent => FluentExample; public static Func> FluentExample => s => s @@ -339,6 +355,15 @@ public class TokenFilterUsageTests : PromiseUsageTestBase t + .StopTags("# verb-main:", "動詞-自立") + ) + .KuromojiReadingForm("kfr", t => t + .UseRomaji() + ) + .KuromojiStemmer("ks", t => t + .MinimumLength(4) + ) ) ); @@ -433,7 +458,10 @@ public class TokenFilterUsageTests : PromiseUsageTestBase> Fluent => FluentExample; + public static Func> FluentExample => s => s .Analysis(analysis => analysis .Tokenizers(tokenizer => tokenizer @@ -94,12 +103,19 @@ public class TokenizerUsageTests : PromiseUsageTestBase t.MaxTokenLength(12)) .Whitespace("whitespace") + .Kuromoji("kuromoji", t => t + .Mode(KuromojiTokenizationMode.Extended) + .DiscardPunctuation() + .NBestExamples("/箱根山-箱根/成田空港-成田/") + .NBestCost(1000) + ) ) ); /** */ protected override IndexSettings Initializer => InitializerExample; + public static IndexSettings InitializerExample => new IndexSettings { @@ -107,35 +123,52 @@ public class TokenizerUsageTests : PromiseUsageTestBase), typeof(FluentDictionary<,>) } }; diff --git a/src/Tests/Framework/EndpointTests/Clusters/WritableCluster.cs b/src/Tests/Framework/EndpointTests/Clusters/WritableCluster.cs index ad2ba60b651..a5f72614943 100644 --- a/src/Tests/Framework/EndpointTests/Clusters/WritableCluster.cs +++ b/src/Tests/Framework/EndpointTests/Clusters/WritableCluster.cs @@ -5,12 +5,16 @@ namespace Tests.Framework.Integration /// /// Use this cluster for api's that do writes. 
If they are however intrusive or long running consider IntrusiveOperationCluster instead. /// - [RequiresPlugin(ElasticsearchPlugin.MapperAttachments, ElasticsearchPlugin.IngestGeoIp, ElasticsearchPlugin.IngestAttachment)] + [RequiresPlugin( + ElasticsearchPlugin.MapperAttachments, + ElasticsearchPlugin.IngestGeoIp, + ElasticsearchPlugin.AnalysisKuromoji, + ElasticsearchPlugin.IngestAttachment + )] public class WritableCluster : ClusterBase { public override int MaxConcurrency => 4; - public override void Bootstrap() { var seeder = new Seeder(this.Node); diff --git a/src/Tests/Framework/ManagedElasticsearch/ElasticsearchPlugin.cs b/src/Tests/Framework/ManagedElasticsearch/ElasticsearchPlugin.cs index 182fe9429e5..c10ec7d7e7f 100644 --- a/src/Tests/Framework/ManagedElasticsearch/ElasticsearchPlugin.cs +++ b/src/Tests/Framework/ManagedElasticsearch/ElasticsearchPlugin.cs @@ -42,7 +42,10 @@ public enum ElasticsearchPlugin IngestGeoIp, [Moniker("ingest-attachment")] - IngestAttachment + IngestAttachment, + + [Moniker("analysis-kuromoji")] + AnalysisKuromoji } public static class ElasticsearchPluginExtensions @@ -71,6 +74,7 @@ public class ElasticsearchPluginCollection : KeyedCollection version >= new ElasticsearchVersion("5.0.0-alpha3")), new ElasticsearchPluginConfiguration(IngestAttachment, version => version >= new ElasticsearchVersion("5.0.0-alpha3")), + new ElasticsearchPluginConfiguration(AnalysisKuromoji), }; protected override ElasticsearchPlugin GetKeyForItem(ElasticsearchPluginConfiguration item) diff --git a/src/Tests/Framework/ManagedElasticsearch/FileSystem/TestRunnerFileSystem.cs b/src/Tests/Framework/ManagedElasticsearch/FileSystem/TestRunnerFileSystem.cs index 47863ca31bc..2330b66bae6 100644 --- a/src/Tests/Framework/ManagedElasticsearch/FileSystem/TestRunnerFileSystem.cs +++ b/src/Tests/Framework/ManagedElasticsearch/FileSystem/TestRunnerFileSystem.cs @@ -141,11 +141,11 @@ private void InstallPlugins() if (Directory.Exists(pluginFolder)) continue; 
Console.WriteLine($"Installing elasticsearch plugin: {plugin.Moniker} ..."); - var timeout = TimeSpan.FromSeconds(120); + var timeout = TimeSpan.FromSeconds(420); var handle = new ManualResetEvent(false); Task.Run(() => { - using (var p = new ObservableProcess(this.PluginBinary, "install", installParameter)) + using (var p = new ObservableProcess(this.PluginBinary, "install --batch", installParameter)) { var o = p.Start(); Console.WriteLine($"Calling: {this.PluginBinary} install {installParameter}");