Skip to content

Add support for the kuromoji analysis plugin #2557

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jan 29, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions src/Nest/Analysis/Analysis.cs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ public class Analysis : IAnalysis
public ICharFilters CharFilters { get; set; }
public ITokenFilters TokenFilters { get; set; }
public ITokenizers Tokenizers { get; set; }

}

public class AnalysisDescriptor : DescriptorBase<AnalysisDescriptor, IAnalysis>, IAnalysis
Expand All @@ -44,4 +43,4 @@ public AnalysisDescriptor Tokenizers(Func<TokenizersDescriptor, IPromise<ITokeni
Assign(a => a.Tokenizers = selector?.Invoke(new TokenizersDescriptor())?.Value);

}
}
}
1 change: 1 addition & 0 deletions src/Nest/Analysis/Analyzers/AnalyzerJsonConverter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ public override object ReadJson(JsonReader reader, Type objectType, object exist
case "whitespace": return o.ToObject<WhitespaceAnalyzer>(ElasticContractResolver.Empty);
case "simple": return o.ToObject<SimpleAnalyzer>(ElasticContractResolver.Empty);
case "fingerprint": return o.ToObject<FingerprintAnalyzer>(ElasticContractResolver.Empty);
case "kuromoji": return o.ToObject<KuromojiAnalyzer>(ElasticContractResolver.Empty);
default:
if (o.Property("tokenizer") != null)
return o.ToObject<CustomAnalyzer>(ElasticContractResolver.Empty);
Expand Down
6 changes: 6 additions & 0 deletions src/Nest/Analysis/Analyzers/Analyzers.cs
Original file line number Diff line number Diff line change
Expand Up @@ -91,5 +91,11 @@ public AnalyzersDescriptor Whitespace(string name, Func<WhitespaceAnalyzerDescri
public AnalyzersDescriptor Fingerprint(string name, Func<FingerprintAnalyzerDescriptor, IFingerprintAnalyzer> selector = null) =>
Assign(name, selector.InvokeOrDefault(new FingerprintAnalyzerDescriptor()));

/// <summary>
/// An analyzer tailored for Japanese that is bootstrapped with defaults.
/// Part of the `analysis-kuromoji` plugin: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-kuromoji.html
/// </summary>
/// <param name="name">The name passed to <c>Assign</c> for this analyzer.</param>
/// <param name="selector">
/// Optional configuration callback. It is handed to <c>InvokeOrDefault</c> together with a new
/// <see cref="KuromojiAnalyzerDescriptor"/>, so it may be omitted.
/// </param>
public AnalyzersDescriptor Kuromoji(string name, Func<KuromojiAnalyzerDescriptor, IKuromojiAnalyzer> selector = null) =>
Assign(name, selector.InvokeOrDefault(new KuromojiAnalyzerDescriptor()));
}
}
1 change: 1 addition & 0 deletions src/Nest/Analysis/CharFilters/CharFilterJsonConverter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ public override object ReadJson(JsonReader reader, Type objectType, object exist
case "html_strip": return o.ToObject<HtmlStripCharFilter>(ElasticContractResolver.Empty);
case "mapping": return o.ToObject<MappingCharFilter>(ElasticContractResolver.Empty);
case "pattern_replace": return o.ToObject<PatternReplaceCharFilter>(ElasticContractResolver.Empty);
case "kuromoji_iteration_mark": return o.ToObject<KuromojiIterationMarkCharFilter>(ElasticContractResolver.Empty);
}
return null;
}
Expand Down
11 changes: 9 additions & 2 deletions src/Nest/Analysis/CharFilters/CharFilters.cs
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,9 @@ public class CharFiltersDescriptor : IsADictionaryDescriptorBase<CharFiltersDesc
public CharFiltersDescriptor() : base(new CharFilters()) { }

public CharFiltersDescriptor UserDefined(string name, ICharFilter analyzer) => Assign(name, analyzer);

/// <summary>
/// The pattern_replace char filter allows the use of a regex to manipulate the characters in a string before analysis.
/// </summary>
/// <param name="name">The name passed to <c>Assign</c> for this char filter.</param>
/// <param name="selector">
/// Configuration callback invoked with a new <see cref="PatternReplaceCharFilterDescriptor"/>.
/// When it is null, null is passed to <c>Assign</c>.
/// </param>
public CharFiltersDescriptor PatternReplace(string name, Func<PatternReplaceCharFilterDescriptor, IPatternReplaceCharFilter> selector) =>
Assign(name, selector?.Invoke(new PatternReplaceCharFilterDescriptor()));
Expand All @@ -43,5 +43,12 @@ public CharFiltersDescriptor HtmlStrip(string name, Func<HtmlStripCharFilterDesc
public CharFiltersDescriptor Mapping(string name, Func<MappingCharFilterDescriptor, IMappingCharFilter> selector) =>
Assign(name, selector?.Invoke(new MappingCharFilterDescriptor()));

/// <summary>
/// The kuromoji_iteration_mark normalizes Japanese horizontal iteration marks (odoriji) to their expanded form.
/// Part of the `analysis-kuromoji` plugin: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-kuromoji.html
/// </summary>
/// <param name="name">The name passed to <c>Assign</c> for this char filter.</param>
/// <param name="selector">
/// Optional configuration callback. It is handed to <c>InvokeOrDefault</c> together with a new
/// <see cref="KuromojiIterationMarkCharFilterDescriptor"/>, so it may be omitted.
/// </param>
public CharFiltersDescriptor KuromojiIterationMark(string name, Func<KuromojiIterationMarkCharFilterDescriptor, IKuromojiIterationMarkCharFilter> selector = null) =>
    // InvokeOrDefault is called directly rather than through `?.`: the null-conditional operator
    // would short-circuit to null for the default `selector = null`, and a null char filter would be
    // assigned instead of the default descriptor. This mirrors the optional-selector analyzer methods.
    Assign(name, selector.InvokeOrDefault(new KuromojiIterationMarkCharFilterDescriptor()));

}
}
43 changes: 43 additions & 0 deletions src/Nest/Analysis/Plugins/Kuromoji/KuromojiAnalyzer.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
using System.Collections.Generic;
using Newtonsoft.Json;

namespace Nest
{
    /// <summary>
    /// An analyzer tailored for Japanese that is bootstrapped with defaults.
    /// Part of the `analysis-kuromoji` plugin: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-kuromoji.html
    /// </summary>
    public interface IKuromojiAnalyzer : IAnalyzer
    {
        /// <summary>
        /// The tokenization mode, serialized as <c>mode</c>. See <see cref="KuromojiTokenizationMode"/>
        /// for the available values.
        /// </summary>
        [JsonProperty("mode")]
        KuromojiTokenizationMode? Mode { get; set; }

        /// <summary>
        /// The user dictionary setting, serialized as <c>user_dictionary</c>.
        /// NOTE(review): presumably the location of a user dictionary file; confirm against the plugin documentation.
        /// </summary>
        [JsonProperty("user_dictionary")]
        string UserDictionary { get; set; }
    }

    /// <inheritdoc cref="IKuromojiAnalyzer"/>
    public class KuromojiAnalyzer : AnalyzerBase, IKuromojiAnalyzer
    {
        public KuromojiAnalyzer() : base("kuromoji") {}

        /// <inheritdoc/>
        public KuromojiTokenizationMode? Mode { get; set; }

        /// <inheritdoc/>
        public string UserDictionary { get; set; }
    }

    /// <inheritdoc cref="IKuromojiAnalyzer"/>
    public class KuromojiAnalyzerDescriptor
        : AnalyzerDescriptorBase<KuromojiAnalyzerDescriptor, IKuromojiAnalyzer>, IKuromojiAnalyzer
    {
        protected override string Type => "kuromoji";

        KuromojiTokenizationMode? IKuromojiAnalyzer.Mode { get; set; }
        string IKuromojiAnalyzer.UserDictionary { get; set; }

        /// <inheritdoc cref="IKuromojiAnalyzer.Mode"/>
        public KuromojiAnalyzerDescriptor Mode(KuromojiTokenizationMode? mode) =>
            Assign(analyzer => analyzer.Mode = mode);

        /// <inheritdoc cref="IKuromojiAnalyzer.UserDictionary"/>
        public KuromojiAnalyzerDescriptor UserDictionary(string userDictionary) =>
            Assign(analyzer => analyzer.UserDictionary = userDictionary);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
using System.Collections.Generic;
using Newtonsoft.Json;

namespace Nest
{
    /// <summary>
    /// The kuromoji_iteration_mark char filter normalizes Japanese horizontal iteration marks (odoriji)
    /// to their expanded form.
    /// Part of the `analysis-kuromoji` plugin: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-kuromoji.html
    /// </summary>
    public interface IKuromojiIterationMarkCharFilter : ICharFilter
    {
        /// <summary>
        /// Whether kanji iteration marks should be normalized. Serialized as <c>normalize_kanji</c>.
        /// </summary>
        [JsonProperty("normalize_kanji")]
        bool? NormalizeKanji { get; set; }

        /// <summary>
        /// Whether kana iteration marks should be normalized. Serialized as <c>normalize_kana</c>.
        /// </summary>
        [JsonProperty("normalize_kana")]
        bool? NormalizeKana { get; set; }
    }

    /// <inheritdoc cref="IKuromojiIterationMarkCharFilter"/>
    public class KuromojiIterationMarkCharFilter : CharFilterBase, IKuromojiIterationMarkCharFilter
    {
        public KuromojiIterationMarkCharFilter() : base("kuromoji_iteration_mark") { }

        /// <inheritdoc/>
        public bool? NormalizeKanji { get; set; }

        /// <inheritdoc/>
        public bool? NormalizeKana { get; set; }
    }

    /// <inheritdoc cref="IKuromojiIterationMarkCharFilter"/>
    public class KuromojiIterationMarkCharFilterDescriptor
        : CharFilterDescriptorBase<KuromojiIterationMarkCharFilterDescriptor, IKuromojiIterationMarkCharFilter>, IKuromojiIterationMarkCharFilter
    {
        protected override string Type => "kuromoji_iteration_mark";
        bool? IKuromojiIterationMarkCharFilter.NormalizeKanji { get; set; }
        bool? IKuromojiIterationMarkCharFilter.NormalizeKana { get; set; }

        /// <inheritdoc cref="IKuromojiIterationMarkCharFilter.NormalizeKanji"/>
        public KuromojiIterationMarkCharFilterDescriptor NormalizeKanji(bool? normalize = true) =>
            Assign(filter => filter.NormalizeKanji = normalize);

        /// <inheritdoc cref="IKuromojiIterationMarkCharFilter.NormalizeKana"/>
        public KuromojiIterationMarkCharFilterDescriptor NormalizeKana(bool? normalize = true) =>
            Assign(filter => filter.NormalizeKana = normalize);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
using System.Collections.Generic;
using Newtonsoft.Json;

namespace Nest
{
    /// <summary>
    /// The kuromoji_part_of_speech token filter removes tokens that match a set of part-of-speech tags.
    /// Part of the `analysis-kuromoji` plugin: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-kuromoji.html
    /// </summary>
    public interface IKuromojiPartOfSpeechTokenFilter : ITokenFilter
    {
        /// <summary>
        /// The part-of-speech tags whose tokens should be removed; serialized as <c>stoptags</c>.
        /// Defaults to the stoptags.txt file embedded in the lucene-analyzer-kuromoji.jar.
        /// </summary>
        [JsonProperty("stoptags")]
        IEnumerable<string> StopTags { get; set; }
    }

    /// <inheritdoc cref="IKuromojiPartOfSpeechTokenFilter"/>
    public class KuromojiPartOfSpeechTokenFilter : TokenFilterBase, IKuromojiPartOfSpeechTokenFilter
    {
        public KuromojiPartOfSpeechTokenFilter() : base("kuromoji_part_of_speech") { }

        /// <inheritdoc/>
        public IEnumerable<string> StopTags { get; set; }
    }

    /// <inheritdoc cref="IKuromojiPartOfSpeechTokenFilter"/>
    public class KuromojiPartOfSpeechTokenFilterDescriptor
        : TokenFilterDescriptorBase<KuromojiPartOfSpeechTokenFilterDescriptor, IKuromojiPartOfSpeechTokenFilter>, IKuromojiPartOfSpeechTokenFilter
    {
        protected override string Type => "kuromoji_part_of_speech";

        IEnumerable<string> IKuromojiPartOfSpeechTokenFilter.StopTags { get; set; }

        /// <inheritdoc cref="IKuromojiPartOfSpeechTokenFilter.StopTags"/>
        public KuromojiPartOfSpeechTokenFilterDescriptor StopTags(IEnumerable<string> stopTags) =>
            Assign(filter => filter.StopTags = stopTags);

        /// <inheritdoc cref="IKuromojiPartOfSpeechTokenFilter.StopTags"/>
        public KuromojiPartOfSpeechTokenFilterDescriptor StopTags(params string[] stopTags) =>
            Assign(filter => filter.StopTags = stopTags);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
using System.Collections.Generic;
using Newtonsoft.Json;

namespace Nest
{
    /// <summary>
    /// The kuromoji_readingform token filter replaces the token with its reading form in either katakana or romaji.
    /// Part of the `analysis-kuromoji` plugin: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-kuromoji.html
    /// </summary>
    public interface IKuromojiReadingFormTokenFilter : ITokenFilter
    {
        /// <summary>
        /// Whether the romaji reading form should be output instead of katakana; serialized as
        /// <c>use_romaji</c>. Defaults to false.
        /// </summary>
        [JsonProperty("use_romaji")]
        bool? UseRomaji { get; set; }
    }

    /// <inheritdoc cref="IKuromojiReadingFormTokenFilter"/>
    public class KuromojiReadingFormTokenFilter : TokenFilterBase, IKuromojiReadingFormTokenFilter
    {
        public KuromojiReadingFormTokenFilter() : base("kuromoji_readingform") { }

        /// <inheritdoc/>
        public bool? UseRomaji { get; set; }
    }

    /// <inheritdoc cref="IKuromojiReadingFormTokenFilter"/>
    public class KuromojiReadingFormTokenFilterDescriptor
        : TokenFilterDescriptorBase<KuromojiReadingFormTokenFilterDescriptor, IKuromojiReadingFormTokenFilter>, IKuromojiReadingFormTokenFilter
    {
        protected override string Type => "kuromoji_readingform";

        bool? IKuromojiReadingFormTokenFilter.UseRomaji { get; set; }

        /// <inheritdoc cref="IKuromojiReadingFormTokenFilter.UseRomaji"/>
        public KuromojiReadingFormTokenFilterDescriptor UseRomaji(bool? useRomaji = true) =>
            Assign(filter => filter.UseRomaji = useRomaji);
    }
}
41 changes: 41 additions & 0 deletions src/Nest/Analysis/Plugins/Kuromoji/KuromojiStemmerTokenFilter.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
using System;
using System.Collections.Generic;
using Newtonsoft.Json;

namespace Nest
{
    /// <summary>
    /// The kuromoji_stemmer token filter normalizes common katakana spelling variations ending in a
    /// long sound character by removing this character (U+30FC). Only full-width katakana characters are supported.
    /// Part of the `analysis-kuromoji` plugin: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-kuromoji.html
    /// </summary>
    public interface IKuromojiStemmerTokenFilter : ITokenFilter
    {
        /// <summary>
        /// The minimum length, serialized as <c>minimum_length</c>. Katakana words shorter than this
        /// are not stemmed (default is 4).
        /// </summary>
        [JsonProperty("minimum_length")]
        int? MinimumLength { get; set; }
    }

    /// <inheritdoc cref="IKuromojiStemmerTokenFilter"/>
    public class KuromojiStemmerTokenFilter : TokenFilterBase, IKuromojiStemmerTokenFilter
    {
        public KuromojiStemmerTokenFilter() : base("kuromoji_stemmer") { }

        /// <inheritdoc/>
        public int? MinimumLength { get; set; }
    }

    /// <inheritdoc cref="IKuromojiStemmerTokenFilter"/>
    public class KuromojiStemmerTokenFilterDescriptor
        : TokenFilterDescriptorBase<KuromojiStemmerTokenFilterDescriptor, IKuromojiStemmerTokenFilter>, IKuromojiStemmerTokenFilter
    {
        protected override string Type => "kuromoji_stemmer";

        int? IKuromojiStemmerTokenFilter.MinimumLength { get; set; }

        /// <inheritdoc cref="IKuromojiStemmerTokenFilter.MinimumLength"/>
        public KuromojiStemmerTokenFilterDescriptor MinimumLength(int? minimumLength) =>
            Assign(filter => filter.MinimumLength = minimumLength);
    }
}
32 changes: 32 additions & 0 deletions src/Nest/Analysis/Plugins/Kuromoji/KuromojiTokenizationMode.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
using System.Runtime.Serialization;
using Newtonsoft.Json;
using Newtonsoft.Json.Converters;

namespace Nest
{
    /// <summary>
    /// Determines how the tokenizer handles compound and unknown words.
    /// Part of the `analysis-kuromoji` plugin: https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-kuromoji.html
    /// </summary>
    [JsonConverter(typeof(StringEnumConverter))]
    public enum KuromojiTokenizationMode
    {
        /// <summary>
        /// Normal segmentation without any decomposition of compounds. Serialized as <c>normal</c>.
        /// </summary>
        [EnumMember(Value = "normal")]
        Normal,

        /// <summary>
        /// Segmentation geared towards search: long nouns go through a decompounding process and the
        /// full compound token is also included as a synonym. Serialized as <c>search</c>.
        /// </summary>
        [EnumMember(Value = "search")]
        Search,

        /// <summary>
        /// Extended mode, which outputs unigrams for unknown words. Serialized as <c>extended</c>.
        /// </summary>
        [EnumMember(Value = "extended")]
        Extended
    }
}
Loading