Skip to content

Commit

Permalink
Add support for nori plugin (#3430)
Browse files Browse the repository at this point in the history
* add analysis-nori plugin to the writable cluster and add nori_tokenizer
* add nori_part_of_speech token filter
* add nori analyzer
* add Nori() to AnalyzeTokenizersSelector (caught by the CodeStandard tests)
  • Loading branch information
Mpdreamz authored and russcam committed Oct 17, 2018
1 parent 2130ddf commit c7e51cb
Show file tree
Hide file tree
Showing 13 changed files with 248 additions and 3 deletions.
1 change: 1 addition & 0 deletions src/Nest/Analysis/Analyzers/AnalyzerJsonConverter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ public override object ReadJson(JsonReader reader, Type objectType, object exist
case "simple": return o.ToObject<SimpleAnalyzer>(ElasticContractResolver.Empty);
case "fingerprint": return o.ToObject<FingerprintAnalyzer>(ElasticContractResolver.Empty);
case "kuromoji": return o.ToObject<KuromojiAnalyzer>(ElasticContractResolver.Empty);
case "nori": return o.ToObject<NoriAnalyzer>(ElasticContractResolver.Empty);
default:
if (o.Property("tokenizer") != null)
return o.ToObject<CustomAnalyzer>(ElasticContractResolver.Empty);
Expand Down
4 changes: 4 additions & 0 deletions src/Nest/Analysis/Analyzers/Analyzers.cs
Original file line number Diff line number Diff line change
Expand Up @@ -97,5 +97,9 @@ public class AnalyzersDescriptor : IsADictionaryDescriptorBase<AnalyzersDescript
/// </summary>
public AnalyzersDescriptor Kuromoji(string name, Func<KuromojiAnalyzerDescriptor, IKuromojiAnalyzer> selector = null) =>
Assign(name, selector.InvokeOrDefault(new KuromojiAnalyzerDescriptor()));

/// <summary>
/// Registers a <c>nori</c> analyzer under <paramref name="name"/>.
/// </summary>
/// <param name="name">The key the analyzer is registered under.</param>
/// <param name="selector">
/// Optional configuration callback, optional in the same way as the selector of the Kuromoji overload.
/// When it is omitted, or when it returns <c>null</c>, an unconfigured
/// <see cref="NoriAnalyzerDescriptor"/> is registered instead of a <c>null</c> entry.
/// </param>
public AnalyzersDescriptor Nori(string name, Func<NoriAnalyzerDescriptor, INoriAnalyzer> selector = null)
{
    var descriptor = new NoriAnalyzerDescriptor();

    // Never hand a null analyzer to Assign: fall back to the untouched descriptor.
    INoriAnalyzer analyzer = selector?.Invoke(descriptor) ?? descriptor;
    return Assign(name, analyzer);
}
}
}
65 changes: 65 additions & 0 deletions src/Nest/Analysis/Analyzers/NoriAnalyzer.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
using System.Collections.Generic;
using Newtonsoft.Json;

namespace Nest
{
/// <summary>
/// The nori analyzer consists of the following tokenizer and token filters:
/// <para> - nori_tokenizer</para>
/// <para> - nori_part_of_speech token filter</para>
/// <para> - nori_readingform token filter</para>
/// <para> - lowercase token filter</para>
/// </summary>
public interface INoriAnalyzer : IAnalyzer
{
/// <summary>
/// Determines how compound tokens are handled; see <see cref="NoriDecompoundMode"/> for the available modes.
/// Serialized as <c>decompound_mode</c>.
/// </summary>
/// <seealso cref="INoriTokenizer.DecompoundMode"/>
[JsonProperty("decompound_mode")]
NoriDecompoundMode? DecompoundMode { get; set; }

/// <summary>
/// File on disk holding a user dictionary with custom nouns (NNG) that may be appended to the default
/// mecab-ko-dic dictionary. Serialized as <c>user_dictionary</c>.
/// </summary>
/// <seealso cref="INoriTokenizer.UserDictionary"/>
[JsonProperty("user_dictionary")]
string UserDictionary { get; set; }

/// <summary> The part-of-speech tags that should be removed. Serialized as <c>stoptags</c>. </summary>
/// <seealso cref="INoriPartOfSpeechTokenFilter.StopTags"/>
[JsonProperty("stoptags")]
IEnumerable<string> StopTags { get; set; }
}

/// <summary>
/// Object-initializer form of <see cref="INoriAnalyzer"/>. The constructor passes the analyzer
/// type name <c>nori</c> to <see cref="AnalyzerBase"/>.
/// </summary>
public class NoriAnalyzer : AnalyzerBase, INoriAnalyzer
{
public NoriAnalyzer() : base("nori") {}

/// <summary> Determines how compound tokens are handled; see <see cref="NoriDecompoundMode"/>. </summary>
public NoriDecompoundMode? DecompoundMode { get; set; }

/// <summary>
/// File on disk holding a user dictionary with custom nouns (NNG) that may be appended to the
/// default mecab-ko-dic dictionary.
/// </summary>
public string UserDictionary { get; set; }

/// <summary> The part-of-speech tags that should be removed. </summary>
public IEnumerable<string> StopTags { get; set; }
}

/// <summary>
/// Fluent builder for the <c>nori</c> analyzer. Every fluent method stores its argument on the
/// explicitly implemented <see cref="INoriAnalyzer"/> property of the same name.
/// </summary>
public class NoriAnalyzerDescriptor : AnalyzerDescriptorBase<NoriAnalyzerDescriptor, INoriAnalyzer>, INoriAnalyzer
{
    /// <summary> The analyzer type name. </summary>
    protected override string Type
    {
        get { return "nori"; }
    }

    NoriDecompoundMode? INoriAnalyzer.DecompoundMode { get; set; }

    string INoriAnalyzer.UserDictionary { get; set; }

    IEnumerable<string> INoriAnalyzer.StopTags { get; set; }

    /// <summary> Sets how compound tokens are handled; see <see cref="NoriDecompoundMode"/>. </summary>
    public NoriAnalyzerDescriptor DecompoundMode(NoriDecompoundMode? mode)
    {
        return Assign(analyzer => analyzer.DecompoundMode = mode);
    }

    /// <summary> Sets the user dictionary file that may be appended to the default dictionary. </summary>
    public NoriAnalyzerDescriptor UserDictionary(string path)
    {
        return Assign(analyzer => analyzer.UserDictionary = path);
    }

    /// <summary> Sets the part-of-speech tags that should be removed. </summary>
    public NoriAnalyzerDescriptor StopTags(IEnumerable<string> stopTags)
    {
        return Assign(analyzer => analyzer.StopTags = stopTags);
    }

    /// <summary> Sets the part-of-speech tags that should be removed. </summary>
    public NoriAnalyzerDescriptor StopTags(params string[] stopTags)
    {
        return Assign(analyzer => analyzer.StopTags = stopTags);
    }
}
}
38 changes: 38 additions & 0 deletions src/Nest/Analysis/TokenFilters/NoriPartOfSpeechTokenFilter.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
using System.Collections.Generic;
using Newtonsoft.Json;

namespace Nest
{
/// <summary>
/// The nori_part_of_speech token filter removes tokens that match a set of part-of-speech tags.
/// </summary>
public interface INoriPartOfSpeechTokenFilter : ITokenFilter
{
/// <summary> The part-of-speech tags that should be removed. Serialized as <c>stoptags</c>. </summary>
[JsonProperty("stoptags")]
IEnumerable<string> StopTags { get; set; }
}

/// <summary>
/// Object-initializer form of <see cref="INoriPartOfSpeechTokenFilter"/>. The constructor passes the
/// filter type name <c>nori_part_of_speech</c> to <see cref="TokenFilterBase"/>.
/// </summary>
public class NoriPartOfSpeechTokenFilter : TokenFilterBase, INoriPartOfSpeechTokenFilter
{
public NoriPartOfSpeechTokenFilter() : base("nori_part_of_speech") { }

/// <summary> The part-of-speech tags that should be removed. </summary>
public IEnumerable<string> StopTags { get; set; }
}
/// <summary>
/// Fluent builder for the <c>nori_part_of_speech</c> token filter, which removes tokens that match
/// a set of part-of-speech tags.
/// </summary>
public class NoriPartOfSpeechTokenFilterDescriptor
    : TokenFilterDescriptorBase<NoriPartOfSpeechTokenFilterDescriptor, INoriPartOfSpeechTokenFilter>, INoriPartOfSpeechTokenFilter
{
    /// <summary> The token filter type name. </summary>
    protected override string Type
    {
        get { return "nori_part_of_speech"; }
    }

    IEnumerable<string> INoriPartOfSpeechTokenFilter.StopTags { get; set; }

    /// <summary> Sets the part-of-speech tags that should be removed. </summary>
    public NoriPartOfSpeechTokenFilterDescriptor StopTags(IEnumerable<string> stopTags)
    {
        return Assign(filter => filter.StopTags = stopTags);
    }

    /// <summary> Sets the part-of-speech tags that should be removed. </summary>
    public NoriPartOfSpeechTokenFilterDescriptor StopTags(params string[] stopTags)
    {
        return Assign(filter => filter.StopTags = stopTags);
    }
}

}
4 changes: 4 additions & 0 deletions src/Nest/Analysis/TokenFilters/TokenFilters.cs
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,10 @@ public TokenFiltersDescriptor() : base(new TokenFilters())
public TokenFiltersDescriptor IcuTransform(string name, Func<IcuTransformTokenFilterDescriptor, IIcuTransformTokenFilter> selector) =>
Assign(name, selector.Invoke(new IcuTransformTokenFilterDescriptor()));

/// <summary>
/// Registers a <c>nori_part_of_speech</c> token filter, built by <paramref name="selector"/>,
/// under <paramref name="name"/>.
/// </summary>
/// <inheritdoc cref="INoriPartOfSpeechTokenFilter"/>
public TokenFiltersDescriptor NoriPartOfSpeech(string name, Func<NoriPartOfSpeechTokenFilterDescriptor, INoriPartOfSpeechTokenFilter> selector)
{
    INoriPartOfSpeechTokenFilter filter = selector(new NoriPartOfSpeechTokenFilterDescriptor());
    return Assign(name, filter);
}

/// <summary>
/// A token filter of type multiplexer will emit multiple tokens at the same position, each version of the token
/// having been run through a different filter. Identical output tokens at the same position will be removed.
Expand Down
63 changes: 63 additions & 0 deletions src/Nest/Analysis/Tokenizers/NoriTokenizer.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
using System.Runtime.Serialization;
using Newtonsoft.Json;
using Newtonsoft.Json.Converters;

namespace Nest
{
/// <summary>
/// The decompound mode determines how the tokenizer handles compound tokens.
/// Members are (de)serialized through <see cref="StringEnumConverter"/> using their
/// <see cref="EnumMemberAttribute"/> values.
/// </summary>
[JsonConverter(typeof(StringEnumConverter))]
public enum NoriDecompoundMode
{
/// <summary> Decomposes compounds and discards the original form (default). Serialized as <c>discard</c>. </summary>
[EnumMember(Value="discard")]
Discard,
/// <summary> No decomposition for compounds. Serialized as <c>none</c>. </summary>
[EnumMember(Value="none")]
None,
/// <summary> Decomposes compounds and keeps the original form. Serialized as <c>mixed</c>. </summary>
[EnumMember(Value="mixed")]
Mixed
}

/// <summary> Tokenizer that ships with the analysis-nori plugin. </summary>
public interface INoriTokenizer : ITokenizer
{
/// <summary>
/// Determines how the tokenizer handles compound tokens; see <see cref="NoriDecompoundMode"/> for the
/// available modes. <see cref="NoriDecompoundMode.Discard"/> is documented as the default.
/// </summary>
[JsonProperty("decompound_mode")]
NoriDecompoundMode? DecompoundMode { get; set; }

/// <summary>
/// The Nori tokenizer uses the mecab-ko-dic dictionary by default. A user_dictionary with custom nouns (NNG) may be appended to
/// the default dictionary. This property allows you to specify this file on disk.
/// </summary>
[JsonProperty("user_dictionary")]
string UserDictionary { get; set; }
}

/// <summary>
/// Object-initializer form of the tokenizer that ships with the analysis-nori plugin.
/// </summary>
public class NoriTokenizer : TokenizerBase, INoriTokenizer
{
    /// <summary> Creates the tokenizer with its type set to <c>nori_tokenizer</c>. </summary>
    public NoriTokenizer()
    {
        Type = "nori_tokenizer";
    }

    /// <summary> Determines how the tokenizer handles compound tokens; see <see cref="NoriDecompoundMode"/>. </summary>
    public NoriDecompoundMode? DecompoundMode { get; set; }

    /// <summary>
    /// File on disk holding a user dictionary with custom nouns (NNG) that may be appended to the
    /// default mecab-ko-dic dictionary.
    /// </summary>
    public string UserDictionary { get; set; }
}
/// <summary>
/// Fluent builder for the <c>nori_tokenizer</c> that ships with the analysis-nori plugin.
/// </summary>
public class NoriTokenizerDescriptor
    : TokenizerDescriptorBase<NoriTokenizerDescriptor, INoriTokenizer>, INoriTokenizer
{
    /// <summary> The tokenizer type name. </summary>
    protected override string Type
    {
        get { return "nori_tokenizer"; }
    }

    NoriDecompoundMode? INoriTokenizer.DecompoundMode { get; set; }

    string INoriTokenizer.UserDictionary { get; set; }

    /// <summary> Sets how the tokenizer handles compound tokens; see <see cref="NoriDecompoundMode"/>. </summary>
    public NoriTokenizerDescriptor DecompoundMode(NoriDecompoundMode? mode)
    {
        return Assign(tokenizer => tokenizer.DecompoundMode = mode);
    }

    /// <summary> Sets the user dictionary file that may be appended to the default dictionary. </summary>
    public NoriTokenizerDescriptor UserDictionary(string path)
    {
        return Assign(tokenizer => tokenizer.UserDictionary = path);
    }
}
}
4 changes: 4 additions & 0 deletions src/Nest/Analysis/Tokenizers/Tokenizers.cs
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,10 @@ public class TokenizersDescriptor : IsADictionaryDescriptorBase<TokenizersDescri
public TokenizersDescriptor Icu(string name, Func<IcuTokenizerDescriptor, IIcuTokenizer> selector) =>
Assign(name, selector?.Invoke(new IcuTokenizerDescriptor()));

/// <summary>
/// Registers a <c>nori_tokenizer</c> under <paramref name="name"/>. When
/// <paramref name="selector"/> is <c>null</c> no descriptor is created and <c>null</c> is assigned.
/// </summary>
/// <inheritdoc cref="INoriTokenizer"/>
public TokenizersDescriptor Nori(string name, Func<NoriTokenizerDescriptor, INoriTokenizer> selector)
{
    INoriTokenizer tokenizer = selector == null
        ? null
        : selector(new NoriTokenizerDescriptor());
    return Assign(name, tokenizer);
}

/// <inheritdoc cref="ICharGroupTokenizer.TokenizeOnCharacters"/>
/// <param name="name">The key the tokenizer is registered under.</param>
/// <param name="selector">Builds the tokenizer; when <c>null</c>, a <c>null</c> value is passed to Assign.</param>
public TokenizersDescriptor CharGroup(string name, Func<CharGroupTokenizerDescriptor, ICharGroupTokenizer> selector) =>
Assign(name, selector?.Invoke(new CharGroupTokenizerDescriptor()));
Expand Down
4 changes: 4 additions & 0 deletions src/Nest/Indices/Analyze/AnalyzeTokenFilters.cs
Original file line number Diff line number Diff line change
Expand Up @@ -324,6 +324,10 @@ public class AnalyzeTokenFiltersDescriptor : DescriptorPromiseBase<AnalyzeTokenF
public AnalyzeTokenFiltersDescriptor IcuTransform(Func<IcuTransformTokenFilterDescriptor, IIcuTransformTokenFilter> selector) =>
AssignIfNotNull(selector.Invoke(new IcuTransformTokenFilterDescriptor()));

/// <summary>
/// Builds a <c>nori_part_of_speech</c> token filter with <paramref name="selector"/> and passes
/// the result to AssignIfNotNull.
/// </summary>
/// <inheritdoc cref="INoriPartOfSpeechTokenFilter"/>
public AnalyzeTokenFiltersDescriptor NoriPartOfSpeech(Func<NoriPartOfSpeechTokenFilterDescriptor, INoriPartOfSpeechTokenFilter> selector)
{
    INoriPartOfSpeechTokenFilter filter = selector(new NoriPartOfSpeechTokenFilterDescriptor());
    return AssignIfNotNull(filter);
}

/// <summary>
/// Builds a multiplexer token filter with <paramref name="selector"/> and passes the result
/// to AssignIfNotNull.
/// </summary>
/// <inheritdoc cref="IMultiplexerTokenFilter"/>
public AnalyzeTokenFiltersDescriptor Multiplexer(Func<MultiplexerTokenFilterDescriptor, IMultiplexerTokenFilter> selector)
{
    IMultiplexerTokenFilter filter = selector(new MultiplexerTokenFilterDescriptor());
    return AssignIfNotNull(filter);
}
Expand Down
4 changes: 4 additions & 0 deletions src/Nest/Indices/Analyze/AnalyzeTokenizersDescriptor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,10 @@ public class AnalyzeTokenizersSelector : SelectorBase<ITokenizer>
public ITokenizer Icu(Func<IcuTokenizerDescriptor, IIcuTokenizer> selector) =>
(selector?.Invoke(new IcuTokenizerDescriptor()));

/// <summary>
/// Builds a <c>nori_tokenizer</c> for the analyze API.
/// </summary>
/// <param name="selector">
/// Configures the tokenizer. A <c>null</c> selector yields <c>null</c> instead of throwing,
/// in line with the Icu and CharGroup selectors of this class.
/// </param>
/// <inheritdoc cref="INoriTokenizer"/>
public ITokenizer Nori(Func<NoriTokenizerDescriptor, INoriTokenizer> selector) =>
    selector?.Invoke(new NoriTokenizerDescriptor());

/// <inheritdoc cref="ICharGroupTokenizer.TokenizeOnCharacters"/>
/// <param name="selector">Builds the tokenizer; a <c>null</c> selector yields <c>null</c>.</param>
public ITokenizer CharGroup(Func<CharGroupTokenizerDescriptor, ICharGroupTokenizer> selector) => selector?.Invoke(new CharGroupTokenizerDescriptor());
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using Tests.Core.ManagedElasticsearch.NodeSeeders;
using Elastic.Managed.Ephemeral.Plugins;
using Tests.Core.ManagedElasticsearch.NodeSeeders;
using static Elastic.Managed.Ephemeral.Plugins.ElasticsearchPlugin;

namespace Tests.Core.ManagedElasticsearch.Clusters
Expand All @@ -12,7 +13,9 @@ public class WritableCluster : ClientTestClusterBase
AnalysisKuromoji,
AnalysisIcu,
AnalysisPhonetic,
MapperMurmur3
MapperMurmur3,
//TODO move this to elasticsearch-net abstractions
new ElasticsearchPlugin("analysis-nori", v => v >= "6.4.0")
)
{
MaxConcurrency = 4
Expand Down
25 changes: 25 additions & 0 deletions src/Tests/Tests/Analysis/Analyzers/AnalyzerTests.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
using System;
using Elastic.Xunit.XunitPlumbing;
using Nest;
using Tests.Analysis.TokenFilters;

namespace Tests.Analysis.Analyzers
{
Expand Down Expand Up @@ -197,5 +199,28 @@ public class KuromojuTests : AnalyzerAssertionBase<KuromojuTests>
};
}

/// <summary>
/// Declares the initializer, fluent and JSON representations of a <c>nori</c> analyzer
/// configured with stop tags and the mixed decompound mode.
/// </summary>
[SkipVersion("<6.4.0", "analysis-nori plugin introduced in 6.4.0")]
public class NoriTests : AnalyzerAssertionBase<NoriTests>
{
    // Part-of-speech tags shared by all three representations below.
    private readonly string[] _stopTags = new[] { "NR", "SP" };

    public override string Name
    {
        get { return "nori"; }
    }

    public override IAnalyzer Initializer
    {
        get
        {
            var analyzer = new NoriAnalyzer();
            analyzer.StopTags = _stopTags;
            analyzer.DecompoundMode = NoriDecompoundMode.Mixed;
            return analyzer;
        }
    }

    public override FuncTokenizer Fluent
    {
        get
        {
            return (name, analyzers) => analyzers.Nori(
                name,
                nori => nori.StopTags(_stopTags).DecompoundMode(NoriDecompoundMode.Mixed));
        }
    }

    public override object Json
    {
        get
        {
            return new
            {
                type = "nori",
                decompound_mode = "mixed",
                stoptags = _stopTags
            };
        }
    }
}
}
}
13 changes: 13 additions & 0 deletions src/Tests/Tests/Analysis/TokenFilters/TokenFilterTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -919,6 +919,19 @@ public class PhoneticTests : TokenFilterAssertionBase<PhoneticTests>

}

/// <summary>
/// Declares the initializer, fluent and JSON representations of a <c>nori_part_of_speech</c>
/// token filter configured with stop tags.
/// </summary>
[SkipVersion("<6.4.0", "analysis-nori plugin introduced in 6.4.0")]
public class NoriPartOfSpeechTests : TokenFilterAssertionBase<NoriPartOfSpeechTests>
{
    // Part-of-speech tags shared by all three representations below.
    private readonly string[] _stopTags = new[] { "NR", "SP" };

    public override string Name
    {
        get { return "nori_pos"; }
    }

    public override ITokenFilter Initializer
    {
        get
        {
            var filter = new NoriPartOfSpeechTokenFilter();
            filter.StopTags = _stopTags;
            return filter;
        }
    }

    public override FuncTokenFilters Fluent
    {
        get
        {
            return (name, filters) => filters.NoriPartOfSpeech(name, pos => pos.StopTags(_stopTags));
        }
    }

    public override object Json
    {
        get { return new { type = "nori_part_of_speech", stoptags = _stopTags }; }
    }
}

[SkipVersion("<6.4.0", "Introduced in 6.4.0")]
public class MultiplexerTests : TokenFilterAssertionBase<PhoneticTests>
{
Expand Down
19 changes: 18 additions & 1 deletion src/Tests/Tests/Analysis/Tokenizers/TokenizerTests.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using System;
using System;
using Elastic.Xunit.XunitPlumbing;
using Nest;

namespace Tests.Analysis.Tokenizers
Expand Down Expand Up @@ -204,6 +205,22 @@ public class StandardTests : TokenizerAssertionBase<StandardTests>
public override object Json => new {type = "standard"};
}

/// <summary>
/// Declares the initializer, fluent and JSON representations of a <c>nori_tokenizer</c>
/// configured with the mixed decompound mode.
/// </summary>
[SkipVersion("<6.4.0", "analysis-nori plugin introduced in 6.4.0")]
public class NoriTests : TokenizerAssertionBase<NoriTests>
{
    public override string Name
    {
        get { return "nori"; }
    }

    public override ITokenizer Initializer
    {
        get
        {
            var tokenizer = new NoriTokenizer();
            tokenizer.DecompoundMode = NoriDecompoundMode.Mixed;
            return tokenizer;
        }
    }

    public override FuncTokenizer Fluent
    {
        get
        {
            return (name, tokenizers) => tokenizers.Nori(
                name,
                nori => nori.DecompoundMode(NoriDecompoundMode.Mixed));
        }
    }

    public override object Json
    {
        get { return new { type = "nori_tokenizer", decompound_mode = "mixed" }; }
    }
}

public class CharGroupTests : TokenizerAssertionBase<CharGroupTests>
{
private readonly string[] _chars = {"whitespace", "-", "\n"};
Expand Down

0 comments on commit c7e51cb

Please sign in to comment.