Skip to content

expose word delimiter graph token filter as per elastic/elasticsearch… #2747

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 4, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/Nest/Analysis/TokenFilters/TokenFilterJsonConverter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ public override object ReadJson(JsonReader reader, Type objectType, object exist
case "unique": return o.ToObject<UniqueTokenFilter>(ElasticContractResolver.Empty);
case "uppercase": return o.ToObject<UppercaseTokenFilter>(ElasticContractResolver.Empty);
case "word_delimiter": return o.ToObject<WordDelimiterTokenFilter>(ElasticContractResolver.Empty);
case "word_delimiter_graph": return o.ToObject<WordDelimiterGraphTokenFilter>(ElasticContractResolver.Empty);
case "fingerprint": return o.ToObject<FingerprintTokenFilter>(ElasticContractResolver.Empty);
case "kuromoji_readingform": return o.ToObject<KuromojiReadingFormTokenFilter>(ElasticContractResolver.Empty);
case "kuromoji_part_of_speech": return o.ToObject<KuromojiPartOfSpeechTokenFilter>(ElasticContractResolver.Empty);
Expand Down
7 changes: 7 additions & 0 deletions src/Nest/Analysis/TokenFilters/TokenFilters.cs
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,13 @@ public TokenFiltersDescriptor SynonymGraph(string name, Func<SynonymGraphTokenFi
public TokenFiltersDescriptor WordDelimiter(string name, Func<WordDelimiterTokenFilterDescriptor, IWordDelimiterTokenFilter> selector) =>
Assign(name, selector?.Invoke(new WordDelimiterTokenFilterDescriptor()));

/// <summary>
/// A token filter of type asciifolding that converts alphabetic, numeric, and symbolic Unicode characters which are
/// <para> not in the first 127 ASCII characters (the “Basic Latin” Unicode block) into their ASCII equivalents, if one exists.</para>
/// </summary>
public TokenFiltersDescriptor WordDelimiterGraph(string name, Func<WordDelimiterGraphTokenFilterDescriptor, IWordDelimiterGraphTokenFilter> selector) =>
Assign(name, selector?.Invoke(new WordDelimiterGraphTokenFilterDescriptor()));

/// <summary>
/// A token filter of type asciifolding that converts alphabetic, numeric, and symbolic Unicode characters which are
/// <para> not in the first 127 ASCII characters (the “Basic Latin” Unicode block) into their ASCII equivalents, if one exists.</para>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
using System.Collections.Generic;
using Newtonsoft.Json;

namespace Nest
{
/// <summary>
/// Named word_delimiter, it Splits words into subwords and performs optional transformations on subword groups.
/// Unlike the word_delimiter this token filter named word_delimiter_graph correctly handles multi terms expansion at query time.
/// </summary>
public interface IWordDelimiterGraphTokenFilter : ITokenFilter
{
/// <summary>
/// If true causes parts of words to be generated: "PowerShot" ⇒ "Power" "Shot". Defaults to true.
/// </summary>
[JsonProperty("generate_word_parts")]
bool? GenerateWordParts { get; set; }

/// <summary>
/// If true causes number subwords to be generated: "500-42" ⇒ "500" "42". Defaults to true.
/// </summary>
[JsonProperty("generate_number_parts")]
bool? GenerateNumberParts { get; set; }

/// <summary>
/// If true causes maximum runs of word parts to be catenated: "wi-fi" ⇒ "wifi". Defaults to false.
/// </summary>
[JsonProperty("catenate_words")]
bool? CatenateWords { get; set; }

/// <summary>
/// If true causes maximum runs of number parts to be catenated: "500-42" ⇒ "50042". Defaults to false.
/// </summary>
[JsonProperty("catenate_numbers")]
bool? CatenateNumbers { get; set; }

/// <summary>
/// If true causes all subword parts to be catenated: "wi-fi-4000" ⇒ "wifi4000". Defaults to false.
/// </summary>
[JsonProperty("catenate_all")]
bool? CatenateAll { get; set; }

/// <summary>
/// If true causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regards). Defaults to true.
/// </summary>
[JsonProperty("split_on_case_change")]
bool? SplitOnCaseChange { get; set; }

/// <summary>
/// If true includes original words in subwords: "500-42" ⇒ "500-42" "500" "42". Defaults to false.
/// </summary>
[JsonProperty("preserve_original")]
bool? PreserveOriginal { get; set; }

/// <summary>
/// If true causes "j2se" to be three tokens; "j" "2" "se". Defaults to true.
/// </summary>
[JsonProperty("split_on_numerics")]
bool? SplitOnNumerics { get; set; }

/// <summary>
/// If true causes trailing "'s" to be removed for each subword: "O’Neil’s" ⇒ "O", "Neil". Defaults to true.
/// </summary>
[JsonProperty("stem_english_possessive")]
bool? StemEnglishPossessive { get; set; }

/// <summary>
/// A list of protected words from being delimiter.
/// </summary>
[JsonProperty("protected_words")]
IEnumerable<string> ProtectedWords { get; set; }

/// <summary>
///protected_words_path which resolved to a file configured with protected words (one on each line).
/// Automatically resolves to config/ based location if exists.
/// </summary>
[JsonProperty("protected_words_path ")]
string ProtectedWordsPath { get; set; }

/// <summary>
/// A custom type mapping table
/// </summary>
[JsonProperty("type_table")]
IEnumerable<string> TypeTable { get; set; }

/// <summary>
/// A path to a custom type mapping table file
/// </summary>
[JsonProperty("type_table_path")]
string TypeTablePath { get; set; }

}

/// <inheritdoc/>
public class WordDelimiterGraphTokenFilter : TokenFilterBase, IWordDelimiterGraphTokenFilter
{
public WordDelimiterGraphTokenFilter() : base("word_delimiter_graph") { }

/// <inheritdoc/>
public bool? GenerateWordParts { get; set; }

/// <inheritdoc/>
public bool? GenerateNumberParts { get; set; }

/// <inheritdoc/>
public bool? CatenateWords { get; set; }

/// <inheritdoc/>
public bool? CatenateNumbers { get; set; }

/// <inheritdoc/>
public bool? CatenateAll { get; set; }

/// <inheritdoc/>
public bool? SplitOnCaseChange { get; set; }

/// <inheritdoc/>
public bool? PreserveOriginal { get; set; }

/// <inheritdoc/>
public bool? SplitOnNumerics { get; set; }

/// <inheritdoc/>
public bool? StemEnglishPossessive { get; set; }

/// <inheritdoc/>
public IEnumerable<string> ProtectedWords { get; set; }

/// <inheritdoc/>
public string ProtectedWordsPath { get; set; }

/// <inheritdoc/>
public IEnumerable<string> TypeTable { get; set; }

/// <inheritdoc/>
public string TypeTablePath { get; set; }
}

///<inheritdoc/>
public class WordDelimiterGraphTokenFilterDescriptor
: TokenFilterDescriptorBase<WordDelimiterGraphTokenFilterDescriptor, IWordDelimiterGraphTokenFilter>, IWordDelimiterGraphTokenFilter
{
protected override string Type => "word_delimiter_graph";

IEnumerable<string> IWordDelimiterGraphTokenFilter.ProtectedWords { get; set; }
string IWordDelimiterGraphTokenFilter.ProtectedWordsPath { get; set; }
IEnumerable<string> IWordDelimiterGraphTokenFilter.TypeTable { get; set; }
string IWordDelimiterGraphTokenFilter.TypeTablePath { get; set; }
bool? IWordDelimiterGraphTokenFilter.GenerateWordParts { get; set; }
bool? IWordDelimiterGraphTokenFilter.GenerateNumberParts { get; set; }
bool? IWordDelimiterGraphTokenFilter.CatenateWords { get; set; }
bool? IWordDelimiterGraphTokenFilter.CatenateNumbers { get; set; }
bool? IWordDelimiterGraphTokenFilter.CatenateAll { get; set; }
bool? IWordDelimiterGraphTokenFilter.SplitOnCaseChange { get; set; }
bool? IWordDelimiterGraphTokenFilter.PreserveOriginal { get; set; }
bool? IWordDelimiterGraphTokenFilter.SplitOnNumerics { get; set; }
bool? IWordDelimiterGraphTokenFilter.StemEnglishPossessive { get; set; }

///<inheritdoc/>
public WordDelimiterGraphTokenFilterDescriptor GenerateWordParts(bool? generateWordParts = true) => Assign(a => a.GenerateWordParts = generateWordParts);

///<inheritdoc/>
public WordDelimiterGraphTokenFilterDescriptor GenerateNumberParts(bool? generateNumberParts = true) => Assign(a => a.GenerateNumberParts = generateNumberParts);

///<inheritdoc/>
public WordDelimiterGraphTokenFilterDescriptor CatenateWords(bool? catenateWords = true) => Assign(a => a.CatenateWords = catenateWords);

///<inheritdoc/>
public WordDelimiterGraphTokenFilterDescriptor CatenateNumbers(bool? catenateNumbers = true) => Assign(a => a.CatenateNumbers = catenateNumbers);

///<inheritdoc/>
public WordDelimiterGraphTokenFilterDescriptor CatenateAll(bool? catenateAll = true) => Assign(a => a.CatenateAll = catenateAll);

///<inheritdoc/>
public WordDelimiterGraphTokenFilterDescriptor SplitOnCaseChange(bool? split = true) => Assign(a => a.SplitOnCaseChange = split);

///<inheritdoc/>
public WordDelimiterGraphTokenFilterDescriptor SplitOnNumerics(bool? split = true) => Assign(a => a.SplitOnNumerics = split);

///<inheritdoc/>
public WordDelimiterGraphTokenFilterDescriptor PreserveOriginal(bool? preserve = true) => Assign(a => a.PreserveOriginal = preserve);

///<inheritdoc/>
public WordDelimiterGraphTokenFilterDescriptor StemEnglishPossessive(bool? stem = true) => Assign(a => a.StemEnglishPossessive = stem);

///<inheritdoc/>
public WordDelimiterGraphTokenFilterDescriptor ProtectedWords(IEnumerable<string> protectedWords) => Assign(a => a.ProtectedWords = protectedWords);

///<inheritdoc/>
public WordDelimiterGraphTokenFilterDescriptor ProtectedWords(params string[] protectedWords) => Assign(a => a.ProtectedWords = protectedWords);

///<inheritdoc/>
public WordDelimiterGraphTokenFilterDescriptor ProtectedWordsPath(string path) => Assign(a => a.ProtectedWordsPath = path);

///<inheritdoc/>
public WordDelimiterGraphTokenFilterDescriptor TypeTable(IEnumerable<string> typeTable) => Assign(a => a.TypeTable = typeTable);

///<inheritdoc/>
public WordDelimiterGraphTokenFilterDescriptor TypeTable(params string[] typeTable) => Assign(a => a.TypeTable = typeTable);

///<inheritdoc/>
public WordDelimiterGraphTokenFilterDescriptor TypeTablePath(string path) => Assign(a => a.TypeTablePath = path);

}

}
2 changes: 1 addition & 1 deletion src/Nest/Nest.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,4 @@
<PackageReference Include="System.Linq.Queryable" Version="4.0.1" />
</ItemGroup>
<!--<Import Project="..\..\.paket\Paket.Restore.targets" />-->
</Project>
</Project>
41 changes: 41 additions & 0 deletions src/Tests/Analysis/TokenFilters/TokenFilterUsageTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,20 @@ public class TokenFilterUsageTests : PromiseUsageTestBase<IIndexSettings, IndexS
split_on_numerics = true,
stem_english_possessive = true,
protected_words = new[] {"x", "y", "z"}
},
wdg = new
{
type = "word_delimiter_graph",
generate_word_parts = true,
generate_number_parts = true,
catenate_words = true,
catenate_numbers = true,
catenate_all = true,
split_on_case_change = true,
preserve_original = true,
split_on_numerics = true,
stem_english_possessive = true,
protected_words = new[] {"x", "y", "z"}
}
}
}
Expand Down Expand Up @@ -403,6 +417,18 @@ public class TokenFilterUsageTests : PromiseUsageTestBase<IIndexSettings, IndexS
.SplitOnNumerics()
.StemEnglishPossessive()
)
.WordDelimiterGraph("wdg", t => t
.CatenateAll()
.CatenateNumbers()
.CatenateWords()
.GenerateNumberParts()
.GenerateWordParts()
.PreserveOriginal()
.ProtectedWords("x", "y", "z")
.SplitOnCaseChange()
.SplitOnNumerics()
.StemEnglishPossessive()
)
.KuromojiPartOfSpeech("kpos", t => t
.StopTags("# verb-main:", "動詞-自立")
)
Expand Down Expand Up @@ -549,6 +575,21 @@ public class TokenFilterUsageTests : PromiseUsageTestBase<IIndexSettings, IndexS
StemEnglishPossessive = true
}
},
{
"wdg", new WordDelimiterGraphTokenFilter
{
CatenateAll = true,
CatenateNumbers = true,
CatenateWords = true,
GenerateNumberParts = true,
GenerateWordParts = true,
PreserveOriginal = true,
ProtectedWords = new[] {"x", "y", "z"},
SplitOnCaseChange = true,
SplitOnNumerics = true,
StemEnglishPossessive = true
}
},
{"kpos", new KuromojiPartOfSpeechTokenFilter {StopTags = new[] {"# verb-main:", "動詞-自立"}}},
{"kfr", new KuromojiReadingFormTokenFilter {UseRomaji = true}},
{"ks", new KuromojiStemmerTokenFilter {MinimumLength = 4}},
Expand Down