Skip to content

add support for the char_group tokenizer #3427

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Oct 17, 2018
48 changes: 48 additions & 0 deletions src/Nest/Analysis/Tokenizers/CharGroupTokenizer.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
using System.Collections.Generic;
using Newtonsoft.Json;

namespace Nest
{
/// <summary>
/// The <c>char_group</c> tokenizer: breaks text into terms whenever it encounters a character that is in a defined set.
/// It is mostly useful when a simple custom tokenization is desired and the overhead of using
/// <see cref="PatternTokenizer"/> is not acceptable.
/// </summary>
public interface ICharGroupTokenizer : ITokenizer
{
/// <summary>
/// The characters to tokenize the string on, serialized as <c>tokenize_on_chars</c>. Whenever a character from this
/// list is encountered, a new token is started. Each entry is either a single character, e.g. <c>-</c>, or the name
/// of a character group: <c>whitespace</c>, <c>letter</c>, <c>digit</c>, <c>punctuation</c> or <c>symbol</c>.
/// </summary>
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the documentation should be

A list containing a list of characters to tokenize the string on. Whenever a character from this list is encountered, a
new token is started. This accepts either single characters like eg. -, or character groups: whitespace, letter, digit,
punctuation, symbol.

[JsonProperty("tokenize_on_chars")]
IEnumerable<string> TokenizeOnCharacters { get; set; }
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

specialized type that takes a union of enum and char? string is no doubt easier to use.

}

/// <summary>
/// Object-initializer implementation of <see cref="ICharGroupTokenizer"/>. The constructor sets
/// <c>Type</c> to <see cref="TokenizerType"/>.
/// </summary>
public class CharGroupTokenizer : TokenizerBase, ICharGroupTokenizer
{
	/// <summary>The tokenizer type name assigned to <c>Type</c> by the constructor.</summary>
	internal const string TokenizerType = "char_group";

	/// <summary>Creates a tokenizer whose <c>Type</c> is <see cref="TokenizerType"/>.</summary>
	public CharGroupTokenizer()
	{
		this.Type = TokenizerType;
	}

	/// <inheritdoc cref="ICharGroupTokenizer.TokenizeOnCharacters"/>
	public IEnumerable<string> TokenizeOnCharacters { get; set; }
}

/// <summary>
/// Fluent descriptor for <see cref="ICharGroupTokenizer"/>; its <c>Type</c> is
/// <see cref="CharGroupTokenizer.TokenizerType"/>.
/// </summary>
public class CharGroupTokenizerDescriptor
	: TokenizerDescriptorBase<CharGroupTokenizerDescriptor, ICharGroupTokenizer>, ICharGroupTokenizer
{
	protected override string Type
	{
		get { return CharGroupTokenizer.TokenizerType; }
	}

	// Explicit interface implementation: the value is only written through the fluent methods below.
	IEnumerable<string> ICharGroupTokenizer.TokenizeOnCharacters { get; set; }

	/// <inheritdoc cref="ICharGroupTokenizer.TokenizeOnCharacters"/>
	public CharGroupTokenizerDescriptor TokenizeOnCharacters(IEnumerable<string> characters) =>
		Assign(tokenizer => tokenizer.TokenizeOnCharacters = characters);

	/// <inheritdoc cref="ICharGroupTokenizer.TokenizeOnCharacters"/>
	public CharGroupTokenizerDescriptor TokenizeOnCharacters(params string[] characters) =>
		// The cast selects the IEnumerable<string> overload, which stores the same array reference.
		TokenizeOnCharacters((IEnumerable<string>)characters);
}
}
4 changes: 4 additions & 0 deletions src/Nest/Analysis/Tokenizers/Tokenizers.cs
Original file line number Diff line number Diff line change
Expand Up @@ -112,5 +112,9 @@ public TokenizersDescriptor Kuromoji(string name, Func<KuromojiTokenizerDescript
/// </summary>
public TokenizersDescriptor Icu(string name, Func<IcuTokenizerDescriptor, IIcuTokenizer> selector)
{
	// A null selector produces a null tokenizer, which is passed to Assign as-is.
	var tokenizer = selector?.Invoke(new IcuTokenizerDescriptor());
	return Assign(name, tokenizer);
}

/// <inheritdoc cref="ICharGroupTokenizer"/>
/// <param name="name">The name passed to <c>Assign</c> together with the tokenizer produced by <paramref name="selector"/>.</param>
/// <param name="selector">
/// Configures a new <see cref="CharGroupTokenizerDescriptor"/>. When null, a null tokenizer is passed to <c>Assign</c>.
/// </param>
public TokenizersDescriptor CharGroup(string name, Func<CharGroupTokenizerDescriptor, ICharGroupTokenizer> selector) =>
	Assign(name, selector?.Invoke(new CharGroupTokenizerDescriptor()));
}
}
3 changes: 3 additions & 0 deletions src/Nest/Indices/Analyze/AnalyzeTokenizersDescriptor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -91,5 +91,8 @@ public ITokenizer Kuromoji(Func<KuromojiTokenizerDescriptor, IKuromojiTokenizer>
/// </summary>
public ITokenizer Icu(Func<IcuTokenizerDescriptor, IIcuTokenizer> selector)
{
	// A null selector yields null rather than throwing.
	return selector?.Invoke(new IcuTokenizerDescriptor());
}

/// <inheritdoc cref="ICharGroupTokenizer"/>
/// <param name="selector">
/// Configures a new <see cref="CharGroupTokenizerDescriptor"/>. When null, the method returns null.
/// </param>
public ITokenizer CharGroup(Func<CharGroupTokenizerDescriptor, ICharGroupTokenizer> selector) =>
	selector?.Invoke(new CharGroupTokenizerDescriptor());
}
}
20 changes: 20 additions & 0 deletions src/Tests/Tests/Analysis/Tokenizers/TokenizerTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -203,5 +203,25 @@ public class StandardTests : TokenizerAssertionBase<StandardTests>

public override object Json => new {type = "standard"};
}

/// <summary>
/// Serialization assertions for the <c>char_group</c> tokenizer: the object-initializer and fluent forms are both
/// built from the same character list and are expected to match <see cref="Json"/>.
/// </summary>
public class CharGroupTests : TokenizerAssertionBase<CharGroupTests>
{
	// Mixes a character group name ("whitespace") with single characters, including a newline.
	private readonly string[] _chars = {"whitespace", "-", "\n"};

	// Named after the tokenizer under test; the previous value "uax" did not identify the char_group tokenizer.
	public override string Name => "char_group";

	public override ITokenizer Initializer => new CharGroupTokenizer
	{
		TokenizeOnCharacters = _chars
	};

	public override FuncTokenizer Fluent => (n, t) => t.CharGroup(n, e => e
		.TokenizeOnCharacters(_chars)
	);

	public override object Json => new
	{
		tokenize_on_chars = _chars,
		type = "char_group"
	};
}
}
}