Skip to content

Commit

Permalink
Add support for the char_group tokenizer (#3427)
Browse files Browse the repository at this point in the history
(cherry picked from commit 9ab4384)
  • Loading branch information
Mpdreamz authored and russcam committed Oct 26, 2018
1 parent 89f9e5d commit d88b057
Show file tree
Hide file tree
Showing 4 changed files with 75 additions and 0 deletions.
48 changes: 48 additions & 0 deletions src/Nest/Analysis/Tokenizers/CharGroupTokenizer.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
using System.Collections.Generic;
using Newtonsoft.Json;

namespace Nest
{
/// <summary>
/// A tokenizer that breaks text into terms whenever it encounters a character that is in a defined set. It is mostly useful
/// for cases where simple custom tokenization is desired and the overhead of using <see cref="PatternTokenizer"/> is not acceptable.
/// </summary>
public interface ICharGroupTokenizer : ITokenizer
{
/// <summary>
/// The characters to tokenize the string on. Whenever a character from this list is encountered, a
/// new token is started. Each entry is either a single character, such as <c>-</c>, or the name of a character group:
/// <c>whitespace</c>, <c>letter</c>, <c>digit</c>, <c>punctuation</c> or <c>symbol</c>.
/// Serialized as <c>tokenize_on_chars</c>.
/// </summary>
[JsonProperty("tokenize_on_chars")]
IEnumerable<string> TokenizeOnCharacters { get; set; }
}

/// <inheritdoc cref="ICharGroupTokenizer"/>
public class CharGroupTokenizer : TokenizerBase, ICharGroupTokenizer
{
	/// <summary>
	/// The tokenizer type name, <c>char_group</c>, assigned to <c>Type</c> by the constructor.
	/// </summary>
	internal const string TokenizerType = "char_group";

	/// <summary>
	/// Creates a new tokenizer with its type preset to <see cref="TokenizerType"/>.
	/// </summary>
	public CharGroupTokenizer()
	{
		this.Type = TokenizerType;
	}

	/// <inheritdoc cref="ICharGroupTokenizer.TokenizeOnCharacters"/>
	public IEnumerable<string> TokenizeOnCharacters { get; set; }
}

/// <inheritdoc cref="ICharGroupTokenizer"/>
public class CharGroupTokenizerDescriptor
	: TokenizerDescriptorBase<CharGroupTokenizerDescriptor, ICharGroupTokenizer>, ICharGroupTokenizer
{
	/// <summary>
	/// The tokenizer type name, taken from <see cref="CharGroupTokenizer.TokenizerType"/>.
	/// </summary>
	protected override string Type
	{
		get { return CharGroupTokenizer.TokenizerType; }
	}

	// Explicit implementation: the value is only set through the fluent methods below.
	IEnumerable<string> ICharGroupTokenizer.TokenizeOnCharacters { get; set; }

	/// <inheritdoc cref="ICharGroupTokenizer.TokenizeOnCharacters"/>
	public CharGroupTokenizerDescriptor TokenizeOnCharacters(params string[] characters)
	{
		return Assign(tokenizer => tokenizer.TokenizeOnCharacters = characters);
	}

	/// <inheritdoc cref="ICharGroupTokenizer.TokenizeOnCharacters"/>
	public CharGroupTokenizerDescriptor TokenizeOnCharacters(IEnumerable<string> characters)
	{
		return Assign(tokenizer => tokenizer.TokenizeOnCharacters = characters);
	}
}
}
4 changes: 4 additions & 0 deletions src/Nest/Analysis/Tokenizers/Tokenizers.cs
Original file line number Diff line number Diff line change
Expand Up @@ -112,5 +112,9 @@ public TokenizersDescriptor Kuromoji(string name, Func<KuromojiTokenizerDescript
/// </summary>
public TokenizersDescriptor Icu(string name, Func<IcuTokenizerDescriptor, IIcuTokenizer> selector) =>
Assign(name, selector?.Invoke(new IcuTokenizerDescriptor()));

/// <inheritdoc cref="ICharGroupTokenizer"/>
/// <remarks>
/// The tokenizer produced by <paramref name="selector"/> is passed to <c>Assign</c> together with <paramref name="name"/>;
/// when <paramref name="selector"/> is <c>null</c>, a <c>null</c> tokenizer is passed instead.
/// </remarks>
public TokenizersDescriptor CharGroup(string name, Func<CharGroupTokenizerDescriptor, ICharGroupTokenizer> selector) =>
Assign(name, selector?.Invoke(new CharGroupTokenizerDescriptor()));
}
}
3 changes: 3 additions & 0 deletions src/Nest/Indices/Analyze/AnalyzeTokenizersDescriptor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -91,5 +91,8 @@ public ITokenizer Kuromoji(Func<KuromojiTokenizerDescriptor, IKuromojiTokenizer>
/// </summary>
public ITokenizer Icu(Func<IcuTokenizerDescriptor, IIcuTokenizer> selector) =>
(selector?.Invoke(new IcuTokenizerDescriptor()));

/// <inheritdoc cref="ICharGroupTokenizer"/>
/// <remarks>
/// Returns the tokenizer produced by <paramref name="selector"/>, or <c>null</c> when <paramref name="selector"/> is <c>null</c>.
/// </remarks>
public ITokenizer CharGroup(Func<CharGroupTokenizerDescriptor, ICharGroupTokenizer> selector) => selector?.Invoke(new CharGroupTokenizerDescriptor());
}
}
20 changes: 20 additions & 0 deletions src/Tests/Tests/Analysis/Tokenizers/TokenizerTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -203,5 +203,25 @@ public class StandardTests : TokenizerAssertionBase<StandardTests>

public override object Json => new {type = "standard"};
}

/// <summary>
/// Asserts that the <c>char_group</c> tokenizer produces the same JSON through the object initializer
/// and the fluent API.
/// </summary>
public class CharGroupTests : TokenizerAssertionBase<CharGroupTests>
{
	// A character group name, a single character and an escaped control character.
	private readonly string[] _chars = {"whitespace", "-", "\n"};

	// Named after the tokenizer under test; this was previously "uax", copied from the UAX tokenizer test.
	public override string Name => "char_group";

	/// <summary>The tokenizer built with the object initializer syntax.</summary>
	public override ITokenizer Initializer => new CharGroupTokenizer
	{
		TokenizeOnCharacters = _chars
	};

	/// <summary>The same tokenizer built with the fluent syntax.</summary>
	public override FuncTokenizer Fluent => (n, t) => t.CharGroup(n, e => e
		.TokenizeOnCharacters(_chars)
	);

	/// <summary>The JSON both syntaxes are expected to serialize to.</summary>
	public override object Json => new
	{
		tokenize_on_chars = _chars,
		type = "char_group"
	};
}
}
}

0 comments on commit d88b057

Please sign in to comment.