41 changes: 41 additions & 0 deletions src/Nest/Analysis/Tokenizers/SimplePatternSplitTokenizer.cs
@@ -0,0 +1,41 @@
// Licensed to Elasticsearch B.V under one or more agreements.
// Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
// See the LICENSE file in the project root for more information

using System.Runtime.Serialization;

namespace Nest
{
/// <summary>
/// The simple_pattern_split tokenizer uses a regular expression to split the input into terms at pattern matches.
/// </summary>
public interface ISimplePatternSplitTokenizer : ITokenizer
{
/// <summary>
/// Lucene regular expression, defaults to the empty string.
/// </summary>
[DataMember(Name = "pattern")]
string Pattern { get; set; }
}

/// <inheritdoc />
public class SimplePatternSplitTokenizer : TokenizerBase, ISimplePatternSplitTokenizer
{
public SimplePatternSplitTokenizer() => Type = "simple_pattern_split";

/// <inheritdoc />
public string Pattern { get; set; }
}

/// <inheritdoc />
public class SimplePatternSplitTokenizerDescriptor
: TokenizerDescriptorBase<SimplePatternSplitTokenizerDescriptor, ISimplePatternSplitTokenizer>, ISimplePatternSplitTokenizer
{
protected override string Type => "simple_pattern_split";

string ISimplePatternSplitTokenizer.Pattern { get; set; }

/// <inheritdoc cref="ISimplePatternSplitTokenizer.Pattern" />
public SimplePatternSplitTokenizerDescriptor Pattern(string pattern) => Assign(pattern, (a, v) => a.Pattern = v);
}
}
41 changes: 41 additions & 0 deletions src/Nest/Analysis/Tokenizers/SimplePatternTokenizer.cs
@@ -0,0 +1,41 @@
// Licensed to Elasticsearch B.V under one or more agreements.
// Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
// See the LICENSE file in the project root for more information

using System.Runtime.Serialization;

namespace Nest
{
/// <summary>
/// The simple_pattern tokenizer uses a regular expression to capture matching text as terms.
/// </summary>
public interface ISimplePatternTokenizer : ITokenizer
{
/// <summary>
/// Lucene regular expression, defaults to the empty string.
/// </summary>
[DataMember(Name = "pattern")]
string Pattern { get; set; }
}

/// <inheritdoc />
public class SimplePatternTokenizer : TokenizerBase, ISimplePatternTokenizer
{
public SimplePatternTokenizer() => Type = "simple_pattern";

/// <inheritdoc />
public string Pattern { get; set; }
}

/// <inheritdoc />
public class SimplePatternTokenizerDescriptor
: TokenizerDescriptorBase<SimplePatternTokenizerDescriptor, ISimplePatternTokenizer>, ISimplePatternTokenizer
{
protected override string Type => "simple_pattern";

string ISimplePatternTokenizer.Pattern { get; set; }

/// <inheritdoc cref="ISimplePatternTokenizer.Pattern" />
public SimplePatternTokenizerDescriptor Pattern(string pattern) => Assign(pattern, (a, v) => a.Pattern = v);
}
}
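For orientation, here is a minimal usage sketch of the two new tokenizer classes (not part of this diff): simple_pattern emits the text its pattern matches, while simple_pattern_split emits the text between matches. The client variable, index name, tokenizer names, and patterns below are illustrative assumptions.

// Sketch only (not part of this diff): registering both new tokenizers at
// index creation time with object initializer syntax. "csv_values" and
// "underscore_split" are made-up names; client is an existing ElasticClient.
var createIndexResponse = client.Indices.Create("my-index", c => c
	.InitializeUsing(new IndexState
	{
		Settings = new IndexSettings
		{
			Analysis = new Analysis
			{
				Tokenizers = new Tokenizers
				{
					// simple_pattern keeps what the pattern matches:
					// "a,b,c" with pattern [^,]+ -> terms "a", "b", "c"
					{ "csv_values", new SimplePatternTokenizer { Pattern = "[^,]+" } },
					// simple_pattern_split splits where the pattern matches:
					// "a_b_c" with pattern _ -> terms "a", "b", "c"
					{ "underscore_split", new SimplePatternSplitTokenizer { Pattern = "_" } }
				}
			}
		}
	})
);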
8 changes: 8 additions & 0 deletions src/Nest/Analysis/Tokenizers/Tokenizers.cs
@@ -132,5 +132,13 @@ public TokenizersDescriptor Nori(string name, Func<NoriTokenizerDescriptor, INor
/// >
public TokenizersDescriptor CharGroup(string name, Func<CharGroupTokenizerDescriptor, ICharGroupTokenizer> selector) =>
Assign(name, selector?.Invoke(new CharGroupTokenizerDescriptor()));

/// <inheritdoc cref="ISimplePatternTokenizer"/>>
public TokenizersDescriptor SimplePattern(string name, Func<SimplePatternTokenizerDescriptor, ISimplePatternTokenizer> selector) =>
Assign(name, selector?.Invoke(new SimplePatternTokenizerDescriptor()));

/// <inheritdoc cref="ISimplePatternSplitTokenizer"/>>
public TokenizersDescriptor SimplePatternSplit(string name, Func<SimplePatternSplitTokenizerDescriptor, ISimplePatternSplitTokenizer> selector) =>
Assign(name, selector?.Invoke(new SimplePatternSplitTokenizerDescriptor()));
}
}
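A minimal sketch of the fluent registration these two methods enable; index and tokenizer names and the patterns are again illustrative assumptions:

// Sketch only: the fluent equivalent of the object initializer example above.
var response = client.Indices.Create("my-index", c => c
	.Settings(s => s
		.Analysis(a => a
			.Tokenizers(t => t
				// each regex match becomes a term
				.SimplePattern("csv_values", sp => sp.Pattern("[^,]+"))
				// matches act as separators between terms
				.SimplePatternSplit("underscore_split", sps => sps.Pattern("_"))
			)
		)
	)
);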
9 changes: 8 additions & 1 deletion src/Nest/Indices/Analyze/AnalyzeTokenizersDescriptor.cs
@@ -104,8 +104,15 @@ public ITokenizer Nori(Func<NoriTokenizerDescriptor, INoriTokenizer> selector) =
selector.Invoke(new NoriTokenizerDescriptor());

/// <inheritdoc cref="ICharGroupTokenizer.TokenizeOnCharacters" />
/// >
public ITokenizer CharGroup(Func<CharGroupTokenizerDescriptor, ICharGroupTokenizer> selector) =>
selector?.Invoke(new CharGroupTokenizerDescriptor());

/// <inheritdoc cref="ISimplePatternTokenizer"/>>
public ITokenizer SimplePattern(Func<SimplePatternTokenizerDescriptor, ISimplePatternTokenizer> selector) =>
selector?.Invoke(new SimplePatternTokenizerDescriptor());

/// <inheritdoc cref="ISimplePatternSplitTokenizer"/>>
public ITokenizer SimplePatternSplit(Func<SimplePatternSplitTokenizerDescriptor, ISimplePatternSplitTokenizer> selector) =>
selector?.Invoke(new SimplePatternSplitTokenizerDescriptor());
}
}
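These overloads can also be exercised ad hoc against the analyze API, without creating an index — a sketch, assuming the fluent Analyze request accepts these selectors through its Tokenizer overload:

// Sketch only: analyzing text with an inline simple_pattern_split tokenizer.
var analyzeResponse = client.Indices.Analyze(a => a
	.Tokenizer(t => t.SimplePatternSplit(sps => sps.Pattern("_")))
	.Text("an_underscored_phrase")
);
// expected terms, per the Elasticsearch reference for simple_pattern_split:
// "an", "underscored", "phrase"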
155 changes: 53 additions & 102 deletions tests/Tests/Analysis/Tokenizers/TokenizerTests.cs
@@ -22,18 +22,10 @@ public class EdgeNGramTests : TokenizerAssertionBase<EdgeNGramTests>

public override ITokenizer Initializer => new EdgeNGramTokenizer
{
MaxGram = 2,
MinGram = 1,
TokenChars = new[] { TokenChar.Digit, TokenChar.Letter }
MaxGram = 2, MinGram = 1, TokenChars = new[] { TokenChar.Digit, TokenChar.Letter }
};

public override object Json => new
{
min_gram = 1,
max_gram = 2,
token_chars = new[] { "digit", "letter" },
type = "edge_ngram"
};
public override object Json => new { min_gram = 1, max_gram = 2, token_chars = new[] { "digit", "letter" }, type = "edge_ngram" };

public override string Name => "endgen";
}
@@ -50,10 +42,7 @@ public class EdgeNGramCustomTokenCharsTests : TokenizerAssertionBase<EdgeNGramCu

public override ITokenizer Initializer => new EdgeNGramTokenizer
{
MaxGram = 2,
MinGram = 1,
TokenChars = new[] { TokenChar.Custom },
CustomTokenChars = "+-_"
MaxGram = 2, MinGram = 1, TokenChars = new[] { TokenChar.Custom }, CustomTokenChars = "+-_"
};

public override object Json => new
@@ -62,7 +51,7 @@ public class EdgeNGramCustomTokenCharsTests : TokenizerAssertionBase<EdgeNGramCu
max_gram = 2,
token_chars = new[] { "custom" },
custom_token_chars = "+-_",
type = "edge_ngram"
type = "edge_ngram"
};

public override string Name => "endgen_custom";
@@ -78,18 +67,10 @@ public class NGramTests : TokenizerAssertionBase<NGramTests>

public override ITokenizer Initializer => new NGramTokenizer
{
MaxGram = 2,
MinGram = 1,
TokenChars = new[] { TokenChar.Digit, TokenChar.Letter }
MaxGram = 2, MinGram = 1, TokenChars = new[] { TokenChar.Digit, TokenChar.Letter }
};

public override object Json => new
{
min_gram = 1,
max_gram = 2,
token_chars = new[] { "digit", "letter" },
type = "ngram"
};
public override object Json => new { min_gram = 1, max_gram = 2, token_chars = new[] { "digit", "letter" }, type = "ngram" };

public override string Name => "ng";
}
@@ -106,10 +87,7 @@ public class NGramCustomTokenCharsTests : TokenizerAssertionBase<NGramCustomToke

public override ITokenizer Initializer => new NGramTokenizer
{
MaxGram = 2,
MinGram = 1,
TokenChars = new[] { TokenChar.Custom },
CustomTokenChars = "+-_"
MaxGram = 2, MinGram = 1, TokenChars = new[] { TokenChar.Custom }, CustomTokenChars = "+-_"
};

public override object Json => new
@@ -164,16 +142,9 @@ public class IcuTests : TokenizerAssertionBase<IcuTests>
.RuleFiles(RuleFiles)
);

public override ITokenizer Initializer => new IcuTokenizer
{
RuleFiles = RuleFiles,
};
public override ITokenizer Initializer => new IcuTokenizer { RuleFiles = RuleFiles };

public override object Json => new
{
rule_files = RuleFiles,
type = "icu_tokenizer"
};
public override object Json => new { rule_files = RuleFiles, type = "icu_tokenizer" };

public override string Name => "icu";
}
@@ -198,7 +169,7 @@ public class KuromojiTests : TokenizerAssertionBase<KuromojiTests>
DiscardPunctuation = true,
NBestExamples = Example,
NBestCost = 1000,
UserDictionaryRules = new [] { Inline }
UserDictionaryRules = new[] { Inline }
};

public override object Json => new
Expand All @@ -208,7 +179,7 @@ public class KuromojiTests : TokenizerAssertionBase<KuromojiTests>
nbest_cost = 1000,
nbest_examples = Example,
type = "kuromoji_tokenizer",
user_dictionary_rules = new [] { Inline }
user_dictionary_rules = new[] { Inline }
};

public override string Name => "kuro";
@@ -228,18 +199,9 @@ public class KuromojiDiscardCompoundTokenTests : TokenizerAssertionBase<Kuromoji
.DiscardCompoundToken()
);

public override ITokenizer Initializer => new KuromojiTokenizer
{
Mode = KuromojiTokenizationMode.Search,
DiscardCompoundToken = true,
};
public override ITokenizer Initializer => new KuromojiTokenizer { Mode = KuromojiTokenizationMode.Search, DiscardCompoundToken = true };

public override object Json => new
{
discard_compound_token = true,
mode = "search",
type = "kuromoji_tokenizer",
};
public override object Json => new { discard_compound_token = true, mode = "search", type = "kuromoji_tokenizer" };

public override string Name => "kuro_discard_compound_token";
}
@@ -252,11 +214,7 @@ public class UaxTests : TokenizerAssertionBase<UaxTests>

public override ITokenizer Initializer => new UaxEmailUrlTokenizer { MaxTokenLength = 12 };

public override object Json => new
{
max_token_length = 12,
type = "uax_url_email"
};
public override object Json => new { max_token_length = 12, type = "uax_url_email" };

public override string Name => "uax";
}
@@ -269,20 +227,9 @@ public class PatternTests : TokenizerAssertionBase<PatternTests>
.Pattern(@"\W+")
);

public override ITokenizer Initializer => new PatternTokenizer
{
Flags = "CASE_INSENSITIVE",
Group = 1,
Pattern = @"\W+"
};
public override ITokenizer Initializer => new PatternTokenizer { Flags = "CASE_INSENSITIVE", Group = 1, Pattern = @"\W+" };

public override object Json => new
{
pattern = @"\W+",
flags = "CASE_INSENSITIVE",
group = 1,
type = "pattern"
};
public override object Json => new { pattern = @"\W+", flags = "CASE_INSENSITIVE", group = 1, type = "pattern" };

public override string Name => "pat";
}
@@ -312,10 +259,7 @@ public class NoriTests : TokenizerAssertionBase<NoriTests>
.DecompoundMode(NoriDecompoundMode.Mixed)
);

public override ITokenizer Initializer => new NoriTokenizer
{
DecompoundMode = NoriDecompoundMode.Mixed
};
public override ITokenizer Initializer => new NoriTokenizer { DecompoundMode = NoriDecompoundMode.Mixed };

public override object Json => new { type = "nori_tokenizer", decompound_mode = "mixed" };
public override string Name => "nori";
@@ -331,16 +275,14 @@ public class NoriWithUserDictionaryTests : TokenizerAssertionBase<NoriWithUserDi

public override ITokenizer Initializer => new NoriTokenizer
{
DecompoundMode = NoriDecompoundMode.Mixed,
UserDictionaryRules = new [] { "c++", "C샤프", "세종", "세종시 세종 시" }
DecompoundMode = NoriDecompoundMode.Mixed, UserDictionaryRules = new[] { "c++", "C샤프", "세종", "세종시 세종 시" }
};

public override object Json => new
{
type = "nori_tokenizer",
decompound_mode = "mixed",
user_dictionary_rules = new [] { "c++", "C샤프", "세종", "세종시 세종 시" }
type = "nori_tokenizer", decompound_mode = "mixed", user_dictionary_rules = new[] { "c++", "C샤프", "세종", "세종시 세종 시" }
};

public override string Name => "nori_userdictionary";
}

@@ -353,16 +295,9 @@ public class CharGroupTests : TokenizerAssertionBase<CharGroupTests>
.TokenizeOnCharacters(_chars)
);

public override ITokenizer Initializer => new CharGroupTokenizer
{
TokenizeOnCharacters = _chars
};
public override ITokenizer Initializer => new CharGroupTokenizer { TokenizeOnCharacters = _chars };

public override object Json => new
{
tokenize_on_chars = _chars,
type = "char_group"
};
public override object Json => new { tokenize_on_chars = _chars, type = "char_group" };

public override string Name => "char_group";
}
@@ -377,18 +312,9 @@ public class CharGroupMaxTokenLengthTests : TokenizerAssertionBase<CharGroupMaxT
.MaxTokenLength(255)
);

public override ITokenizer Initializer => new CharGroupTokenizer
{
TokenizeOnCharacters = _chars,
MaxTokenLength = 255
};
public override ITokenizer Initializer => new CharGroupTokenizer { TokenizeOnCharacters = _chars, MaxTokenLength = 255 };

public override object Json => new
{
tokenize_on_chars = _chars,
type = "char_group",
max_token_length = 255
};
public override object Json => new { tokenize_on_chars = _chars, type = "char_group", max_token_length = 255 };

public override string Name => "char_group_max_token_length";
}
@@ -400,13 +326,38 @@ public class DiscardPunctuationTests : TokenizerAssertionBase<DiscardPunctuation
.DiscardPunctuation()
);

public override ITokenizer Initializer => new NoriTokenizer
{
DiscardPunctuation = true
};
public override ITokenizer Initializer => new NoriTokenizer { DiscardPunctuation = true };

public override object Json => new { type = "nori_tokenizer", discard_punctuation = true };
public override string Name => "nori-discard";
}

[SkipVersion("<7.7.0", "simple_pattern experimental until 7.7.0")]
public class SimplePatternTests : TokenizerAssertionBase<SimplePatternTests>
{
public override FuncTokenizer Fluent => (n, t) => t.SimplePattern(n, e => e
.Pattern(@"\W+")
);

public override ITokenizer Initializer => new SimplePatternTokenizer { Pattern = @"\W+" };

public override object Json => new { pattern = @"\W+", type = "simple_pattern" };

public override string Name => "simple-pattern";
}

[SkipVersion("<7.7.0", "simple_pattern_split experimental until 7.7.0")]
public class SimplePatternSplitTests : TokenizerAssertionBase<SimplePatternSplitTests>
{
public override FuncTokenizer Fluent => (n, t) => t.SimplePatternSplit(n, e => e
.Pattern(@"\W+")
);

public override ITokenizer Initializer => new SimplePatternSplitTokenizer { Pattern = @"\W+" };

public override object Json => new { pattern = @"\W+", type = "simple_pattern_split" };

public override string Name => "simple-pattern-split";
}
}
}