Remove nGram and edgeNGram token filter names #38911

Merged · 5 commits · Feb 15, 2019
@@ -1,9 +1,9 @@
[[analysis-edgengram-tokenfilter]]
=== Edge NGram Token Filter

-A token filter of type `edgeNGram`.
+A token filter of type `edge_ngram`.

[Member Author] These doc-changes should go back to 7.0 as well.

-The following are settings that can be set for a `edgeNGram` token
+The following are settings that can be set for a `edge_ngram` token
filter type:

[cols="<,<",options="header",]
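
As a quick illustration of the renamed filter (not part of this diff; the test name and token expectations below are mine, assuming Lucene's usual edge n-gram behavior), the snake_case name can be exercised with the same REST-test syntax used later in this PR:

# Illustrative only: analyze with an inline edge_ngram filter definition.
# With min_gram=1 and max_gram=3 on the single keyword token "Quick",
# the expected grams are Q, Qu, Qui.
"edge_ngram_by_new_name":
  - do:
      indices.analyze:
        body:
          text: Quick
          tokenizer: keyword
          filter:
            - type: edge_ngram
              min_gram: 1
              max_gram: 3
  - length: { tokens: 3 }
  - match: { tokens.0.token: Q }
  - match: { tokens.1.token: Qu }
  - match: { tokens.2.token: Qui }
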
@@ -1,9 +1,9 @@
[[analysis-ngram-tokenfilter]]
=== NGram Token Filter

-A token filter of type `nGram`.
+A token filter of type `ngram`.

-The following are settings that can be set for a `nGram` token filter
+The following are settings that can be set for a `ngram` token filter
type:

[cols="<,<",options="header",]
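
The same rename applies when the filter is declared in index settings. A minimal sketch (the filter and analyzer names are invented for the example, and the gram range stays within the default `index.max_ngram_diff` of 1):

# Illustrative settings fragment using the remaining "ngram" filter name.
analysis:
  filter:
    my_ngram:              # invented name for this sketch
      type: ngram          # was also reachable as the deprecated "nGram"
      min_gram: 1
      max_gram: 2
  analyzer:
    my_ngram_analyzer:     # invented name for this sketch
      tokenizer: standard
      filter: [lowercase, my_ngram]
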
@@ -414,14 +414,6 @@ public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
        filters.add(PreConfiguredTokenFilter.singleton("dutch_stem", false, input -> new SnowballFilter(input, new DutchStemmer())));
        filters.add(PreConfiguredTokenFilter.singleton("edge_ngram", false, false, input ->
            new EdgeNGramTokenFilter(input, 1)));
-       filters.add(PreConfiguredTokenFilter.singletonWithVersion("edgeNGram", false, false, (reader, version) -> {
-           if (version.onOrAfter(org.elasticsearch.Version.V_6_4_0)) {
-               deprecationLogger.deprecatedAndMaybeLog("edgeNGram_deprecation",
-                   "The [edgeNGram] token filter name is deprecated and will be removed in a future version. "
-                       + "Please change the filter name to [edge_ngram] instead.");
-           }
-           return new EdgeNGramTokenFilter(reader, 1);
-       }));
        filters.add(PreConfiguredTokenFilter.singleton("elision", true,
            input -> new ElisionFilter(input, FrenchAnalyzer.DEFAULT_ARTICLES)));
        filters.add(PreConfiguredTokenFilter.singleton("french_stem", false, input -> new SnowballFilter(input, new FrenchStemmer())));
@@ -438,14 +430,6 @@ public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
            LimitTokenCountFilterFactory.DEFAULT_MAX_TOKEN_COUNT,
            LimitTokenCountFilterFactory.DEFAULT_CONSUME_ALL_TOKENS)));
        filters.add(PreConfiguredTokenFilter.singleton("ngram", false, false, reader -> new NGramTokenFilter(reader, 1, 2, false)));
-       filters.add(PreConfiguredTokenFilter.singletonWithVersion("nGram", false, false, (reader, version) -> {
-           if (version.onOrAfter(org.elasticsearch.Version.V_6_4_0)) {
-               deprecationLogger.deprecatedAndMaybeLog("nGram_deprecation",
-                   "The [nGram] token filter name is deprecated and will be removed in a future version. "
-                       + "Please change the filter name to [ngram] instead.");
-           }
-           return new NGramTokenFilter(reader, 1, 2, false);
-       }));
        filters.add(PreConfiguredTokenFilter.singleton("persian_normalization", true, PersianNormalizationFilter::new));
        filters.add(PreConfiguredTokenFilter.singleton("porter_stem", false, PorterStemFilter::new));
        filters.add(PreConfiguredTokenFilter.singleton("reverse", false, ReverseStringFilter::new));
@@ -185,7 +185,6 @@ protected Map<String, Class<?>> getPreConfiguredTokenFilters() {
        filters.put("delimited_payload", org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory.class);
        filters.put("dutch_stem", SnowballPorterFilterFactory.class);
        filters.put("edge_ngram", null);
-       filters.put("edgeNGram", null);
        filters.put("elision", null);
        filters.put("french_stem", SnowballPorterFilterFactory.class);
        filters.put("german_stem", null);
@@ -197,7 +196,6 @@ protected Map<String, Class<?>> getPreConfiguredTokenFilters() {
        filters.put("length", null);
        filters.put("limit", LimitTokenCountFilterFactory.class);
        filters.put("ngram", null);
-       filters.put("nGram", null);
        filters.put("persian_normalization", null);
        filters.put("porter_stem", null);
        filters.put("reverse", ReverseStringFilterFactory.class);
@@ -20,107 +20,21 @@
 package org.elasticsearch.analysis.common;

 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.MockTokenizer;
-import org.apache.lucene.analysis.Tokenizer;
 import org.elasticsearch.Version;
 import org.elasticsearch.cluster.metadata.IndexMetaData;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.index.analysis.IndexAnalyzers;
 import org.elasticsearch.index.analysis.NamedAnalyzer;
-import org.elasticsearch.index.analysis.TokenFilterFactory;
 import org.elasticsearch.test.ESTestCase;
 import org.elasticsearch.test.IndexSettingsModule;
 import org.elasticsearch.test.VersionUtils;

 import java.io.IOException;
-import java.io.StringReader;
 import java.util.Map;

 public class CommonAnalysisPluginTests extends ESTestCase {

-    /**
-     * Check that the deprecated name "nGram" issues a deprecation warning for indices created since 6.3.0
-     */
-    public void testNGramDeprecationWarning() throws IOException {
-        Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
-            .put(IndexMetaData.SETTING_VERSION_CREATED, VersionUtils.randomVersionBetween(random(), Version.V_6_4_0, Version.CURRENT))
-            .build();
-
-        IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
-        try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) {
-            Map<String, TokenFilterFactory> tokenFilters = createTestAnalysis(idxSettings, settings, commonAnalysisPlugin).tokenFilter;
-            TokenFilterFactory tokenFilterFactory = tokenFilters.get("nGram");
-            Tokenizer tokenizer = new MockTokenizer();
-            tokenizer.setReader(new StringReader("foo bar"));
-            assertNotNull(tokenFilterFactory.create(tokenizer));
-            assertWarnings(
-                "The [nGram] token filter name is deprecated and will be removed in a future version. "
-                    + "Please change the filter name to [ngram] instead.");
-        }
-    }
-
-    /**
-     * Check that the deprecated name "nGram" does NOT issues a deprecation warning for indices created before 6.4.0
-     */
-    public void testNGramNoDeprecationWarningPre6_4() throws IOException {
-        Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
-            .put(IndexMetaData.SETTING_VERSION_CREATED,
-                VersionUtils.randomVersionBetween(random(), Version.V_6_0_0, Version.V_6_3_0))
-            .build();
-
-        IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
-        try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) {
-            Map<String, TokenFilterFactory> tokenFilters = createTestAnalysis(idxSettings, settings, commonAnalysisPlugin).tokenFilter;
-            TokenFilterFactory tokenFilterFactory = tokenFilters.get("nGram");
-            Tokenizer tokenizer = new MockTokenizer();
-            tokenizer.setReader(new StringReader("foo bar"));
-            assertNotNull(tokenFilterFactory.create(tokenizer));
-        }
-    }
-
-    /**
-     * Check that the deprecated name "edgeNGram" issues a deprecation warning for indices created since 6.3.0
-     */
-    public void testEdgeNGramDeprecationWarning() throws IOException {
-        Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
-            .put(IndexMetaData.SETTING_VERSION_CREATED, VersionUtils.randomVersionBetween(random(), Version.V_6_4_0, Version.CURRENT))
-            .build();
-
-        IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
-        try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) {
-            Map<String, TokenFilterFactory> tokenFilters = createTestAnalysis(idxSettings, settings, commonAnalysisPlugin).tokenFilter;
-            TokenFilterFactory tokenFilterFactory = tokenFilters.get("edgeNGram");
-            Tokenizer tokenizer = new MockTokenizer();
-            tokenizer.setReader(new StringReader("foo bar"));
-            assertNotNull(tokenFilterFactory.create(tokenizer));
-            assertWarnings(
-                "The [edgeNGram] token filter name is deprecated and will be removed in a future version. "
-                    + "Please change the filter name to [edge_ngram] instead.");
-        }
-    }
-
-    /**
-     * Check that the deprecated name "edgeNGram" does NOT issues a deprecation warning for indices created before 6.4.0
-     */
-    public void testEdgeNGramNoDeprecationWarningPre6_4() throws IOException {
-        Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir())
-            .put(IndexMetaData.SETTING_VERSION_CREATED,
-                VersionUtils.randomVersionBetween(random(), Version.V_6_0_0, Version.V_6_3_0))
-            .build();
-
-        IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
-        try (CommonAnalysisPlugin commonAnalysisPlugin = new CommonAnalysisPlugin()) {
-            Map<String, TokenFilterFactory> tokenFilters = createTestAnalysis(idxSettings, settings, commonAnalysisPlugin).tokenFilter;
-            TokenFilterFactory tokenFilterFactory = tokenFilters.get("edgeNGram");
-            Tokenizer tokenizer = new MockTokenizer();
-            tokenizer.setReader(new StringReader("foo bar"));
-            assertNotNull(tokenFilterFactory.create(tokenizer));
-        }
-    }
-

     /**
      * Check that the deprecated analyzer name "standard_html_strip" throws exception for indices created since 7.0.0
      */
@@ -81,7 +81,7 @@ public void testNgramHighlightingWithBrokenPositions() throws IOException {
            .put("analysis.tokenizer.autocomplete.max_gram", 20)
            .put("analysis.tokenizer.autocomplete.min_gram", 1)
            .put("analysis.tokenizer.autocomplete.token_chars", "letter,digit")
-           .put("analysis.tokenizer.autocomplete.type", "nGram")
+           .put("analysis.tokenizer.autocomplete.type", "ngram")

[Member Author] This is about the tokenizer, not the filter, but I think we also shouldn't use the camel case version of that one anymore.

            .put("analysis.filter.wordDelimiter.type", "word_delimiter")
            .putList("analysis.filter.wordDelimiter.type_table",
                "& => ALPHANUM", "| => ALPHANUM", "! => ALPHANUM",
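
To make the comment's distinction concrete, here is a hypothetical settings sketch (names invented; gram sizes kept within the default `index.max_ngram_diff` of 1). This PR removes only the token filter aliases `nGram`/`edgeNGram`; `ngram` also names a tokenizer, whose camel-case spelling is cleaned up in these tests purely as a matter of style:

# Illustrative only: "ngram" names both a tokenizer and a token filter.
analysis:
  tokenizer:
    autocomplete:          # invented name
      type: ngram          # the tokenizer: emits grams straight from the text
      min_gram: 1
      max_gram: 2
      token_chars: [letter, digit]
  filter:
    my_grams:              # invented name
      type: ngram          # the token filter: re-grams incoming tokens
      min_gram: 1
      max_gram: 2
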
@@ -23,38 +23,6 @@
  - match: { detail.tokenizer.tokens.0.token: Foo Bar! }

---
-"nGram":
-  - do:
-      indices.analyze:
-        body:
-          text: good
-          explain: true
-          tokenizer:
-            type: nGram
-            min_gram: 2
-            max_gram: 2
-  - length: { detail.tokenizer.tokens: 3 }
-  - match: { detail.tokenizer.name: _anonymous_tokenizer }
-  - match: { detail.tokenizer.tokens.0.token: go }
-  - match: { detail.tokenizer.tokens.1.token: oo }
-  - match: { detail.tokenizer.tokens.2.token: od }
-
----
-"nGram_exception":
-  - skip:
-      version: " - 6.99.99"
-      reason: only starting from version 7.x this throws an error
-  - do:
-      catch: /The difference between max_gram and min_gram in NGram Tokenizer must be less than or equal to[:] \[1\] but was \[2\]\. This limit can be set by changing the \[index.max_ngram_diff\] index level setting\./
-      indices.analyze:
-        body:
-          text: good
-          explain: true
-          tokenizer:
-            type: nGram
-            min_gram: 2
-            max_gram: 4
----
"simple_pattern":
  - do:
      indices.analyze:
@@ -133,7 +101,7 @@
          text: "foobar"
          explain: true
          tokenizer:
-           type: nGram
+           type: ngram

[Member Author] Same here: this is about the tokenizer, but we shouldn't use the camel case name here regardless.

            min_gram: 3
            max_gram: 3
  - length: { detail.tokenizer.tokens: 4 }
@@ -162,15 +130,31 @@
        body:
          text: "foo"
          explain: true
-         tokenizer: nGram
+         tokenizer: ngram
  - length: { detail.tokenizer.tokens: 5 }
-  - match: { detail.tokenizer.name: nGram }
+  - match: { detail.tokenizer.name: ngram }
  - match: { detail.tokenizer.tokens.0.token: f }
  - match: { detail.tokenizer.tokens.1.token: fo }
  - match: { detail.tokenizer.tokens.2.token: o }
  - match: { detail.tokenizer.tokens.3.token: oo }
  - match: { detail.tokenizer.tokens.4.token: o }

+---
+"ngram_exception":
+  - skip:
+      version: " - 6.99.99"
+      reason: only starting from version 7.x this throws an error
+  - do:
+      catch: /The difference between max_gram and min_gram in NGram Tokenizer must be less than or equal to[:] \[1\] but was \[2\]\. This limit can be set by changing the \[index.max_ngram_diff\] index level setting\./
+      indices.analyze:
+        body:
+          text: good
+          explain: true
+          tokenizer:
+            type: ngram
+            min_gram: 2
+            max_gram: 4
+
---
"edge_ngram":
  - do:
@@ -194,7 +178,7 @@
          text: "foo"
          explain: true
          tokenizer:
-           type: edgeNGram
+           type: edge_ngram
            min_gram: 1
            max_gram: 3
  - length: { detail.tokenizer.tokens: 3 }
@@ -219,9 +203,9 @@
        body:
          text: "foo"
          explain: true
-         tokenizer: edgeNGram
+         tokenizer: edge_ngram
  - length: { detail.tokenizer.tokens: 2 }
-  - match: { detail.tokenizer.name: edgeNGram }
+  - match: { detail.tokenizer.name: edge_ngram }
  - match: { detail.tokenizer.tokens.0.token: f }
  - match: { detail.tokenizer.tokens.1.token: fo }

@@ -76,7 +76,7 @@
    analysis:
      tokenizer:
        trigram:
-          type: nGram
+          type: ngram
          min_gram: 3
          max_gram: 3
      filter: