Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ensure TokenFilters only produce single tokens when parsing synonyms #34331

Merged
merged 21 commits into from
Nov 29, 2018
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
a10b310
Ensure TokenFilters only produce single tokens when used for parsing synonyms
romseygeek Oct 5, 2018
b5ca5fe
Merge remote-tracking branch 'origin/master' into synonymfilters
romseygeek Oct 15, 2018
075f2e3
Allow a specific set of filters to be used to parse synonyms; throw e…
romseygeek Oct 17, 2018
3bf6a1f
checkstyle
romseygeek Oct 17, 2018
dd90dbd
Merge remote-tracking branch 'origin/master' into synonymfilters
romseygeek Oct 24, 2018
7a444ce
Depend on leniency to determine whether or not to include filter for …
romseygeek Oct 24, 2018
9225478
Use leniency to choose whether or not to apply filters
romseygeek Oct 24, 2018
61391a4
checkstyle
romseygeek Oct 24, 2018
087419c
feedback
romseygeek Oct 29, 2018
83b63d5
feedback
romseygeek Oct 29, 2018
906f747
Merge remote-tracking branch 'origin/master' into synonymfilters
romseygeek Nov 6, 2018
b135bd7
Merge remote-tracking branch 'origin/master' into synonymfilters
romseygeek Nov 16, 2018
3f99aca
Remove lenient option; add phonetic filter suppression
romseygeek Nov 16, 2018
bf0273a
Merge remote-tracking branch 'origin/master' into synonymfilters
romseygeek Nov 16, 2018
ac17329
Allow multiple synonym filters - WIP needs tests
romseygeek Nov 16, 2018
b482d7e
Merge remote-tracking branch 'origin/master' into synonymfilters
romseygeek Nov 19, 2018
97e96d9
checkstyle
romseygeek Nov 19, 2018
6e0ecb8
Allow chained synonym filters
romseygeek Nov 19, 2018
bf78b6a
Merge remote-tracking branch 'origin/master' into synonymfilters
romseygeek Nov 20, 2018
20acfcd
Multiplexer only returns IDENTITY if preserve_original=true
romseygeek Nov 28, 2018
4cb6a30
Merge remote-tracking branch 'origin/master' into synonymfilters
romseygeek Nov 28, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ public TokenStream create(TokenStream tokenStream) {
}

@Override
public Object getMultiTermComponent() {
public TokenFilterFactory getSynonymFilter() {
if (preserveOriginal == false) {
return this;
} else {
Expand All @@ -68,4 +68,9 @@ public TokenStream create(TokenStream tokenStream) {
};
}
}

// Multi-term queries need the same "single token out" guarantee as synonym
// parsing, so reuse the synonym-safe variant of this filter.
@Override
public Object getMultiTermComponent() {
return getSynonymFilter();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.Analysis;
import org.elasticsearch.index.analysis.TokenFilterFactory;

/**
* Contains the common configuration settings between subclasses of this class.
Expand All @@ -50,4 +51,9 @@ protected AbstractCompoundWordTokenFilterFactory(IndexSettings indexSettings, En
throw new IllegalArgumentException("word_list must be provided for [" + name + "], either as a path to a file, or directly");
}
}

// Returns the pass-through filter so that decompounding is skipped when this
// factory's chain is used to parse a synonym file (decompounded output would
// produce multiple tokens per input token).
@Override
public TokenFilterFactory getSynonymFilter() {
return IDENTITY_FILTER; // don't decompound synonym file
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;

import java.util.Arrays;
import java.util.HashSet;
Expand Down Expand Up @@ -89,4 +90,11 @@ public TokenStream create(TokenStream tokenStream) {
return filter;
}

@Override
public TokenFilterFactory getSynonymFilter() {
if (outputUnigrams) {
return IDENTITY_FILTER; // don't combine for synonyms
romseygeek marked this conversation as resolved.
Show resolved Hide resolved
}
return this;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -427,7 +427,7 @@ public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
filters.add(PreConfiguredTokenFilter.singleton("german_stem", false, GermanStemFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("hindi_normalization", true, HindiNormalizationFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("indic_normalization", true, IndicNormalizationFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("keyword_repeat", false, KeywordRepeatFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("keyword_repeat", false, false, KeywordRepeatFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("kstem", false, KStemFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("length", false, input ->
new LengthFilter(input, 0, Integer.MAX_VALUE))); // TODO this one seems useless
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.Analysis;
import org.elasticsearch.index.analysis.TokenFilterFactory;

public class CommonGramsTokenFilterFactory extends AbstractTokenFilterFactory {

Expand Down Expand Up @@ -58,5 +59,10 @@ public TokenStream create(TokenStream tokenStream) {
return filter;
}
}

@Override
public TokenFilterFactory getSynonymFilter() {
return IDENTITY_FILTER;
romseygeek marked this conversation as resolved.
Show resolved Hide resolved
}
}

Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;


public class EdgeNGramTokenFilterFactory extends AbstractTokenFilterFactory {
Expand Down Expand Up @@ -77,4 +78,9 @@ public TokenStream create(TokenStream tokenStream) {
public boolean breaksFastVectorHighlighter() {
return true;
}

// Edge ngrams emit multiple tokens per input token, so bypass this filter
// entirely when building the analyzer used to parse synonym rules.
@Override
public TokenFilterFactory getSynonymFilter() {
return IDENTITY_FILTER; // don't apply to synonyms
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;

import static org.elasticsearch.analysis.common.FingerprintAnalyzerProvider.DEFAULT_MAX_OUTPUT_SIZE;
import static org.elasticsearch.analysis.common.FingerprintAnalyzerProvider.MAX_OUTPUT_SIZE;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.Version;

import org.elasticsearch.index.analysis.TokenFilterFactory;


public class NGramTokenFilterFactory extends AbstractTokenFilterFactory {
Expand Down Expand Up @@ -60,4 +60,9 @@ public TokenStream create(TokenStream tokenStream) {
// TODO: Expose preserveOriginal
return new NGramTokenFilter(tokenStream, minGram, maxGram, false);
}

// Ngram filters emit multiple tokens per input token; substitute the identity
// filter so synonym-rule parsing sees the original single tokens.
@Override
public TokenFilterFactory getSynonymFilter() {
return IDENTITY_FILTER;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -85,10 +85,15 @@ public String name() {
public TokenStream create(TokenStream tokenStream) {
return synonyms.fst == null ? tokenStream : new SynonymFilter(tokenStream, synonyms, false);
}

@Override
public TokenFilterFactory getSynonymFilter() {
return IDENTITY_FILTER; // Don't apply synonyms to a synonym file, this will just confuse things
romseygeek marked this conversation as resolved.
Show resolved Hide resolved
}
};
}

Analyzer buildSynonymAnalyzer(TokenizerFactory tokenizer, List<CharFilterFactory> charFilters,
static Analyzer buildSynonymAnalyzer(TokenizerFactory tokenizer, List<CharFilterFactory> charFilters,
List<TokenFilterFactory> tokenFilters) {
return new CustomAnalyzer("synonyms", tokenizer, charFilters.toArray(new CharFilterFactory[0]),
tokenFilters.stream()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.Analysis;
import org.elasticsearch.index.analysis.TokenFilterFactory;

import java.util.List;
import java.util.Set;
Expand Down Expand Up @@ -95,6 +96,11 @@ public TokenStream create(TokenStream tokenStream) {
return new WordDelimiterGraphFilter(tokenStream, charTypeTable, flags, protoWords);
}

// Word-delimiter splitting can turn one token into several, which would break
// synonym-rule parsing; skip this filter for synonyms via the identity filter.
@Override
public TokenFilterFactory getSynonymFilter() {
return IDENTITY_FILTER;
}

private int getFlag(int flag, Settings settings, String key, boolean defaultValue) {
if (settings.getAsBoolean(key, defaultValue)) {
return flag;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.Analysis;
import org.elasticsearch.index.analysis.TokenFilterFactory;

import java.util.Collection;
import java.util.List;
Expand Down Expand Up @@ -103,6 +104,11 @@ public TokenStream create(TokenStream tokenStream) {
protoWords);
}

// Word-delimiter splitting can turn one token into several, which would break
// synonym-rule parsing; skip this filter for synonyms via the identity filter.
@Override
public TokenFilterFactory getSynonymFilter() {
return IDENTITY_FILTER;
}

public int getFlag(int flag, Settings settings, String key, boolean defaultValue) {
if (settings.getAsBoolean(key, defaultValue)) {
return flag;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,21 @@

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.synonym.SynonymGraphFilterFactory;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.IndexAnalyzers;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.index.analysis.TokenizerFactory;
import org.elasticsearch.indices.analysis.AnalysisModule;
import org.elasticsearch.test.ESTestCase;
import org.elasticsearch.test.IndexSettingsModule;
import org.hamcrest.MatcherAssert;
Expand All @@ -37,6 +44,9 @@
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.instanceOf;
Expand Down Expand Up @@ -139,6 +149,72 @@ public void testSynonymsWithMultiplexer() throws IOException {
new int[]{ 1, 1, 0, 0, 1, 1 });
}

// Verifies that asciifolding IS applied when parsing synonym rules: the input
// "høj" is folded to "hoj", which then matches the "hoj, height" synonym rule
// and emits both terms at the same position (increments 1, 0).
public void testAsciiFoldingFilterForSynonyms() throws IOException {
Settings settings = Settings.builder()
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
.put("path.home", createTempDir().toString())
.put("index.analysis.filter.synonyms.type", "synonym")
.putList("index.analysis.filter.synonyms.synonyms", "hoj, height")
.put("index.analysis.analyzer.synonymAnalyzer.tokenizer", "standard")
.putList("index.analysis.analyzer.synonymAnalyzer.filter", "lowercase", "asciifolding", "synonyms")
.build();
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers;

BaseTokenStreamTestCase.assertAnalyzesTo(indexAnalyzers.get("synonymAnalyzer"), "høj",
new String[]{ "hoj", "height" },
new int[]{ 1, 0 });
}

// Verifies keyword_repeat + stemmer interaction with synonyms: "programmers"
// yields the preserved original plus the stemmed form "programm" and the
// synonym "develop" (from the stemmed "programmer, developer" rule), all three
// stacked at the same position (increments 1, 0, 0).
public void testKeywordRepeatAndSynonyms() throws IOException {
Settings settings = Settings.builder()
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
.put("path.home", createTempDir().toString())
.put("index.analysis.filter.synonyms.type", "synonym")
.putList("index.analysis.filter.synonyms.synonyms", "programmer, developer")
.put("index.analysis.filter.my_english.type", "stemmer")
.put("index.analysis.filter.my_english.language", "porter2")
.put("index.analysis.analyzer.synonymAnalyzer.tokenizer", "standard")
.putList("index.analysis.analyzer.synonymAnalyzer.filter", "lowercase", "keyword_repeat", "my_english", "synonyms")
.build();
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);
indexAnalyzers = createTestAnalysis(idxSettings, settings, new CommonAnalysisPlugin()).indexAnalyzers;

BaseTokenStreamTestCase.assertAnalyzesTo(indexAnalyzers.get("synonymAnalyzer"), "programmers",
new String[]{ "programmers", "programm", "develop" },
new int[]{ 1, 0, 0 });
}

// Verifies that multi-token-producing filters are bypassed when building the
// synonym-parsing analyzer: for each factory listed below, the analyzer built
// by buildSynonymAnalyzer should wrap NO filter at all, so the token stream is
// the bare KeywordTokenizer itself rather than a TokenFilter around it.
public void testTokenFiltersBypassSynonymAnalysis() throws IOException {

// Minimal settings so each factory can be constructed; the word lists /
// hyphenation path are only needed to satisfy constructor validation.
Settings settings = Settings.builder()
.put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
.put("path.home", createTempDir().toString())
.putList("common_words", "a", "b")
.putList("word_list", "a")
.put("hyphenation_patterns_path", "foo")
.build();
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", settings);

// Factories expected to return IDENTITY_FILTER from getSynonymFilter().
String[] bypassingFactories = new String[]{
"common_grams", "dictionary_decompounder",
"edge_ngram", "ngram", "word_delimiter", "word_delimiter_graph"
};

CommonAnalysisPlugin plugin = new CommonAnalysisPlugin();
for (String factory : bypassingFactories) {
TokenFilterFactory tff = plugin.getTokenFilters().get(factory).get(idxSettings, null, factory, settings);
TokenizerFactory tok = new KeywordTokenizerFactory(idxSettings, null, "keyword", settings);
Analyzer analyzer = SynonymTokenFilterFactory.buildSynonymAnalyzer(tok,
Collections.emptyList(), Collections.singletonList(tff));

// If the filter had been applied, ts would be a TokenFilter wrapper,
// not the raw KeywordTokenizer.
try (TokenStream ts = analyzer.tokenStream("field", "text")) {
assertThat(ts, instanceOf(KeywordTokenizer.class));
}
}

}

private void match(String analyzerName, String source, String target) throws IOException {
Analyzer analyzer = indexAnalyzers.get(analyzerName).analyzer();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,11 @@ public PhoneticTokenFilterFactory(IndexSettings indexSettings, Environment envir
}
}

// Phonetic encodings should not be applied to synonym rules; return the
// pass-through filter so synonym parsing sees the original tokens.
@Override
public TokenFilterFactory getSynonymFilter() {
return IDENTITY_FILTER;
}

@Override
public TokenStream create(TokenStream tokenStream) {
if (encoder == null) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,16 +37,26 @@ public final class PreConfiguredTokenFilter extends PreConfiguredAnalysisCompone
*/
public static PreConfiguredTokenFilter singleton(String name, boolean useFilterForMultitermQueries,
Function<TokenStream, TokenStream> create) {
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, CachingStrategy.ONE,
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, false, CachingStrategy.ONE,
(tokenStream, version) -> create.apply(tokenStream));
}

/**
 * Create a pre-configured token filter that may not vary at all.
 *
 * @param useFilterForParsingSynonyms whether this filter should also be applied
 *        when parsing synonym rules; if {@code false}, {@code getSynonymFilter()}
 *        substitutes the pass-through identity filter instead
 */
public static PreConfiguredTokenFilter singleton(String name, boolean useFilterForMultitermQueries,
boolean useFilterForParsingSynonyms,
Function<TokenStream, TokenStream> create) {
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, useFilterForParsingSynonyms, CachingStrategy.ONE,
(tokenStream, version) -> create.apply(tokenStream));
}

/**
* Create a pre-configured token filter that may not vary at all.
*/
public static PreConfiguredTokenFilter singletonWithVersion(String name, boolean useFilterForMultitermQueries,
BiFunction<TokenStream, Version, TokenStream> create) {
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, CachingStrategy.ONE,
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, false, CachingStrategy.ONE,
(tokenStream, version) -> create.apply(tokenStream, version));
}

Expand All @@ -55,7 +65,7 @@ public static PreConfiguredTokenFilter singletonWithVersion(String name, boolean
*/
public static PreConfiguredTokenFilter luceneVersion(String name, boolean useFilterForMultitermQueries,
BiFunction<TokenStream, org.apache.lucene.util.Version, TokenStream> create) {
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, CachingStrategy.LUCENE,
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, false, CachingStrategy.LUCENE,
(tokenStream, version) -> create.apply(tokenStream, version.luceneVersion));
}

Expand All @@ -64,16 +74,18 @@ public static PreConfiguredTokenFilter luceneVersion(String name, boolean useFil
*/
public static PreConfiguredTokenFilter elasticsearchVersion(String name, boolean useFilterForMultitermQueries,
BiFunction<TokenStream, org.elasticsearch.Version, TokenStream> create) {
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, CachingStrategy.ELASTICSEARCH, create);
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, false, CachingStrategy.ELASTICSEARCH, create);
}

private final boolean useFilterForMultitermQueries;
private final boolean useFilterForParsingSynonyms;
private final BiFunction<TokenStream, Version, TokenStream> create;

private PreConfiguredTokenFilter(String name, boolean useFilterForMultitermQueries,
private PreConfiguredTokenFilter(String name, boolean useFilterForMultitermQueries, boolean useFilterForParsingSynonyms,
PreBuiltCacheFactory.CachingStrategy cache, BiFunction<TokenStream, Version, TokenStream> create) {
super(name, cache);
this.useFilterForMultitermQueries = useFilterForMultitermQueries;
this.useFilterForParsingSynonyms = useFilterForParsingSynonyms;
this.create = create;
}

Expand Down Expand Up @@ -104,6 +116,14 @@ public TokenStream create(TokenStream tokenStream) {
public Object getMultiTermComponent() {
return this;
}

// When this pre-configured filter is not flagged for use in synonym parsing,
// substitute the pass-through identity filter so the synonym analyzer skips it.
@Override
public TokenFilterFactory getSynonymFilter() {
if (useFilterForParsingSynonyms) {
return this;
}
return IDENTITY_FILTER;
}
};
}
return new TokenFilterFactory() {
Expand All @@ -116,6 +136,14 @@ public String name() {
public TokenStream create(TokenStream tokenStream) {
return create.apply(tokenStream, version);
}

// When this pre-configured filter is not flagged for use in synonym parsing,
// substitute the pass-through identity filter so the synonym analyzer skips it.
@Override
public TokenFilterFactory getSynonymFilter() {
if (useFilterForParsingSynonyms) {
return this;
}
return IDENTITY_FILTER;
}
};
}
}