Ensure TokenFilters only produce single tokens when parsing synonyms #34331

Merged · 21 commits · Nov 29, 2018
Changes from 8 commits
a10b310
Ensure TokenFilters only produce single tokens when used for parsing …
romseygeek Oct 5, 2018
b5ca5fe
Merge remote-tracking branch 'origin/master' into synonymfilters
romseygeek Oct 15, 2018
075f2e3
Allow a specific set of filters to be used to parse synonyms; throw e…
romseygeek Oct 17, 2018
3bf6a1f
checkstyle
romseygeek Oct 17, 2018
dd90dbd
Merge remote-tracking branch 'origin/master' into synonymfilters
romseygeek Oct 24, 2018
7a444ce
Depend on leniency to determine whether or not to include filter for …
romseygeek Oct 24, 2018
9225478
Use to choose whether or not to apply filters
romseygeek Oct 24, 2018
61391a4
checkstyle
romseygeek Oct 24, 2018
087419c
feedback
romseygeek Oct 29, 2018
83b63d5
feedback
romseygeek Oct 29, 2018
906f747
Merge remote-tracking branch 'origin/master' into synonymfilters
romseygeek Nov 6, 2018
b135bd7
Merge remote-tracking branch 'origin/master' into synonymfilters
romseygeek Nov 16, 2018
3f99aca
Remove lenient option; add phonetic filter suppression
romseygeek Nov 16, 2018
bf0273a
Merge remote-tracking branch 'origin/master' into synonymfilters
romseygeek Nov 16, 2018
ac17329
Allow multiple synonym filters - WIP needs tests
romseygeek Nov 16, 2018
b482d7e
Merge remote-tracking branch 'origin/master' into synonymfilters
romseygeek Nov 19, 2018
97e96d9
checkstyle
romseygeek Nov 19, 2018
6e0ecb8
Allow chained synonym filters
romseygeek Nov 19, 2018
bf78b6a
Merge remote-tracking branch 'origin/master' into synonymfilters
romseygeek Nov 20, 2018
20acfcd
Multiplexer only returns IDENTITY if preserve_original=true
romseygeek Nov 28, 2018
4cb6a30
Merge remote-tracking branch 'origin/master' into synonymfilters
romseygeek Nov 28, 2018
@@ -175,3 +175,16 @@ PUT /test_index

Using `synonyms_path` to define WordNet synonyms in a file is supported
as well.

=== Parsing synonym files

Elasticsearch will use the token filters preceding the synonym filter
in a tokenizer chain to parse the entries in a synonym file. So, for example, if a
synonym filter is placed after a stemmer, then the stemmer will also be applied
to the synonym entries. Because entries in the synonym map cannot have stacked
positions, some token filters may cause issues here. Token filters that produce
multiple versions of a token may choose which version to emit when parsing
synonyms; for example, `asciifolding` will only produce the folded version of the
token. Others, such as `multiplexer`, `word_delimiter_graph` or `ngram`, will throw
an error unless `lenient` has been set to `true`, in which case a best-effort
attempt to apply the filter will be made.
docs/reference/analysis/tokenfilters/synonym-tokenfilter.asciidoc (14 additions, 0 deletions)
@@ -163,3 +163,17 @@ PUT /test_index

Using `synonyms_path` to define WordNet synonyms in a file is supported
as well.


=== Parsing synonym files

Elasticsearch will use the token filters preceding the synonym filter
in a tokenizer chain to parse the entries in a synonym file. So, for example, if a
synonym filter is placed after a stemmer, then the stemmer will also be applied
to the synonym entries. Because entries in the synonym map cannot have stacked
positions, some token filters may cause issues here. Token filters that produce
multiple versions of a token may choose which version to emit when parsing
synonyms; for example, `asciifolding` will only produce the folded version of the
token. Others, such as `multiplexer`, `word_delimiter_graph` or `ngram`, will throw
an error unless `lenient` has been set to `true`, in which case a best-effort
attempt to apply the filter will be made.
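The per-filter decision the docs describe can be sketched with a simplified contract. The types below are illustrative stand-ins, not the actual Elasticsearch or Lucene classes:

```java
// Simplified sketch of the behaviour described above; these types are
// hypothetical stand-ins, not the real TokenFilterFactory API.
interface SynonymAwareFilter {
    /** The filter to apply while parsing synonym rules. */
    SynonymAwareFilter getSynonymFilter(boolean lenient);
}

final class IdentityFilter implements SynonymAwareFilter {
    static final IdentityFilter INSTANCE = new IdentityFilter();

    @Override
    public SynonymAwareFilter getSynonymFilter(boolean lenient) {
        return this;
    }
}

// A filter that stacks multiple tokens at one position (an ngram-style
// filter, say) cannot be used to parse synonym rules: under lenient it
// steps aside, otherwise it rejects the chain.
final class MultiTokenFilter implements SynonymAwareFilter {
    @Override
    public SynonymAwareFilter getSynonymFilter(boolean lenient) {
        if (lenient) {
            return IdentityFilter.INSTANCE; // best effort: skip this filter
        }
        throw new IllegalArgumentException(
            "Token filter cannot be used to parse synonyms unless [lenient] is set to true");
    }
}
```

In this sketch, single-token filters return themselves, while multi-token producers either drop out of the synonym-parsing chain (lenient) or fail fast, mirroring the error message used throughout this PR.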
@@ -51,7 +51,7 @@ public TokenStream create(TokenStream tokenStream) {
}

@Override
public Object getMultiTermComponent() {
public TokenFilterFactory getSynonymFilter(boolean lenient) {
if (preserveOriginal == false) {
return this;
} else {
@@ -68,4 +68,9 @@ public TokenStream create(TokenStream tokenStream) {
};
}
}

@Override
public Object getMultiTermComponent() {
return getSynonymFilter(true);
Contributor:
I wonder if this works, since the synonym filter checks only the first token at each position; if the first token is the original one, there is no chance that the modified one can match.

Contributor Author:
The folded token is always emitted first, so that's the one we need to match against the synonym map. Hence, we return a filter that only emits folded tokens.

}
}
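The exchange above hinges on emit order: because the synonym parser considers one token per position, the filter returned for synonym parsing emits only the folded form. A minimal sketch of that idea, using `java.text.Normalizer` as a rough stand-in for ASCII folding (not the actual Lucene `ASCIIFoldingFilter`):

```java
import java.text.Normalizer;

// Sketch: when parsing synonym rules, emit only the folded form of each
// token. fold() approximates ASCII folding by decomposing characters and
// stripping combining marks; the real filter handles many more cases.
class FoldedOnly {
    static String fold(String token) {
        return Normalizer.normalize(token, Normalizer.Form.NFD)
                .replaceAll("\\p{M}", "");
    }
}
```

Since the folded token is what ends up in the synonym map, input text folded at search or index time can match it without needing stacked positions.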
@@ -26,6 +26,7 @@
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.Analysis;
import org.elasticsearch.index.analysis.TokenFilterFactory;

/**
* Contains the common configuration settings between subclasses of this class.
@@ -50,4 +51,9 @@ protected AbstractCompoundWordTokenFilterFactory(IndexSettings indexSettings, En
throw new IllegalArgumentException("word_list must be provided for [" + name + "], either as a path to a file, or directly");
}
}

@Override
public TokenFilterFactory getSynonymFilter(boolean lenient) {
return IDENTITY_FILTER; // don't decompound synonym file
}
}
@@ -26,6 +26,7 @@
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;

import java.util.Arrays;
import java.util.HashSet;
@@ -89,4 +90,11 @@ public TokenStream create(TokenStream tokenStream) {
return filter;
}

@Override
public TokenFilterFactory getSynonymFilter(boolean lenient) {
if (outputUnigrams) {
return IDENTITY_FILTER; // don't combine for synonyms
}
return this;
}
}
@@ -427,7 +427,7 @@ public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
filters.add(PreConfiguredTokenFilter.singleton("german_stem", false, GermanStemFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("hindi_normalization", true, HindiNormalizationFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("indic_normalization", true, IndicNormalizationFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("keyword_repeat", false, KeywordRepeatFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("keyword_repeat", false, false, KeywordRepeatFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("kstem", false, KStemFilter::new));
filters.add(PreConfiguredTokenFilter.singleton("length", false, input ->
new LengthFilter(input, 0, Integer.MAX_VALUE))); // TODO this one seems useless
@@ -28,6 +28,7 @@
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.Analysis;
import org.elasticsearch.index.analysis.TokenFilterFactory;

public class CommonGramsTokenFilterFactory extends AbstractTokenFilterFactory {

@@ -58,5 +59,10 @@ public TokenStream create(TokenStream tokenStream) {
return filter;
}
}

@Override
public TokenFilterFactory getSynonymFilter(boolean lenient) {
return IDENTITY_FILTER;
}
}

@@ -19,17 +19,24 @@

package org.elasticsearch.analysis.common;

import org.apache.logging.log4j.LogManager;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.analysis.reverse.ReverseStringFilter;
import org.elasticsearch.Version;
import org.elasticsearch.common.logging.DeprecationLogger;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;


public class EdgeNGramTokenFilterFactory extends AbstractTokenFilterFactory {

private static final DeprecationLogger DEPRECATION_LOGGER
= new DeprecationLogger(LogManager.getLogger(EdgeNGramTokenFilterFactory.class));

private final int minGram;

private final int maxGram;
@@ -77,4 +84,20 @@ public TokenStream create(TokenStream tokenStream) {
public boolean breaksFastVectorHighlighter() {
return true;
}

@Override
public TokenFilterFactory getSynonymFilter(boolean lenient) {
if (lenient) {
return IDENTITY_FILTER;
}
if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0_alpha1)) {
throw new IllegalArgumentException("Token filter [" + name() +
"] cannot be used to parse synonyms unless [lenient] is set to true");
}
else {
DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name()
+ "] will not be usable to parse synonyms after v7.0 unless [lenient] is set to true");
return IDENTITY_FILTER;
}
}
}
@@ -19,18 +19,25 @@

package org.elasticsearch.analysis.common;

import org.apache.logging.log4j.LogManager;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.FingerprintFilter;
import org.elasticsearch.Version;
import org.elasticsearch.common.logging.DeprecationLogger;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;

import static org.elasticsearch.analysis.common.FingerprintAnalyzerProvider.DEFAULT_MAX_OUTPUT_SIZE;
import static org.elasticsearch.analysis.common.FingerprintAnalyzerProvider.MAX_OUTPUT_SIZE;

public class FingerprintTokenFilterFactory extends AbstractTokenFilterFactory {

private static final DeprecationLogger DEPRECATION_LOGGER
= new DeprecationLogger(LogManager.getLogger(FingerprintTokenFilterFactory.class));

private final char separator;
private final int maxOutputSize;

@@ -47,4 +54,20 @@ public TokenStream create(TokenStream tokenStream) {
return result;
}

@Override
public TokenFilterFactory getSynonymFilter(boolean lenient) {
if (lenient) {
return this;
}
if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0_alpha1)) {
throw new IllegalArgumentException("Token filter [" + name() +
"] cannot be used to parse synonyms unless [lenient] is set to true");
}
else {
DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name()
+ "] will not be usable to parse synonyms after v7.0 unless [lenient] is set to true");
return IDENTITY_FILTER;
}
}

}
@@ -19,12 +19,15 @@

package org.elasticsearch.analysis.common;

import org.apache.logging.log4j.LogManager;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilter;
import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.elasticsearch.Version;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.logging.DeprecationLogger;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
@@ -40,6 +43,9 @@

public class MultiplexerTokenFilterFactory extends AbstractTokenFilterFactory {

private static final DeprecationLogger DEPRECATION_LOGGER
= new DeprecationLogger(LogManager.getLogger(MultiplexerTokenFilterFactory.class));

private List<String> filterNames;
private final boolean preserveOriginal;

@@ -54,6 +60,22 @@ public TokenStream create(TokenStream tokenStream) {
throw new UnsupportedOperationException("TokenFilterFactory.getChainAwareTokenFilterFactory() must be called first");
}

@Override
public TokenFilterFactory getSynonymFilter(boolean lenient) {
if (lenient) {
return IDENTITY_FILTER;
}
if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0_alpha1)) {
throw new IllegalArgumentException("Token filter [" + name() +
"] cannot be used to parse synonyms unless [lenient] is set to true");
}
else {
DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name()
+ "] will not be usable to parse synonyms after v7.0 unless [lenient] is set to true");
return IDENTITY_FILTER;
}
}

@Override
public TokenFilterFactory getChainAwareTokenFilterFactory(TokenizerFactory tokenizer, List<CharFilterFactory> charFilters,
List<TokenFilterFactory> previousTokenFilters,
@@ -97,8 +119,19 @@ public TokenStream create(TokenStream tokenStream) {
}

@Override
public TokenFilterFactory getSynonymFilter() {
return IDENTITY_FILTER;
public TokenFilterFactory getSynonymFilter(boolean lenient) {
if (lenient) {
return IDENTITY_FILTER;
}
if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0_alpha1)) {
throw new IllegalArgumentException("Token filter [" + name() +
"] cannot be used to parse synonyms unless [lenient] is set to true");
}
else {
DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name()
+ "] will not be usable to parse synonyms after v7.0 unless [lenient] is set to true");
return IDENTITY_FILTER;
}
}
};
}
@@ -19,23 +19,27 @@

package org.elasticsearch.analysis.common;

import org.apache.logging.log4j.LogManager;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
import org.elasticsearch.common.logging.DeprecationLogger;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.Version;

import org.elasticsearch.index.analysis.TokenFilterFactory;


public class NGramTokenFilterFactory extends AbstractTokenFilterFactory {

private static final DeprecationLogger DEPRECATION_LOGGER
= new DeprecationLogger(LogManager.getLogger(NGramTokenFilterFactory.class));

private final int minGram;

private final int maxGram;


NGramTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
super(indexSettings, name, settings);
int maxAllowedNgramDiff = indexSettings.getMaxNgramDiff();
@@ -60,4 +64,20 @@ public TokenStream create(TokenStream tokenStream) {
// TODO: Expose preserveOriginal
return new NGramTokenFilter(tokenStream, minGram, maxGram, false);
}

@Override
public TokenFilterFactory getSynonymFilter(boolean lenient) {
if (lenient) {
return IDENTITY_FILTER;
}
if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_7_0_0_alpha1)) {
throw new IllegalArgumentException("Token filter [" + name() +
"] cannot be used to parse synonyms unless [lenient] is set to true");
}
else {
DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name()
+ "] will not be usable to parse synonyms after v7.0 unless [lenient] is set to true");
return IDENTITY_FILTER;
}
}
}
@@ -49,7 +49,7 @@ public TokenStream create(TokenStream tokenStream) {
public TokenFilterFactory getChainAwareTokenFilterFactory(TokenizerFactory tokenizer, List<CharFilterFactory> charFilters,
List<TokenFilterFactory> previousTokenFilters,
Function<String, TokenFilterFactory> allFilters) {
final Analyzer analyzer = buildSynonymAnalyzer(tokenizer, charFilters, previousTokenFilters);
final Analyzer analyzer = buildSynonymAnalyzer(tokenizer, charFilters, previousTokenFilters, allFilters);
final SynonymMap synonyms = buildSynonyms(analyzer, getRulesFromSettings(environment));
final String name = name();
return new TokenFilterFactory() {
@@ -72,7 +72,7 @@ public TokenStream create(TokenStream tokenStream) {
public TokenFilterFactory getChainAwareTokenFilterFactory(TokenizerFactory tokenizer, List<CharFilterFactory> charFilters,
List<TokenFilterFactory> previousTokenFilters,
Function<String, TokenFilterFactory> allFilters) {
final Analyzer analyzer = buildSynonymAnalyzer(tokenizer, charFilters, previousTokenFilters);
final Analyzer analyzer = buildSynonymAnalyzer(tokenizer, charFilters, previousTokenFilters, allFilters);
final SynonymMap synonyms = buildSynonyms(analyzer, getRulesFromSettings(environment));
final String name = name();
return new TokenFilterFactory() {
@@ -85,14 +85,19 @@ public String name() {
public TokenStream create(TokenStream tokenStream) {
return synonyms.fst == null ? tokenStream : new SynonymFilter(tokenStream, synonyms, false);
}

@Override
public TokenFilterFactory getSynonymFilter(boolean lenient) {
return IDENTITY_FILTER; // Don't apply synonyms to a synonym file, this will just confuse things
}
};
}

Analyzer buildSynonymAnalyzer(TokenizerFactory tokenizer, List<CharFilterFactory> charFilters,
List<TokenFilterFactory> tokenFilters) {
List<TokenFilterFactory> tokenFilters, Function<String, TokenFilterFactory> allFilters) {
return new CustomAnalyzer("synonyms", tokenizer, charFilters.toArray(new CharFilterFactory[0]),
tokenFilters.stream()
.map(TokenFilterFactory::getSynonymFilter)
.map(ts -> ts.getSynonymFilter(lenient))
.toArray(TokenFilterFactory[]::new));
}
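The change to buildSynonymAnalyzer above maps every filter in the chain through its synonym-parsing variant, now passing the filter's leniency setting. A self-contained sketch of that mapping step, with `Factory` as a hypothetical stand-in for `TokenFilterFactory`:

```java
import java.util.List;
import java.util.stream.Collectors;

// Sketch of the mapping in buildSynonymAnalyzer: each filter factory in
// the analysis chain is replaced by the factory it wants used while
// parsing synonym files. Factory is a made-up stand-in type.
class SynonymChain {
    interface Factory {
        Factory getSynonymFilter(boolean lenient);
        String label();
    }

    // A single-token filter simply returns itself for synonym parsing.
    static Factory passthrough(String label) {
        return new Factory() {
            @Override public Factory getSynonymFilter(boolean lenient) { return this; }
            @Override public String label() { return label; }
        };
    }

    // Mirrors: tokenFilters.stream().map(ts -> ts.getSynonymFilter(lenient))
    static List<Factory> forSynonymParsing(List<Factory> chain, boolean lenient) {
        return chain.stream()
                .map(f -> f.getSynonymFilter(lenient))
                .collect(Collectors.toList());
    }
}
```

The key design point is that the decision is delegated: the synonym filter does not need to know which filters are safe, because each factory answers for itself, throwing or substituting an identity filter as appropriate.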
