Adds pattern keyword marker filter support (#23600)

This commit adds support for Lucene's pattern keyword marker filter. Previously, the keyword marker filter in Elasticsearch only supported specifying a set of keywords or a path to a file containing a set of keywords. This commit also exposes the regular expression based keyword marker filter available in Lucene, so that any token matching the pattern specified by the `keywords_pattern` setting is excluded from being stemmed by any stemming filters. Closes #4877
elastic · Mar 28, 2017 · 4ecb1ba · 4ecb1ba
1 parent dc777a0
commit 4ecb1ba
Show file tree

Hide file tree

Showing 3 changed files with 149 additions and 5 deletions.
diff --git a/core/src/main/java/org/elasticsearch/index/analysis/KeywordMarkerTokenFilterFactory.java b/core/src/main/java/org/elasticsearch/index/analysis/KeywordMarkerTokenFilterFactory.java
@@ -21,30 +21,68 @@
 
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.PatternKeywordMarkerFilter;
 import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
 
 import java.util.Set;
+import java.util.regex.Pattern;
 
+/**
+ * A factory for creating keyword marker token filters that prevent tokens from
+ * being modified by stemmers.  Two types of keyword marker filters are available:
+ * the {@link SetKeywordMarkerFilter} and the {@link PatternKeywordMarkerFilter}.
+ *
+ * The {@link SetKeywordMarkerFilter} uses a set of keywords to denote which tokens
+ * should be excluded from stemming.  This filter is created if the settings include
+ * {@code keywords}, which contains the list of keywords, or {@code keywords_path},
+ * which contains a path to a file in the config directory with the keywords.
+ *
+ * The {@link PatternKeywordMarkerFilter} uses a regular expression pattern to match
+ * against tokens that should be excluded from stemming.  This filter is created if
+ * the settings include {@code keywords_pattern}, which contains the regular expression
+ * to match against; it cannot be combined with {@code keywords} or {@code keywords_path}.
+ */
 public class KeywordMarkerTokenFilterFactory extends AbstractTokenFilterFactory {
 
     private final CharArraySet keywordLookup;
+    private final Pattern keywordPattern;
 
     public KeywordMarkerTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
         super(indexSettings, name, settings);
 
         boolean ignoreCase = settings.getAsBoolean("ignore_case", false);
-        Set<?> rules = Analysis.getWordSet(env, settings, "keywords");
-        if (rules == null) {
-            throw new IllegalArgumentException("keyword filter requires either `keywords` or `keywords_path` to be configured");
+        String patternString = settings.get("keywords_pattern");
+        if (patternString != null) {
+            // a pattern for matching keywords is specified instead of a set of keyword
+            // strings; note that `ignore_case` is not applied to the compiled pattern
+            if (settings.get("keywords") != null || settings.get("keywords_path") != null) {
+                throw new IllegalArgumentException(
+                    "cannot specify both `keywords_pattern` and `keywords` or `keywords_path`");
+            }
+            keywordPattern = Pattern.compile(patternString);
+            keywordLookup = null;
+        } else {
+            Set<?> rules = Analysis.getWordSet(env, settings, "keywords");
+            if (rules == null) {
+                throw new IllegalArgumentException(
+                    "keyword filter requires either `keywords`, `keywords_path`, " +
+                    "or `keywords_pattern` to be configured");
+            }
+            // a set of keywords (or a path to them) is specified; `ignore_case` applies to this set
+            keywordLookup = new CharArraySet(rules, ignoreCase);
+            keywordPattern = null;
         }
-        keywordLookup = new CharArraySet(rules, ignoreCase);
     }
 
     @Override
     public TokenStream create(TokenStream tokenStream) {
-        return new SetKeywordMarkerFilter(tokenStream, keywordLookup);
+        if (keywordPattern != null) {
+            return new PatternKeywordMarkerFilter(tokenStream, keywordPattern);
+        } else {
+            return new SetKeywordMarkerFilter(tokenStream, keywordLookup);
+        }
     }
 }
diff --git a/core/src/test/java/org/elasticsearch/index/analysis/KeywordMarkerFilterFactoryTests.java b/core/src/test/java/org/elasticsearch/index/analysis/KeywordMarkerFilterFactoryTests.java
@@ -0,0 +1,103 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.miscellaneous.PatternKeywordMarkerFilter;
+import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.test.ESTestCase.TestAnalysis;
+import org.elasticsearch.test.ESTokenStreamTestCase;
+
+import java.io.IOException;
+
+import static org.hamcrest.Matchers.instanceOf;
+
+/**
+ * Tests for the {@link KeywordMarkerTokenFilterFactory} class.
+ */
+public class KeywordMarkerFilterFactoryTests extends ESTokenStreamTestCase {
+
+    /**
+     * Tests using a keyword set for the keyword marker filter.
+     */
+    public void testKeywordSet() throws IOException {
+        Settings settings = Settings.builder()
+            .put("index.analysis.filter.my_keyword.type", "keyword_marker")
+            .put("index.analysis.filter.my_keyword.keywords", "running, sleeping")
+            .put("index.analysis.analyzer.my_keyword.type", "custom")
+            .put("index.analysis.analyzer.my_keyword.tokenizer", "standard")
+            .put("index.analysis.analyzer.my_keyword.filter", "my_keyword, porter_stem")
+            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+            .build();
+        TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
+        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_keyword");
+        assertThat(tokenFilter, instanceOf(KeywordMarkerTokenFilterFactory.class));
+        TokenStream filter = tokenFilter.create(new WhitespaceTokenizer());
+        assertThat(filter, instanceOf(SetKeywordMarkerFilter.class));
+        NamedAnalyzer analyzer = analysis.indexAnalyzers.get("my_keyword");
+        // jogging is not part of the keywords set, so verify that it's the only stemmed word
+        assertAnalyzesTo(analyzer, "running jogging sleeping",
+            new String[] { "running", "jog", "sleeping" });
+    }
+
+    /**
+     * Tests using a regular expression pattern for the keyword marker filter.
+     */
+    public void testKeywordPattern() throws IOException {
+        Settings settings = Settings.builder()
+            .put("index.analysis.filter.my_keyword.type", "keyword_marker")
+            .put("index.analysis.filter.my_keyword.keywords_pattern", "run[a-z]ing")
+            .put("index.analysis.analyzer.my_keyword.type", "custom")
+            .put("index.analysis.analyzer.my_keyword.tokenizer", "standard")
+            .put("index.analysis.analyzer.my_keyword.filter", "my_keyword, porter_stem")
+            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+            .build();
+        TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
+        TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_keyword");
+        assertThat(tokenFilter, instanceOf(KeywordMarkerTokenFilterFactory.class));
+        TokenStream filter = tokenFilter.create(new WhitespaceTokenizer());
+        assertThat(filter, instanceOf(PatternKeywordMarkerFilter.class));
+        NamedAnalyzer analyzer = analysis.indexAnalyzers.get("my_keyword");
+        // running should match the pattern, so it should not be stemmed, but sleeping should
+        assertAnalyzesTo(analyzer, "running sleeping", new String[] { "running", "sleep" });
+    }
+
+    /**
+     * Verifies that both keywords and patterns cannot be specified together.
+     */
+    public void testCannotSpecifyBothKeywordsAndPattern() throws IOException {
+        Settings settings = Settings.builder()
+            .put("index.analysis.filter.my_keyword.type", "keyword_marker")
+            .put("index.analysis.filter.my_keyword.keywords", "running")
+            .put("index.analysis.filter.my_keyword.keywords_pattern", "run[a-z]ing")
+            .put("index.analysis.analyzer.my_keyword.type", "custom")
+            .put("index.analysis.analyzer.my_keyword.tokenizer", "standard")
+            .put("index.analysis.analyzer.my_keyword.filter", "my_keyword, porter_stem")
+            .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
+            .build();
+        IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
+            () -> AnalysisTestsHelper.createTestAnalysisFromSettings(settings));
+        assertEquals("cannot specify both `keywords_pattern` and `keywords` or `keywords_path`",
+            e.getMessage());
+    }
+}
diff --git a/docs/reference/analysis/tokenfilters/keyword-marker-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/keyword-marker-tokenfilter.asciidoc
@@ -12,6 +12,9 @@ any stemming filters.
 |`keywords_path` |A path (either relative to `config` location, or
 absolute) to a list of words.
 
+|`keywords_pattern` |A regular expression pattern to match against words
+in the text. Cannot be combined with `keywords` or `keywords_path`.
+
 |`ignore_case` |Set to `true` to lower case all words first. Defaults to
 `false`.
 |=======================================================================