Skip to content

Commit

Permalink
Adds pattern keyword marker filter support (#23600)
Browse files Browse the repository at this point in the history
This commit adds support for Lucene's pattern keyword marker filter.
Previously, the keyword marker filter in Elasticsearch only
supported specifying a keywords set or a path to a set of keywords.
This commit exposes the regular expression pattern based keyword marker
filter also available in Lucene, so that any token matching the pattern
specified by the `keywords_pattern` setting is excluded from being
stemmed by any stemming filters.

Closes #4877
  • Loading branch information
Ali Beyad committed Mar 28, 2017
1 parent dc777a0 commit 4ecb1ba
Show file tree
Hide file tree
Showing 3 changed files with 149 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -21,30 +21,68 @@

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.PatternKeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;

import java.util.Set;
import java.util.regex.Pattern;

/**
* A factory for creating keyword marker token filters that prevent tokens from
* being modified by stemmers. Two types of keyword marker filters are available:
* the {@link SetKeywordMarkerFilter} and the {@link PatternKeywordMarkerFilter}.
*
* The {@link SetKeywordMarkerFilter} uses a set of keywords to denote which tokens
* should be excluded from stemming. This filter is created if the settings include
* {@code keywords}, which contains the list of keywords, or {@code `keywords_path`},
* which contains a path to a file in the config directory with the keywords.
*
* The {@link PatternKeywordMarkerFilter} uses a regular expression pattern to match
* against tokens that should be excluded from stemming. This filter is created if
* the settings include {@code keywords_pattern}, which contains the regular expression
* to match against.
*/
public class KeywordMarkerTokenFilterFactory extends AbstractTokenFilterFactory {

private final CharArraySet keywordLookup;
private final Pattern keywordPattern;

public KeywordMarkerTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, name, settings);

boolean ignoreCase = settings.getAsBoolean("ignore_case", false);
Set<?> rules = Analysis.getWordSet(env, settings, "keywords");
if (rules == null) {
throw new IllegalArgumentException("keyword filter requires either `keywords` or `keywords_path` to be configured");
String patternString = settings.get("keywords_pattern");
if (patternString != null) {
// a pattern for matching keywords is specified, as opposed to a
// set of keyword strings to match against
if (settings.get("keywords") != null || settings.get("keywords_path") != null) {
throw new IllegalArgumentException(
"cannot specify both `keywords_pattern` and `keywords` or `keywords_path`");
}
keywordPattern = Pattern.compile(patternString);
keywordLookup = null;
} else {
Set<?> rules = Analysis.getWordSet(env, settings, "keywords");
if (rules == null) {
throw new IllegalArgumentException(
"keyword filter requires either `keywords`, `keywords_path`, " +
"or `keywords_pattern` to be configured");
}
// a set of keywords (or a path to them) is specified
keywordLookup = new CharArraySet(rules, ignoreCase);
keywordPattern = null;
}
keywordLookup = new CharArraySet(rules, ignoreCase);
}

@Override
public TokenStream create(TokenStream tokenStream) {
return new SetKeywordMarkerFilter(tokenStream, keywordLookup);
if (keywordPattern != null) {
return new PatternKeywordMarkerFilter(tokenStream, keywordPattern);
} else {
return new SetKeywordMarkerFilter(tokenStream, keywordLookup);
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.index.analysis;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.PatternKeywordMarkerFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.test.ESTestCase.TestAnalysis;
import org.elasticsearch.test.ESTokenStreamTestCase;

import java.io.IOException;

import static org.hamcrest.Matchers.instanceOf;

/**
* Tests for the {@link KeywordMarkerTokenFilterFactory} class.
*/
public class KeywordMarkerFilterFactoryTests extends ESTokenStreamTestCase {

/**
* Tests using a keyword set for the keyword marker filter.
*/
public void testKeywordSet() throws IOException {
Settings settings = Settings.builder()
.put("index.analysis.filter.my_keyword.type", "keyword_marker")
.put("index.analysis.filter.my_keyword.keywords", "running, sleeping")
.put("index.analysis.analyzer.my_keyword.type", "custom")
.put("index.analysis.analyzer.my_keyword.tokenizer", "standard")
.put("index.analysis.analyzer.my_keyword.filter", "my_keyword, porter_stem")
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.build();
TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_keyword");
assertThat(tokenFilter, instanceOf(KeywordMarkerTokenFilterFactory.class));
TokenStream filter = tokenFilter.create(new WhitespaceTokenizer());
assertThat(filter, instanceOf(SetKeywordMarkerFilter.class));
NamedAnalyzer analyzer = analysis.indexAnalyzers.get("my_keyword");
// jogging is not part of the keywords set, so verify that its the only stemmed word
assertAnalyzesTo(analyzer, "running jogging sleeping",
new String[] { "running", "jog", "sleeping" });
}

/**
* Tests using a regular expression pattern for the keyword marker filter.
*/
public void testKeywordPattern() throws IOException {
Settings settings = Settings.builder()
.put("index.analysis.filter.my_keyword.type", "keyword_marker")
.put("index.analysis.filter.my_keyword.keywords_pattern", "run[a-z]ing")
.put("index.analysis.analyzer.my_keyword.type", "custom")
.put("index.analysis.analyzer.my_keyword.tokenizer", "standard")
.put("index.analysis.analyzer.my_keyword.filter", "my_keyword, porter_stem")
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.build();
TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_keyword");
assertThat(tokenFilter, instanceOf(KeywordMarkerTokenFilterFactory.class));
TokenStream filter = tokenFilter.create(new WhitespaceTokenizer());
assertThat(filter, instanceOf(PatternKeywordMarkerFilter.class));
NamedAnalyzer analyzer = analysis.indexAnalyzers.get("my_keyword");
// running should match the pattern, so it should not be stemmed but sleeping should
assertAnalyzesTo(analyzer, "running sleeping", new String[] { "running", "sleep" });
}

/**
* Verifies that both keywords and patterns cannot be specified together.
*/
public void testCannotSpecifyBothKeywordsAndPattern() throws IOException {
Settings settings = Settings.builder()
.put("index.analysis.filter.my_keyword.type", "keyword_marker")
.put("index.analysis.filter.my_keyword.keywords", "running")
.put("index.analysis.filter.my_keyword.keywords_pattern", "run[a-z]ing")
.put("index.analysis.analyzer.my_keyword.type", "custom")
.put("index.analysis.analyzer.my_keyword.tokenizer", "standard")
.put("index.analysis.analyzer.my_keyword.filter", "my_keyword, porter_stem")
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.build();
IllegalArgumentException e = expectThrows(IllegalArgumentException.class,
() -> AnalysisTestsHelper.createTestAnalysisFromSettings(settings));
assertEquals("cannot specify both `keywords_pattern` and `keywords` or `keywords_path`",
e.getMessage());
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ any stemming filters.
|`keywords_path` |A path (either relative to `config` location, or
absolute) to a list of words.

|`keywords_pattern` |A regular expression pattern; tokens that match it are
marked as keywords and are not stemmed. Cannot be combined with `keywords`
or `keywords_path`.

|`ignore_case` |Set to `true` to lower case all words first. Defaults to
`false`.
|=======================================================================
Expand Down

0 comments on commit 4ecb1ba

Please sign in to comment.