
Multiplexing token filter #31208

Merged · 16 commits · Jun 20, 2018
2 changes: 2 additions & 0 deletions docs/reference/analysis/tokenfilters.asciidoc
@@ -35,6 +35,8 @@ include::tokenfilters/word-delimiter-tokenfilter.asciidoc[]

include::tokenfilters/word-delimiter-graph-tokenfilter.asciidoc[]

include::tokenfilters/multiplexer-tokenfilter.asciidoc[]

include::tokenfilters/stemmer-tokenfilter.asciidoc[]

include::tokenfilters/stemmer-override-tokenfilter.asciidoc[]
116 changes: 116 additions & 0 deletions docs/reference/analysis/tokenfilters/multiplexer-tokenfilter.asciidoc
@@ -0,0 +1,116 @@
[[analysis-multiplexer-tokenfilter]]
=== Multiplexer Token Filter

A token filter of type `multiplexer` will emit multiple tokens at the same position,
each version of the token having been run through a different filter. Identical
output tokens at the same position will be removed.

WARNING: If the incoming token stream has duplicate tokens, then these will also
be removed by the multiplexer.

[float]
=== Options
[horizontal]
filters:: a list of token filters to apply to incoming tokens. These can be any
token filters defined elsewhere in the index's analysis settings. Filters can be
chained using a comma-delimited string, so for example `"lowercase, porter_stem"`
would apply the `lowercase` filter and then the `porter_stem` filter to a single
token.

WARNING: Shingle or multi-word synonym token filters will not function normally
when they are declared in the filters array, because they read ahead internally,
which is not supported by the multiplexer.

preserve_original:: if `true` (the default) then the original token is emitted
in addition to the filtered tokens, as the sketch below illustrates.
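
For example, a multiplexer that emits only the filtered variants and drops the
original token could be configured like this (a minimal sketch; the index name
`multiplexer_no_original` is illustrative):

[source,js]
--------------------------------------------------
PUT /multiplexer_no_original
{
  "settings" : {
    "analysis" : {
      "filter" : {
        "my_multiplexer" : {
          "type" : "multiplexer",
          "filters" : [ "lowercase", "porter_stem" ],
          "preserve_original" : false
        }
      }
    }
  }
}
--------------------------------------------------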


[float]
=== Settings example

You can set it up like:

[source,js]
--------------------------------------------------
PUT /multiplexer_example
{
  "settings" : {
    "analysis" : {
      "analyzer" : {
        "my_analyzer" : {
          "tokenizer" : "standard",
          "filter" : [ "my_multiplexer" ]
        }
      },
      "filter" : {
        "my_multiplexer" : {
          "type" : "multiplexer",
          "filters" : [ "lowercase", "lowercase, porter_stem" ]
        }
      }
    }
  }
}
--------------------------------------------------
// CONSOLE

And test it like:

[source,js]
--------------------------------------------------
POST /multiplexer_example/_analyze
{
  "analyzer" : "my_analyzer",
  "text" : "Going HOME"
}
--------------------------------------------------
// CONSOLE
// TEST[continued]

And it will respond:

[source,js]
--------------------------------------------------
{
  "tokens": [
    {
      "token": "Going",
      "start_offset": 0,
      "end_offset": 5,
      "type": "<ALPHANUM>",
      "position": 0
    },
    {
      "token": "going",
      "start_offset": 0,
      "end_offset": 5,
      "type": "<ALPHANUM>",
      "position": 0
    },
    {
      "token": "go",
      "start_offset": 0,
      "end_offset": 5,
      "type": "<ALPHANUM>",
      "position": 0
    },
    {
      "token": "HOME",
      "start_offset": 6,
      "end_offset": 10,
      "type": "<ALPHANUM>",
      "position": 1
    },
    {
      "token": "home", <1>
      "start_offset": 6,
      "end_offset": 10,
      "type": "<ALPHANUM>",
      "position": 1
    }
  ]
}
--------------------------------------------------
// TESTRESPONSE

<1> The stemmer has also emitted the token `home` at position 1, but because it
is a duplicate of this token it has been removed from the token stream.
@@ -188,6 +188,7 @@ public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
        filters.put("limit", LimitTokenCountFilterFactory::new);
        filters.put("lowercase", LowerCaseTokenFilterFactory::new);
        filters.put("min_hash", MinHashTokenFilterFactory::new);
        filters.put("multiplexer", MultiplexerTokenFilterFactory::new);
        filters.put("ngram", NGramTokenFilterFactory::new);
        filters.put("nGram", NGramTokenFilterFactory::new);
        filters.put("pattern_capture", requriesAnalysisSettings(PatternCaptureGroupTokenFilterFactory::new));
@@ -0,0 +1,195 @@
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.analysis.common;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilter;
import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
import org.elasticsearch.index.analysis.ReferringFilterFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.function.Function;

public class MultiplexerTokenFilterFactory extends AbstractTokenFilterFactory implements ReferringFilterFactory {

    private List<TokenFilterFactory> filters;
    private List<String> filterNames;
    private final boolean preserveOriginal;

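    // A pass-through factory, used to emit the original, unmodified token when preserve_original is true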
    private static final TokenFilterFactory IDENTITY_FACTORY = new TokenFilterFactory() {
        @Override
        public String name() {
            return "identity";
        }

        @Override
        public TokenStream create(TokenStream tokenStream) {
            return tokenStream;
        }
    };

    public MultiplexerTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) throws IOException {
        super(indexSettings, name, settings);
        this.filterNames = settings.getAsList("filters");
        this.preserveOriginal = settings.getAsBoolean("preserve_original", true);
    }

    @Override
    public TokenStream create(TokenStream tokenStream) {
        List<Function<TokenStream, TokenStream>> functions = new ArrayList<>();
        for (TokenFilterFactory tff : filters) {
            functions.add(tff::create);
        }
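        // RemoveDuplicatesTokenFilter collapses identical tokens that different branches emit at the same position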
        return new RemoveDuplicatesTokenFilter(new MultiplexTokenFilter(tokenStream, functions));
    }

    @Override
    public void setReferences(Map<String, TokenFilterFactory> factories) {
        filters = new ArrayList<>();
        if (preserveOriginal) {
            filters.add(IDENTITY_FACTORY);
        }
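        // Each entry in "filters" may be a single filter name or a comma-delimited chain such as "lowercase, porter_stem"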
        for (String filter : filterNames) {
            String[] parts = Strings.tokenizeToStringArray(filter, ",");
            if (parts.length == 1) {
                filters.add(resolveFilterFactory(factories, parts[0]));
            } else {
                List<TokenFilterFactory> chain = new ArrayList<>();
                for (String subfilter : parts) {
                    chain.add(resolveFilterFactory(factories, subfilter));
                }
                filters.add(chainFilters(filter, chain));
            }
        }
    }

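    // Combines a list of factories into a single factory that applies them in declaration order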
    private TokenFilterFactory chainFilters(String name, List<TokenFilterFactory> filters) {
        return new TokenFilterFactory() {
            @Override
            public String name() {
                return name;
            }

            @Override
            public TokenStream create(TokenStream tokenStream) {
                for (TokenFilterFactory tff : filters) {
                    tokenStream = tff.create(tokenStream);
                }
                return tokenStream;
            }
        };
    }

    private TokenFilterFactory resolveFilterFactory(Map<String, TokenFilterFactory> factories, String name) {
        if (factories.containsKey(name) == false) {
            throw new IllegalArgumentException("Multiplexing filter [" + name() + "] refers to undefined tokenfilter [" + name + "]");
        } else {
            return factories.get(name);
        }
    }

    private final class MultiplexTokenFilter extends TokenFilter {

        private final TokenStream source;
        private final int filterCount;

        private int selector;

        /**
         * Creates a MultiplexTokenFilter on the given input with a set of filters
         */
        MultiplexTokenFilter(TokenStream input, List<Function<TokenStream, TokenStream>> filters) {
            super(input);
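            // Stack one ConditionalTokenFilter per branch on top of the replaying
            // MultiplexerFilter; each branch only processes a token when its slot
            // matches the current selector value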
            TokenStream source = new MultiplexerFilter(input);
            for (int i = 0; i < filters.size(); i++) {
                final int slot = i;
                source = new ConditionalTokenFilter(source, filters.get(i)) {
                    @Override
                    protected boolean shouldFilter() {
                        return slot == selector;
                    }
                };
            }
            this.source = source;
            this.filterCount = filters.size();
            this.selector = filterCount - 1;
        }

        @Override
        public boolean incrementToken() throws IOException {
            return source.incrementToken();
        }

        @Override
        public void end() throws IOException {
            source.end();
        }

        @Override
        public void reset() throws IOException {
            source.reset();
        }

        private final class MultiplexerFilter extends TokenFilter {

            State state;
            PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);

            private MultiplexerFilter(TokenStream input) {
                super(input);
            }

            @Override
            public boolean incrementToken() throws IOException {
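                // A full cycle over all branches has finished for the previous token
                // (selector starts at filterCount - 1), so pull the next token from
                // the wrapped input, remember its attribute state, and restart the
                // cycle at slot 0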
                if (selector >= filterCount - 1) {
                    selector = 0;
                    if (input.incrementToken() == false) {
                        return false;
                    }
                    state = captureState();
                    return true;
                }
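                // Replay the captured token at the same position (increment 0) so
                // that the next branch can process its own copy of it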
                restoreState(state);
                posIncAtt.setPositionIncrement(0);
                selector++;
                return true;
            }

            @Override
            public void reset() throws IOException {
                super.reset();
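                // Start from filterCount - 1 so that the first incrementToken() call pulls a fresh token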
                selector = filterCount - 1;
                this.state = null;
            }
        }

    }
}