
Add support for char filters in the analyze API #5148

Closed · wants to merge 7 commits
docs/reference/indices/analyze.asciidoc (6 additions, 2 deletions)
@@ -12,12 +12,16 @@ analyzers:
curl -XGET 'localhost:9200/_analyze?analyzer=standard' -d 'this is a test'
--------------------------------------------------

-Or by building a custom transient analyzer out of tokenizers and
-filters:
+Or by building a custom transient analyzer out of tokenizers,
+token filters and char filters. Token filters can use the shorter 'filters'
+parameter name:

[source,js]
--------------------------------------------------
curl -XGET 'localhost:9200/_analyze?tokenizer=keyword&filters=lowercase' -d 'this is a test'

curl -XGET 'localhost:9200/_analyze?tokenizer=keyword&token_filters=lowercase&char_filters=html_strip' -d 'this is a <b>test</b>'

--------------------------------------------------
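For Java API users, the same two requests can be expressed through the analyze request builder. A minimal sketch, assuming an already-connected `Client` named `client`; `setCharFilters` is the builder method this PR adds:

```java
// Sketch: Java-client equivalents of the two curl examples above.
// Assumes `client` is an already-connected org.elasticsearch.client.Client.
AnalyzeResponse plain = client.admin().indices()
        .prepareAnalyze("this is a test")
        .setTokenizer("keyword")
        .setTokenFilters("lowercase")
        .execute().actionGet();

AnalyzeResponse stripped = client.admin().indices()
        .prepareAnalyze("this is a <b>test</b>")
        .setTokenizer("keyword")
        .setTokenFilters("lowercase")
        .setCharFilters("html_strip")   // new in this PR
        .execute().actionGet();
// `stripped` should contain the single token "this is a test".
```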

It can also run against a specific index:
src/main/java/org/elasticsearch/action/admin/indices/analyze/AnalyzeRequest.java

@@ -18,9 +18,11 @@
*/
package org.elasticsearch.action.admin.indices.analyze;

import org.elasticsearch.Version;
import org.elasticsearch.action.ActionRequestValidationException;
import org.elasticsearch.action.support.single.custom.SingleCustomOperationRequest;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;

@@ -42,7 +44,9 @@ public class AnalyzeRequest extends SingleCustomOperationRequest<AnalyzeRequest>

private String tokenizer;

-    private String[] tokenFilters;
+    private String[] tokenFilters = Strings.EMPTY_ARRAY;

private String[] charFilters = Strings.EMPTY_ARRAY;

private String field;

@@ -110,6 +114,15 @@ public String[] tokenFilters() {
return this.tokenFilters;
}

public AnalyzeRequest charFilters(String... charFilters) {
this.charFilters = charFilters;
return this;
}

public String[] charFilters() {
return this.charFilters;
}

public AnalyzeRequest field(String field) {
this.field = field;
return this;
@@ -125,6 +138,12 @@ public ActionRequestValidationException validate() {
if (text == null) {
validationException = addValidationError("text is missing", validationException);
}
if (tokenFilters == null) {
validationException = addValidationError("token filters must not be null", validationException);
}
if (charFilters == null) {
validationException = addValidationError("char filters must not be null", validationException);
}
return validationException;
}
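These null checks complement the serialization change below: `writeStringArray` would hit a `NullPointerException` on a null array, so nulls are rejected at validation time instead. A hypothetical illustration (constructor and setter usage approximate):

```java
// Sketch: an explicit null array now fails validation rather than
// blowing up later inside writeTo().
AnalyzeRequest request = new AnalyzeRequest("some text").tokenizer("keyword");
request.tokenFilters((String[]) null);
assert request.validate() != null;  // contains "token filters must not be null"
```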

@@ -135,12 +154,9 @@ public void readFrom(StreamInput in) throws IOException {
text = in.readString();
analyzer = in.readOptionalString();
tokenizer = in.readOptionalString();
-        int size = in.readVInt();
-        if (size > 0) {
-            tokenFilters = new String[size];
-            for (int i = 0; i < size; i++) {
-                tokenFilters[i] = in.readString();
-            }
-        }
+        tokenFilters = in.readStringArray();
+        if (in.getVersion().onOrAfter(Version.V_1_1_0)) {
+            charFilters = in.readStringArray();
+        }
field = in.readOptionalString();
}
@@ -152,13 +168,9 @@ public void writeTo(StreamOutput out) throws IOException {
out.writeString(text);
out.writeOptionalString(analyzer);
out.writeOptionalString(tokenizer);
-        if (tokenFilters == null) {
-            out.writeVInt(0);
-        } else {
-            out.writeVInt(tokenFilters.length);
-            for (String tokenFilter : tokenFilters) {
-                out.writeString(tokenFilter);
-            }
-        }
+        out.writeStringArray(tokenFilters);
+        if (out.getVersion().onOrAfter(Version.V_1_1_0)) {
[Inline review comment from a Contributor: see above we have a writeStringArray as well]

+            out.writeStringArray(charFilters);
+        }
out.writeOptionalString(field);
}
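The `Version.V_1_1_0` checks keep the wire format compatible with 1.0.x nodes: char filters are only written and read when the remote side is new enough to know about them. A rough round-trip sketch of that behavior (test-style code assumed to live in the same package; the exact `BytesStreamInput` constructor may vary across 1.x releases):

```java
// Sketch: when talking to a pre-1.1.0 node, charFilters is silently dropped
// on the wire and stays at its Strings.EMPTY_ARRAY default on the other side.
AnalyzeRequest original = new AnalyzeRequest("this is a <b>test</b>")
        .tokenizer("keyword")
        .tokenFilters("lowercase")
        .charFilters("html_strip");

BytesStreamOutput out = new BytesStreamOutput();
out.setVersion(Version.V_1_0_0);               // pretend the receiver runs 1.0.x
original.writeTo(out);

StreamInput in = new BytesStreamInput(out.bytes());
in.setVersion(Version.V_1_0_0);                // the bytes came from a 1.0.x node
AnalyzeRequest copy = new AnalyzeRequest();
copy.readFrom(in);

assert copy.charFilters().length == 0;         // empty, not null
```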
src/main/java/org/elasticsearch/action/admin/indices/analyze/AnalyzeRequestBuilder.java

@@ -81,6 +81,14 @@ public AnalyzeRequestBuilder setTokenFilters(String... tokenFilters) {
return this;
}

/**
* Sets char filters that will be used before the tokenizer.
*/
public AnalyzeRequestBuilder setCharFilters(String... charFilters) {
request.charFilters(charFilters);
return this;
}

@Override
protected void doExecute(ActionListener<AnalyzeResponse> listener) {
((IndicesAdminClient) client).analyze(request, listener);
src/main/java/org/elasticsearch/action/admin/indices/analyze/TransportAnalyzeAction.java

@@ -162,6 +162,7 @@ protected AnalyzeResponse shardOperation(AnalyzeRequest request, int shardId) th
throw new ElasticsearchIllegalArgumentException("failed to find tokenizer under [" + request.tokenizer() + "]");
}
}

TokenFilterFactory[] tokenFilterFactories = new TokenFilterFactory[0];
if (request.tokenFilters() != null && request.tokenFilters().length > 0) {
tokenFilterFactories = new TokenFilterFactory[request.tokenFilters().length];
@@ -170,21 +171,45 @@ protected AnalyzeResponse shardOperation(AnalyzeRequest request, int shardId) th
if (indexService == null) {
TokenFilterFactoryFactory tokenFilterFactoryFactory = indicesAnalysisService.tokenFilterFactoryFactory(tokenFilterName);
if (tokenFilterFactoryFactory == null) {
-                        throw new ElasticsearchIllegalArgumentException("failed to find global token filter under [" + request.tokenizer() + "]");
+                        throw new ElasticsearchIllegalArgumentException("failed to find global token filter under [" + tokenFilterName + "]");
}
tokenFilterFactories[i] = tokenFilterFactoryFactory.create(tokenFilterName, ImmutableSettings.Builder.EMPTY_SETTINGS);
} else {
tokenFilterFactories[i] = indexService.analysisService().tokenFilter(tokenFilterName);
if (tokenFilterFactories[i] == null) {
-                        throw new ElasticsearchIllegalArgumentException("failed to find token filter under [" + request.tokenizer() + "]");
+                        throw new ElasticsearchIllegalArgumentException("failed to find token filter under [" + tokenFilterName + "]");
}
}
if (tokenFilterFactories[i] == null) {
-                        throw new ElasticsearchIllegalArgumentException("failed to find token filter under [" + request.tokenizer() + "]");
+                        throw new ElasticsearchIllegalArgumentException("failed to find token filter under [" + tokenFilterName + "]");
}
}
}
-            analyzer = new CustomAnalyzer(tokenizerFactory, new CharFilterFactory[0], tokenFilterFactories);

CharFilterFactory[] charFilterFactories = new CharFilterFactory[0];
if (request.charFilters() != null && request.charFilters().length > 0) {
charFilterFactories = new CharFilterFactory[request.charFilters().length];
for (int i = 0; i < request.charFilters().length; i++) {
String charFilterName = request.charFilters()[i];
if (indexService == null) {
CharFilterFactoryFactory charFilterFactoryFactory = indicesAnalysisService.charFilterFactoryFactory(charFilterName);
if (charFilterFactoryFactory == null) {
throw new ElasticsearchIllegalArgumentException("failed to find global char filter under [" + charFilterName + "]");
}
charFilterFactories[i] = charFilterFactoryFactory.create(charFilterName, ImmutableSettings.Builder.EMPTY_SETTINGS);
} else {
charFilterFactories[i] = indexService.analysisService().charFilter(charFilterName);
if (charFilterFactories[i] == null) {
throw new ElasticsearchIllegalArgumentException("failed to find char filter under [" + charFilterName + "]");
}
}
if (charFilterFactories[i] == null) {
throw new ElasticsearchIllegalArgumentException("failed to find char filter under [" + charFilterName + "]");
}
}
}

+            analyzer = new CustomAnalyzer(tokenizerFactory, charFilterFactories, tokenFilterFactories);
closeAnalyzer = true;
} else if (analyzer == null) {
if (indexService == null) {
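For context on why the char filter factories are passed first: `CustomAnalyzer` applies char filters to the raw character stream before the tokenizer runs, which is how `html_strip` can remove tags even under the `keyword` tokenizer. A standalone illustration using the Lucene 4.x class that backs `html_strip` (illustrative sketch, not code from this PR):

```java
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;

public class CharFilterDemo {
    public static void main(String[] args) throws Exception {
        // A char filter rewrites the character stream before any tokenizer sees it.
        Reader stripped = new HTMLStripCharFilter(new StringReader("this is a <b>test</b>"));
        StringBuilder sb = new StringBuilder();
        for (int c = stripped.read(); c != -1; c = stripped.read()) {
            sb.append((char) c);
        }
        System.out.println(sb); // approximately "this is a test"
    }
}
```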
src/main/java/org/elasticsearch/rest/action/admin/indices/analyze/RestAnalyzeAction.java

@@ -70,7 +70,8 @@ public void handleRequest(final RestRequest request, final RestChannel channel)
analyzeRequest.analyzer(request.param("analyzer"));
analyzeRequest.field(request.param("field"));
analyzeRequest.tokenizer(request.param("tokenizer"));
-        analyzeRequest.tokenFilters(request.paramAsStringArray("token_filters", request.paramAsStringArray("filters", null)));
+        analyzeRequest.tokenFilters(request.paramAsStringArray("token_filters", request.paramAsStringArray("filters", analyzeRequest.tokenFilters())));
analyzeRequest.charFilters(request.paramAsStringArray("char_filters", analyzeRequest.charFilters()));
client.admin().indices().analyze(analyzeRequest, new ActionListener<AnalyzeResponse>() {
@Override
public void onResponse(AnalyzeResponse response) {
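One subtlety worth spelling out: `paramAsStringArray` returns its second argument only when the parameter is absent, so `token_filters` wins over the legacy `filters` name, and both fall back to the request's empty-array default instead of null. A behavior-equivalent rewrite (hypothetical, for clarity only):

```java
// Equivalent to the nested paramAsStringArray calls above:
String[] legacy = request.paramAsStringArray("filters", analyzeRequest.tokenFilters());
String[] tokenFilters = request.paramAsStringArray("token_filters", legacy); // preferred name wins
analyzeRequest.tokenFilters(tokenFilters);
```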
(analyze API integration tests)

@@ -23,12 +23,27 @@
import org.elasticsearch.action.admin.indices.analyze.AnalyzeRequestBuilder;
import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse;
import org.elasticsearch.common.Priority;
import org.elasticsearch.common.inject.Injector;
import org.elasticsearch.common.inject.ModulesBuilder;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.settings.SettingsModule;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.env.Environment;
import org.elasticsearch.env.EnvironmentModule;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.IndexNameModule;
import org.elasticsearch.index.analysis.AnalysisModule;
import org.elasticsearch.index.analysis.AnalysisService;
import org.elasticsearch.index.settings.IndexSettingsModule;
import org.elasticsearch.indices.analysis.IndicesAnalysisModule;
import org.elasticsearch.indices.analysis.IndicesAnalysisService;
import org.elasticsearch.test.ElasticsearchIntegrationTest;
import org.junit.Test;

import java.io.IOException;

import static org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder;
import static org.hamcrest.Matchers.equalTo;

/**
@@ -106,6 +121,50 @@ public void analyzeWithNoIndex() throws Exception {
analyzeResponse = client().admin().indices().prepareAnalyze("THIS IS A TEST").setTokenizer("keyword").setTokenFilters("lowercase").execute().actionGet();
assertThat(analyzeResponse.getTokens().size(), equalTo(1));
assertThat(analyzeResponse.getTokens().get(0).getTerm(), equalTo("this is a test"));

analyzeResponse = client().admin().indices().prepareAnalyze("THIS IS A TEST").setTokenizer("standard").setTokenFilters("lowercase", "reverse").execute().actionGet();
assertThat(analyzeResponse.getTokens().size(), equalTo(4));
AnalyzeResponse.AnalyzeToken token = analyzeResponse.getTokens().get(0);
assertThat(token.getTerm(), equalTo("siht"));
token = analyzeResponse.getTokens().get(1);
assertThat(token.getTerm(), equalTo("si"));
token = analyzeResponse.getTokens().get(2);
assertThat(token.getTerm(), equalTo("a"));
token = analyzeResponse.getTokens().get(3);
assertThat(token.getTerm(), equalTo("tset"));
}

@Test
public void analyzeWithCharFilters() throws Exception {

ImmutableSettings.Builder settings = settingsBuilder()
.put("index.analysis.char_filter.custom_mapping.type", "mapping")
.putArray("index.analysis.char_filter.custom_mapping.mappings", "ph=>f", "qu=>q")
.put("index.analysis.analyzer.custom_with_char_filter.tokenizer", "standard")
.putArray("index.analysis.analyzer.custom_with_char_filter.char_filter", "custom_mapping");

prepareCreate("test", 1, settings).execute().actionGet();
client().admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID).setWaitForGreenStatus().execute().actionGet();

AnalyzeResponse analyzeResponse = client().admin().indices().prepareAnalyze("<h2><b>THIS</b> IS A</h2> <a href=\"#\">TEST</a>").setTokenizer("standard").setCharFilters("html_strip").execute().actionGet();
assertThat(analyzeResponse.getTokens().size(), equalTo(4));

analyzeResponse = client().admin().indices().prepareAnalyze("THIS IS A <b>TEST</b>").setTokenizer("keyword").setTokenFilters("lowercase").setCharFilters("html_strip").execute().actionGet();
assertThat(analyzeResponse.getTokens().size(), equalTo(1));
assertThat(analyzeResponse.getTokens().get(0).getTerm(), equalTo("this is a test"));

analyzeResponse = client().admin().indices().prepareAnalyze("test", "jeff quit phish").setTokenizer("keyword").setTokenFilters("lowercase").setCharFilters("custom_mapping").execute().actionGet();
assertThat(analyzeResponse.getTokens().size(), equalTo(1));
assertThat(analyzeResponse.getTokens().get(0).getTerm(), equalTo("jeff qit fish"));

analyzeResponse = client().admin().indices().prepareAnalyze("test", "<a href=\"#\">jeff quit fish</a>").setTokenizer("standard").setCharFilters("html_strip", "custom_mapping").execute().actionGet();
assertThat(analyzeResponse.getTokens().size(), equalTo(3));
AnalyzeResponse.AnalyzeToken token = analyzeResponse.getTokens().get(0);
assertThat(token.getTerm(), equalTo("jeff"));
token = analyzeResponse.getTokens().get(1);
assertThat(token.getTerm(), equalTo("qit"));
token = analyzeResponse.getTokens().get(2);
assertThat(token.getTerm(), equalTo("fish"));
}
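The `custom_mapping` assertions follow mechanically from the rules registered above: `ph=>f` and `qu=>q` rewrite "jeff quit phish" to "jeff qit fish" before the tokenizer runs. The same transform can be reproduced with the Lucene 4.x classes that back the `mapping` char filter type (illustrative sketch):

```java
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;

public class MappingCharFilterDemo {
    public static void main(String[] args) throws Exception {
        NormalizeCharMap.Builder mappings = new NormalizeCharMap.Builder();
        mappings.add("ph", "f");
        mappings.add("qu", "q");
        // The char filter rewrites the stream before tokenization.
        Reader mapped = new MappingCharFilter(mappings.build(), new StringReader("jeff quit phish"));
        StringBuilder sb = new StringBuilder();
        for (int c = mapped.read(); c != -1; c = mapped.read()) {
            sb.append((char) c);
        }
        System.out.println(sb); // "jeff qit fish"
    }
}
```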

[Inline review comment from a Member: Maybe we could also test the case where we have a custom char filter registered under a specific index?]

@Test