From 6a60f85bba15d1edca4452d18be251e247ae312f Mon Sep 17 00:00:00 2001 From: markharwood Date: Tue, 24 Mar 2020 17:37:47 +0000 Subject: [PATCH] Wildcard field - add normalizer support (#53851) (#54109) Backport support for normalisation to wildcard field Closes #53603 --- .../reference/mapping/types/wildcard.asciidoc | 17 +++ .../index/mapper/MappedFieldType.java | 4 +- .../index/mapper/StringFieldType.java | 57 +++++----- .../index/query/QueryBuilders.java | 2 +- .../index/query/PrefixQueryBuilderTests.java | 2 +- .../query/QueryStringQueryBuilderTests.java | 2 +- .../test/wildcard/10_wildcard_basic.yml | 62 +++++++++-- .../wildcard/mapper/WildcardFieldMapper.java | 101 +++++++++++++++++- 8 files changed, 204 insertions(+), 43 deletions(-) diff --git a/docs/reference/mapping/types/wildcard.asciidoc b/docs/reference/mapping/types/wildcard.asciidoc index fae5b90805b93..54f1eae2d3c39 100644 --- a/docs/reference/mapping/types/wildcard.asciidoc +++ b/docs/reference/mapping/types/wildcard.asciidoc @@ -50,6 +50,23 @@ POST my_index/_doc/_search -------------------------------------------------- +[[wildcard-params]] +==== Parameters for wildcard fields + +The following parameters are accepted by `wildcard` fields: + +[horizontal] + +<<ignore-above,`ignore_above`>>:: + + Do not index any string longer than this value. Defaults to `2147483647` + so that all values would be accepted. + +<<normalizer,`normalizer`>>:: + + How to pre-process the value prior to indexing. Defaults to `null`, + meaning the value is kept as-is. + ==== Limitations * `wildcard` fields are untokenized like keyword fields, so do not support queries that rely on word positions such as phrase queries. 
diff --git a/server/src/main/java/org/elasticsearch/index/mapper/MappedFieldType.java b/server/src/main/java/org/elasticsearch/index/mapper/MappedFieldType.java index a87302a6af498..fcaaac99511e3 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/MappedFieldType.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/MappedFieldType.java @@ -358,14 +358,14 @@ public Query fuzzyQuery(Object value, Fuzziness fuzziness, int prefixLength, int } public Query prefixQuery(String value, @Nullable MultiTermQuery.RewriteMethod method, QueryShardContext context) { - throw new QueryShardException(context, "Can only use prefix queries on keyword and text fields - not on [" + name + throw new QueryShardException(context, "Can only use prefix queries on keyword, text and wildcard fields - not on [" + name + "] which is of type [" + typeName() + "]"); } public Query wildcardQuery(String value, @Nullable MultiTermQuery.RewriteMethod method, QueryShardContext context) { - throw new QueryShardException(context, "Can only use wildcard queries on keyword and text fields - not on [" + name + throw new QueryShardException(context, "Can only use wildcard queries on keyword, text and wildcard fields - not on [" + name + "] which is of type [" + typeName() + "]"); } diff --git a/server/src/main/java/org/elasticsearch/index/mapper/StringFieldType.java b/server/src/main/java/org/elasticsearch/index/mapper/StringFieldType.java index 05bf6b61d1de1..37c20b236ed56 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/StringFieldType.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/StringFieldType.java @@ -19,6 +19,7 @@ package org.elasticsearch.index.mapper; +import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.index.Term; import org.apache.lucene.search.FuzzyQuery; import org.apache.lucene.search.MultiTermQuery; @@ -93,6 +94,36 @@ public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, Quer return query; } + public 
static final String normalizeWildcardPattern(String fieldname, String value, Analyzer normalizer) { + if (normalizer == null) { + return value; + } + // we want to normalize everything except wildcard characters, e.g. F?o Ba* to f?o ba*, even if e.g there + // is a char_filter that would otherwise remove them + Matcher wildcardMatcher = WILDCARD_PATTERN.matcher(value); + BytesRefBuilder sb = new BytesRefBuilder(); + int last = 0; + + while (wildcardMatcher.find()) { + if (wildcardMatcher.start() > 0) { + String chunk = value.substring(last, wildcardMatcher.start()); + + BytesRef normalized = normalizer.normalize(fieldname, chunk); + sb.append(normalized); + } + // append the matched group - without normalizing + sb.append(new BytesRef(wildcardMatcher.group())); + + last = wildcardMatcher.end(); + } + if (last < value.length()) { + String chunk = value.substring(last); + BytesRef normalized = normalizer.normalize(fieldname, chunk); + sb.append(normalized); + } + return sb.toBytesRef().utf8ToString(); + } + @Override public Query wildcardQuery(String value, MultiTermQuery.RewriteMethod method, QueryShardContext context) { failIfNotIndexed(); @@ -103,30 +134,8 @@ public Query wildcardQuery(String value, MultiTermQuery.RewriteMethod method, Qu Term term; if (searchAnalyzer() != null) { - // we want to normalize everything except wildcard characters, e.g. 
F?o Ba* to f?o ba*, even if e.g there - // is a char_filter that would otherwise remove them - Matcher wildcardMatcher = WILDCARD_PATTERN.matcher(value); - BytesRefBuilder sb = new BytesRefBuilder(); - int last = 0; - - while (wildcardMatcher.find()) { - if (wildcardMatcher.start() > 0) { - String chunk = value.substring(last, wildcardMatcher.start()); - - BytesRef normalized = searchAnalyzer().normalize(name(), chunk); - sb.append(normalized); - } - // append the matched group - without normalizing - sb.append(new BytesRef(wildcardMatcher.group())); - - last = wildcardMatcher.end(); - } - if (last < value.length()) { - String chunk = value.substring(last); - BytesRef normalized = searchAnalyzer().normalize(name(), chunk); - sb.append(normalized); - } - term = new Term(name(), sb.toBytesRef()); + value = normalizeWildcardPattern(name(), value, searchAnalyzer()); + term = new Term(name(), value); } else { term = new Term(name(), indexedValueForSearch(value)); } diff --git a/server/src/main/java/org/elasticsearch/index/query/QueryBuilders.java b/server/src/main/java/org/elasticsearch/index/query/QueryBuilders.java index f43c172161865..95d2f38023026 100644 --- a/server/src/main/java/org/elasticsearch/index/query/QueryBuilders.java +++ b/server/src/main/java/org/elasticsearch/index/query/QueryBuilders.java @@ -264,7 +264,7 @@ public static RangeQueryBuilder rangeQuery(String name) { * which matches any single character. Note this query can be slow, as it * needs to iterate over many terms. In order to prevent extremely slow WildcardQueries, * a Wildcard term should not start with one of the wildcards {@code *} or - * {@code ?}. + * {@code ?}. 
(The wildcard field type however, is optimised for leading wildcards) * * @param name The field name * @param query The wildcard query string diff --git a/server/src/test/java/org/elasticsearch/index/query/PrefixQueryBuilderTests.java b/server/src/test/java/org/elasticsearch/index/query/PrefixQueryBuilderTests.java index 94596ffd6c58d..4253eaded3056 100644 --- a/server/src/test/java/org/elasticsearch/index/query/PrefixQueryBuilderTests.java +++ b/server/src/test/java/org/elasticsearch/index/query/PrefixQueryBuilderTests.java @@ -116,7 +116,7 @@ public void testNumeric() throws Exception { QueryShardContext context = createShardContext(); QueryShardException e = expectThrows(QueryShardException.class, () -> query.toQuery(context)); - assertEquals("Can only use prefix queries on keyword and text fields - not on [mapped_int] which is of type [integer]", + assertEquals("Can only use prefix queries on keyword, text and wildcard fields - not on [mapped_int] which is of type [integer]", e.getMessage()); } diff --git a/server/src/test/java/org/elasticsearch/index/query/QueryStringQueryBuilderTests.java b/server/src/test/java/org/elasticsearch/index/query/QueryStringQueryBuilderTests.java index a84a82a3cc49b..3f10c64d446b4 100644 --- a/server/src/test/java/org/elasticsearch/index/query/QueryStringQueryBuilderTests.java +++ b/server/src/test/java/org/elasticsearch/index/query/QueryStringQueryBuilderTests.java @@ -816,7 +816,7 @@ public void testPrefixNumeric() throws Exception { QueryShardContext context = createShardContext(); QueryShardException e = expectThrows(QueryShardException.class, () -> query.toQuery(context)); - assertEquals("Can only use prefix queries on keyword and text fields - not on [mapped_int] which is of type [integer]", + assertEquals("Can only use prefix queries on keyword, text and wildcard fields - not on [mapped_int] which is of type [integer]", e.getMessage()); query.lenient(true); query.toQuery(context); // no exception diff --git 
a/x-pack/plugin/src/test/resources/rest-api-spec/test/wildcard/10_wildcard_basic.yml b/x-pack/plugin/src/test/resources/rest-api-spec/test/wildcard/10_wildcard_basic.yml index bdec75fc54837..486082ef3ab59 100644 --- a/x-pack/plugin/src/test/resources/rest-api-spec/test/wildcard/10_wildcard_basic.yml +++ b/x-pack/plugin/src/test/resources/rest-api-spec/test/wildcard/10_wildcard_basic.yml @@ -10,10 +10,20 @@ setup: body: settings: number_of_replicas: 0 + analysis: + normalizer: + lowercase: + type: custom + char_filter: [] + filter: ["lowercase"] mappings: properties: my_wildcard: type: wildcard + normalizer: lowercase + fields: + case_sensitive: + type: wildcard - do: index: index: test-index @@ -26,6 +36,12 @@ setup: id: 2 body: my_wildcard: goodbye world + - do: + index: + index: test-index + id: 3 + body: + my_wildcard: cAsE iNsEnSiTiVe World - do: indices.refresh: {} @@ -80,6 +96,31 @@ setup: my_wildcard: {value: "*ello worl*" } + - match: {hits.total.value: 1} +--- +"Case insensitive query": + - do: + search: + body: + track_total_hits: true + query: + wildcard: + my_wildcard: {value: "*Worl*" } + + + - match: {hits.total.value: 3} + +--- +"Case sensitive query": + - do: + search: + body: + track_total_hits: true + query: + wildcard: + my_wildcard.case_sensitive: {value: "*Worl*" } + + - match: {hits.total.value: 1} --- @@ -93,7 +134,7 @@ setup: my_wildcard: {value: "*ld" } - - match: {hits.total.value: 2} + - match: {hits.total.value: 3} --- "Long suffix query": @@ -188,8 +229,8 @@ setup: terms: {field: "my_wildcard" } - - match: {hits.total.value: 2} - - length: { aggregations.top_vals.buckets: 2 } + - match: {hits.total.value: 3} + - length: { aggregations.top_vals.buckets: 3 } --- "Sort works": @@ -199,10 +240,11 @@ setup: track_total_hits: true sort: [ { "my_wildcard": "desc" } ] - - match: { hits.total.value: 2 } - - length: { hits.hits: 2 } + - match: { hits.total.value: 3 } + - length: { hits.hits: 3 } - match: { hits.hits.0._id: "1" } - match: { 
hits.hits.1._id: "2" } + - match: { hits.hits.2._id: "3" } - do: search: @@ -210,9 +252,9 @@ setup: track_total_hits: true sort: [ { "my_wildcard": "asc" } ] - - match: { hits.total.value: 2 } - - length: { hits.hits: 2 } - - match: { hits.hits.0._id: "2" } - - match: { hits.hits.1._id: "1" } - + - match: { hits.total.value: 3 } + - length: { hits.hits: 3 } + - match: { hits.hits.0._id: "3" } + - match: { hits.hits.1._id: "2" } + - match: { hits.hits.2._id: "1" } diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java index e489d8a35bb9f..568e4f502dbe0 100644 --- a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java @@ -39,6 +39,7 @@ import org.elasticsearch.index.Index; import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.analysis.AnalyzerScope; +import org.elasticsearch.index.analysis.IndexAnalyzers; import org.elasticsearch.index.analysis.NamedAnalyzer; import org.elasticsearch.index.fielddata.IndexFieldData; import org.elasticsearch.index.fielddata.IndexFieldData.XFieldComparatorSource.Nested; @@ -53,6 +54,7 @@ import org.elasticsearch.index.mapper.MapperService; import org.elasticsearch.index.mapper.ParseContext; import org.elasticsearch.index.mapper.ParseContext.Document; +import org.elasticsearch.index.mapper.StringFieldType; import org.elasticsearch.index.query.QueryShardContext; import org.elasticsearch.index.similarity.SimilarityProvider; import org.elasticsearch.indices.breaker.CircuitBreakerService; @@ -64,6 +66,7 @@ import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Objects; import static org.elasticsearch.index.mapper.TypeParsers.parseField; @@ -100,6 +103,9 @@ public static 
class Defaults { public static class Builder extends FieldMapper.Builder { protected int ignoreAbove = Defaults.IGNORE_ABOVE; + private IndexAnalyzers indexAnalyzers; + private String normalizerName; + public Builder(String name) { super(name, Defaults.FIELD_TYPE, Defaults.FIELD_TYPE); @@ -164,10 +170,23 @@ protected void setupFieldType(BuilderContext context) { public WildcardFieldType fieldType() { return (WildcardFieldType) super.fieldType(); } + + public Builder normalizer(IndexAnalyzers indexAnalyzers, String name) { + this.indexAnalyzers = indexAnalyzers; + this.normalizerName = name; + return builder; + } @Override public WildcardFieldMapper build(BuilderContext context) { - setupFieldType(context); + setupFieldType(context); + if (normalizerName != null) { + NamedAnalyzer normalizer = indexAnalyzers.getNormalizer(normalizerName); + if (normalizer == null) { + throw new MapperParsingException("normalizer [" + normalizerName + "] not found for field [" + name + "]"); + } + fieldType().setNormalizer(normalizer); + } return new WildcardFieldMapper( name, fieldType, defaultFieldType, ignoreAbove, context.indexSettings(), multiFieldsBuilder.build(this, context), copyTo); @@ -188,6 +207,11 @@ public static class TypeParser implements Mapper.TypeParser { if (propName.equals("ignore_above")) { builder.ignoreAbove(XContentMapValues.nodeIntegerValue(propNode, -1)); iterator.remove(); + } else if (propName.equals("normalizer")) { + if (propNode != null) { + builder.normalizer(parserContext.getIndexAnalyzers(), propNode.toString()); + } + iterator.remove(); } } @@ -198,6 +222,8 @@ public static class TypeParser implements Mapper.TypeParser { public static final char TOKEN_START_OR_END_CHAR = 0; public static final class WildcardFieldType extends MappedFieldType { + + private NamedAnalyzer normalizer = null; public WildcardFieldType() { setIndexAnalyzer(Lucene.KEYWORD_ANALYZER); @@ -206,6 +232,7 @@ public WildcardFieldType() { protected 
WildcardFieldType(WildcardFieldType ref) { super(ref); + this.normalizer = ref.normalizer; } public WildcardFieldType clone() { @@ -213,7 +240,39 @@ public WildcardFieldType clone() { return result; } - + + @Override + public boolean equals(Object o) { + if (super.equals(o) == false) { + return false; + } + WildcardFieldType other = (WildcardFieldType) o; + return Objects.equals(normalizer, other.normalizer); + } + + @Override + public int hashCode() { + return 31 * super.hashCode() + Objects.hash(normalizer); + } + + private NamedAnalyzer normalizer() { + return normalizer; + } + + public void setNormalizer(NamedAnalyzer normalizer) { + checkIfFrozen(); + this.normalizer = normalizer; + } + + @Override + public void checkCompatibility(MappedFieldType otherFT, List conflicts) { + super.checkCompatibility(otherFT, conflicts); + WildcardFieldType other = (WildcardFieldType) otherFT; + if (Objects.equals(normalizer, other.normalizer) == false) { + conflicts.add("mapper [" + name() + "] has different [normalizer]"); + } + } + // Holds parsed information about the wildcard pattern static class PatternStructure { boolean openStart, openEnd, hasSymbols; @@ -327,6 +386,9 @@ public boolean equals(Object obj) { @Override public Query wildcardQuery(String wildcardPattern, RewriteMethod method, QueryShardContext context) { + if (normalizer != null) { + wildcardPattern = StringFieldType.normalizeWildcardPattern(name(), wildcardPattern, normalizer); + } PatternStructure patternStructure = new PatternStructure(wildcardPattern); ArrayList tokens = new ArrayList<>(); @@ -467,7 +529,32 @@ public IndexFieldData build(IndexSettings indexSettings, MappedFieldType fiel CircuitBreakerService breakerService, MapperService mapperService) { return new WildcardBytesBinaryDVIndexFieldData(indexSettings.getIndex(), fieldType.name()); }}; - } + } + + + String normalize(String value) throws IOException { + if (normalizer != null) { + try (TokenStream ts = normalizer.tokenStream(name(), value)) { 
+ final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); + ts.reset(); + if (ts.incrementToken() == false) { + throw new IllegalStateException("The normalization token stream is " + + "expected to produce exactly 1 token, but got 0 for analyzer " + + normalizer + " and input \"" + value + "\""); + } + final String newValue = termAtt.toString(); + if (ts.incrementToken()) { + throw new IllegalStateException("The normalization token stream is " + + "expected to produce exactly 1 token, but got 2+ for analyzer " + + normalizer + " and input \"" + value + "\""); + } + ts.end(); + return newValue; + } + } + return value; + } + } static class WildcardBytesBinaryDVIndexFieldData extends BytesBinaryDVIndexFieldData{ @@ -521,6 +608,11 @@ protected void doXContentBody(XContentBuilder builder, boolean includeDefaults, if (includeDefaults || ignoreAbove != Defaults.IGNORE_ABOVE) { builder.field("ignore_above", ignoreAbove); } + if (fieldType().normalizer() != null) { + builder.field("normalizer", fieldType().normalizer().name()); + } else if (includeDefaults) { + builder.nullField("normalizer"); + } } @Override @@ -544,10 +636,11 @@ protected void parseCreateField(ParseContext context, List field // For internal use by Lucene only - used to define ngram index final MappedFieldType ngramFieldType; - void createFields(String value, Document parseDoc, Listfields) { + void createFields(String value, Document parseDoc, Listfields) throws IOException { if (value == null || value.length() > ignoreAbove) { return; } + value = fieldType().normalize(value); String ngramValue = TOKEN_START_OR_END_CHAR + value + TOKEN_START_OR_END_CHAR + TOKEN_START_OR_END_CHAR; Field ngramField = new Field(fieldType().name(), ngramValue, ngramFieldType); fields.add(ngramField);