Skip to content

Commit

Permalink
Search - add case insensitive flag for "term" family of queries #61596 (
Browse files Browse the repository at this point in the history
#62661)

Backport of fe9145f

Closes #61546
  • Loading branch information
markharwood committed Sep 22, 2020
1 parent 0d5250c commit a0df0fb
Show file tree
Hide file tree
Showing 47 changed files with 883 additions and 137 deletions.
4 changes: 4 additions & 0 deletions docs/reference/query-dsl/prefix-query.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,10 @@ provided `<field>`.
(Optional, string) Method used to rewrite the query. For valid values and more
information, see the <<query-dsl-multi-term-rewrite, `rewrite` parameter>>.

`case_insensitive`::
(Optional, boolean) allows ASCII case insensitive matching of the
value with the indexed field values when set to true. Setting to false is disallowed.

[[prefix-query-notes]]
==== Notes

Expand Down
8 changes: 6 additions & 2 deletions docs/reference/query-dsl/term-query.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,10 @@ Boost values are relative to the default value of `1.0`. A boost value between
`0` and `1.0` decreases the relevance score. A value greater than `1.0`
increases the relevance score.

`case_insensitive`::
(Optional, boolean) allows ASCII case insensitive matching of the
value with the indexed field values when set to true. Setting to false is disallowed.

[[term-query-notes]]
==== Notes

Expand All @@ -84,7 +88,7 @@ The `term` query does *not* analyze the search term. The `term` query only
searches for the *exact* term you provide. This means the `term` query may
return poor or no results when searching `text` fields.

To see the difference in search results, try the following example.
To see the difference in search results, try the following example.

. Create an index with a `text` field called `full_text`.
+
Expand Down Expand Up @@ -214,4 +218,4 @@ in the results.
}
----
// TESTRESPONSE[s/"took" : 1/"took" : $body.took/]
--
--
6 changes: 5 additions & 1 deletion docs/reference/query-dsl/wildcard-query.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ This parameter supports two wildcard operators:

WARNING: Avoid beginning patterns with `*` or `?`. This can increase
the iterations needed to find matching terms and slow search performance.
--
--

`boost`::
(Optional, float) Floating point number used to decrease or increase the
Expand All @@ -69,6 +69,10 @@ increases the relevance score.
(Optional, string) Method used to rewrite the query. For valid values and more information, see the
<<query-dsl-multi-term-rewrite, `rewrite` parameter>>.

`case_insensitive`::
(Optional, boolean) allows case insensitive matching of the
pattern with the indexed field values when set to true. Setting to false is disallowed.

[[wildcard-query-notes]]
==== Notes
===== Allow expensive queries
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -281,11 +281,11 @@ public Query existsQuery(QueryShardContext context) {
}

@Override
public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, QueryShardContext context) {
public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, boolean caseInsensitive, QueryShardContext context) {
if (prefixField == null || prefixField.termLengthWithinBounds(value.length()) == false) {
return super.prefixQuery(value, method, context);
return super.prefixQuery(value, method, caseInsensitive, context);
} else {
final Query query = prefixField.prefixQuery(value, method, context);
final Query query = prefixField.prefixQuery(value, method, caseInsensitive, context);
if (method == null
|| method == MultiTermQuery.CONSTANT_SCORE_REWRITE
|| method == MultiTermQuery.CONSTANT_SCORE_BOOLEAN_REWRITE) {
Expand Down Expand Up @@ -365,8 +365,11 @@ boolean termLengthWithinBounds(int length) {
}

@Override
public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, QueryShardContext context) {
public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, boolean caseInsensitive, QueryShardContext context) {
if (value.length() >= minChars) {
if(caseInsensitive) {
return super.termQueryCaseInsensitive(value, context);
}
return super.termQuery(value, context);
}
List<Automaton> automata = new ArrayList<>();
Expand Down Expand Up @@ -507,11 +510,11 @@ public Query existsQuery(QueryShardContext context) {
}

@Override
public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, QueryShardContext context) {
public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, boolean caseInsensitive, QueryShardContext context) {
if (prefixFieldType == null || prefixFieldType.termLengthWithinBounds(value.length()) == false) {
return super.prefixQuery(value, method, context);
return super.prefixQuery(value, method, caseInsensitive, context);
} else {
final Query query = prefixFieldType.prefixQuery(value, method, context);
final Query query = prefixFieldType.prefixQuery(value, method, caseInsensitive, context);
if (method == null
|| method == MultiTermQuery.CONSTANT_SCORE_REWRITE
|| method == MultiTermQuery.CONSTANT_SCORE_BOOLEAN_REWRITE) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -136,13 +136,15 @@ public Query fuzzyQuery(Object value, Fuzziness fuzziness, int prefixLength, int
}

@Override
public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, QueryShardContext context) {
public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method,
boolean caseInsensitive, QueryShardContext context) {
throw new UnsupportedOperationException("[prefix] queries are not supported on [" + CONTENT_TYPE + "] fields.");
}

@Override
public Query wildcardQuery(String value,
@Nullable MultiTermQuery.RewriteMethod method,
boolean caseInsensitive,
QueryShardContext context) {
throw new UnsupportedOperationException("[wildcard] queries are not supported on [" + CONTENT_TYPE + "] fields.");
}
Expand Down
14 changes: 14 additions & 0 deletions server/src/main/java/org/elasticsearch/common/Strings.java
Original file line number Diff line number Diff line change
Expand Up @@ -879,4 +879,18 @@ public static String padStart(String s, int minimumLength, char c) {
return sb.toString();
}
}

public static String toLowercaseAscii(String in) {
StringBuilder out = new StringBuilder();
Iterator<Integer> iter = in.codePoints().iterator();
while (iter.hasNext()) {
int codepoint = iter.next();
if (codepoint > 128) {
out.appendCodePoint(codepoint);
} else {
out.appendCodePoint(Character.toLowerCase(codepoint));
}
}
return out.toString();
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.common.lucene.search;

import org.apache.lucene.index.Term;
import org.apache.lucene.search.AutomatonQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.MinimizationOperations;
import org.apache.lucene.util.automaton.Operations;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

/**
* Helper functions for creating various forms of {@link AutomatonQuery}
*/
public class AutomatonQueries {



/** Build an automaton query accepting all terms with the specified prefix, ASCII case insensitive. */
public static Automaton caseInsensitivePrefix(String s) {
List<Automaton> list = new ArrayList<>();
Iterator<Integer> iter = s.codePoints().iterator();
while (iter.hasNext()) {
list.add(toCaseInsensitiveChar(iter.next(), Integer.MAX_VALUE));
}
list.add(Automata.makeAnyString());

Automaton a = Operations.concatenate(list);
a = MinimizationOperations.minimize(a, Integer.MAX_VALUE);
return a;
}


/** Build an automaton query accepting all terms with the specified prefix, ASCII case insensitive. */
public static AutomatonQuery caseInsensitivePrefixQuery(Term prefix) {
return new AutomatonQuery(prefix, caseInsensitivePrefix(prefix.text()));
}

/** Build an automaton accepting all terms ASCII case insensitive. */
public static AutomatonQuery caseInsensitiveTermQuery(Term term) {
BytesRef prefix = term.bytes();
return new AutomatonQuery(term, toCaseInsensitiveString(prefix,Integer.MAX_VALUE));
}


/** Build an automaton matching a wildcard pattern, ASCII case insensitive. */
public static AutomatonQuery caseInsensitiveWildcardQuery(Term wildcardquery) {
return new AutomatonQuery(wildcardquery, toCaseInsensitiveWildcardAutomaton(wildcardquery,Integer.MAX_VALUE));
}


/** String equality with support for wildcards */
public static final char WILDCARD_STRING = '*';

/** Char equality with support for wildcards */
public static final char WILDCARD_CHAR = '?';

/** Escape character */
public static final char WILDCARD_ESCAPE = '\\';
/**
* Convert Lucene wildcard syntax into an automaton.
*/
@SuppressWarnings("fallthrough")
public static Automaton toCaseInsensitiveWildcardAutomaton(Term wildcardquery, int maxDeterminizedStates) {
List<Automaton> automata = new ArrayList<>();

String wildcardText = wildcardquery.text();

for (int i = 0; i < wildcardText.length();) {
final int c = wildcardText.codePointAt(i);
int length = Character.charCount(c);
switch(c) {
case WILDCARD_STRING:
automata.add(Automata.makeAnyString());
break;
case WILDCARD_CHAR:
automata.add(Automata.makeAnyChar());
break;
case WILDCARD_ESCAPE:
// add the next codepoint instead, if it exists
if (i + length < wildcardText.length()) {
final int nextChar = wildcardText.codePointAt(i + length);
length += Character.charCount(nextChar);
automata.add(Automata.makeChar(nextChar));
break;
} // else fallthru, lenient parsing with a trailing \
default:
automata.add(toCaseInsensitiveChar(c, maxDeterminizedStates));
}
i += length;
}

return Operations.concatenate(automata);
}

protected static Automaton toCaseInsensitiveString(BytesRef br, int maxDeterminizedStates) {
return toCaseInsensitiveString(br.utf8ToString(), maxDeterminizedStates);
}

public static Automaton toCaseInsensitiveString(String s, int maxDeterminizedStates) {
List<Automaton> list = new ArrayList<>();
Iterator<Integer> iter = s.codePoints().iterator();
while (iter.hasNext()) {
list.add(toCaseInsensitiveChar(iter.next(), maxDeterminizedStates));
}

Automaton a = Operations.concatenate(list);
a = MinimizationOperations.minimize(a, maxDeterminizedStates);
return a;


}

protected static Automaton toCaseInsensitiveChar(int codepoint, int maxDeterminizedStates) {
Automaton case1 = Automata.makeChar(codepoint);
// For now we only work with ASCII characters
if (codepoint > 128) {
return case1;
}
int altCase = Character.isLowerCase(codepoint) ? Character.toUpperCase(codepoint) : Character.toLowerCase(codepoint);
Automaton result;
if (altCase != codepoint) {
result = Operations.union(case1, Automata.makeChar(altCase));
result = MinimizationOperations.minimize(result, maxDeterminizedStates);
} else {
result = case1;
}
return result;
}
}
34 changes: 29 additions & 5 deletions server/src/main/java/org/elasticsearch/common/regex/Regex.java
Original file line number Diff line number Diff line change
Expand Up @@ -79,15 +79,39 @@ public static Automaton simpleMatchToAutomaton(String... patterns) {
* Match a String against the given pattern, supporting the following simple
* pattern styles: "xxx*", "*xxx", "*xxx*" and "xxx*yyy" matches (with an
* arbitrary number of pattern parts), as well as direct equality.
* Matching is case sensitive.
*
* @param pattern the pattern to match against
* @param str the String to match
* @return whether the String matches the given pattern
*/
public static boolean simpleMatch(String pattern, String str) {
return simpleMatch(pattern, str, false);
}


/**
* Match a String against the given pattern, supporting the following simple
* pattern styles: "xxx*", "*xxx", "*xxx*" and "xxx*yyy" matches (with an
* arbitrary number of pattern parts), as well as direct equality.
*
* @param pattern the pattern to match against
* @param str the String to match
* @param caseInsensitive true if ASCII case differences should be ignored
* @return whether the String matches the given pattern
*/
public static boolean simpleMatch(String pattern, String str, boolean caseInsensitive) {
if (pattern == null || str == null) {
return false;
}
if (caseInsensitive) {
pattern = Strings.toLowercaseAscii(pattern);
str = Strings.toLowercaseAscii(str);
}
return simpleMatchWithNormalizedStrings(pattern, str);
}

private static boolean simpleMatchWithNormalizedStrings(String pattern, String str) {
final int firstIndex = pattern.indexOf('*');
if (firstIndex == -1) {
return pattern.equals(str);
Expand All @@ -102,12 +126,12 @@ public static boolean simpleMatch(String pattern, String str) {
return str.regionMatches(str.length() - pattern.length() + 1, pattern, 1, pattern.length() - 1);
} else if (nextIndex == 1) {
// Double wildcard "**" - skipping the first "*"
return simpleMatch(pattern.substring(1), str);
return simpleMatchWithNormalizedStrings(pattern.substring(1), str);
}
final String part = pattern.substring(1, nextIndex);
int partIndex = str.indexOf(part);
while (partIndex != -1) {
if (simpleMatch(pattern.substring(nextIndex), str.substring(partIndex + part.length()))) {
if (simpleMatchWithNormalizedStrings(pattern.substring(nextIndex), str.substring(partIndex + part.length()))) {
return true;
}
partIndex = str.indexOf(part, partIndex + 1);
Expand All @@ -116,9 +140,9 @@ public static boolean simpleMatch(String pattern, String str) {
}
return str.regionMatches(0, pattern, 0, firstIndex)
&& (firstIndex == pattern.length() - 1 // only wildcard in pattern is at the end, so no need to look at the rest of the string
|| simpleMatch(pattern.substring(firstIndex), str.substring(firstIndex)));
}

|| simpleMatchWithNormalizedStrings(pattern.substring(firstIndex), str.substring(firstIndex)));
}
/**
* Match a String against the given patterns, supporting the following simple
* pattern styles: "xxx*", "*xxx", "*xxx*" and "xxx*yyy" matches (with an
Expand Down

0 comments on commit a0df0fb

Please sign in to comment.