Search - add case insensitive flag for "term" family of queries #61596 (#62661)

Backport of fe9145f. Closes #61546.
elastic · Sep 22, 2020 · a0df0fb · a0df0fb
1 parent 0d5250c
commit a0df0fb
Show file tree

Hide file tree

Showing 47 changed files with 883 additions and 137 deletions.
diff --git a/docs/reference/query-dsl/prefix-query.asciidoc b/docs/reference/query-dsl/prefix-query.asciidoc
@@ -41,6 +41,10 @@ provided `<field>`.
 (Optional, string) Method used to rewrite the query. For valid values and more
 information, see the <<query-dsl-multi-term-rewrite, `rewrite` parameter>>.
 
+`case_insensitive`::
+(Optional, boolean) Allows ASCII case insensitive matching of the
+value with the indexed field values when set to `true`. Setting to `false` is disallowed.
+
 [[prefix-query-notes]]
 ==== Notes
 

diff --git a/docs/reference/query-dsl/term-query.asciidoc b/docs/reference/query-dsl/term-query.asciidoc
@@ -62,6 +62,10 @@ Boost values are relative to the default value of `1.0`. A boost value between
 `0` and `1.0` decreases the relevance score. A value greater than `1.0`
 increases the relevance score.
 
+`case_insensitive`::
+(Optional, boolean) Allows ASCII case insensitive matching of the
+value with the indexed field values when set to `true`. Setting to `false` is disallowed.
+
 [[term-query-notes]]
 ==== Notes
 
@@ -84,7 +88,7 @@ The `term` query does *not* analyze the search term. The `term` query only
 searches for the *exact* term you provide. This means the `term` query may
 return poor or no results when searching `text` fields.
 
-To see the difference in search results, try the following example.  
+To see the difference in search results, try the following example.
 
 . Create an index with a `text` field called `full_text`.
 +
@@ -214,4 +218,4 @@ in the results.
 }
 ----
 // TESTRESPONSE[s/"took" : 1/"took" : $body.took/]
---
+--
diff --git a/docs/reference/query-dsl/wildcard-query.asciidoc b/docs/reference/query-dsl/wildcard-query.asciidoc
@@ -52,7 +52,7 @@ This parameter supports two wildcard operators:
 
 WARNING: Avoid beginning patterns with `*` or `?`. This can increase
 the iterations needed to find matching terms and slow search performance.
--- 
+--
 
 `boost`::
 (Optional, float) Floating point number used to decrease or increase the
@@ -69,6 +69,10 @@ increases the relevance score.
 (Optional, string) Method used to rewrite the query. For valid values and more information, see the
 <<query-dsl-multi-term-rewrite, `rewrite` parameter>>.
 
+`case_insensitive`::
+(Optional, boolean) Allows case insensitive matching of the
+pattern with the indexed field values when set to `true`. Setting to `false` is disallowed.
+
 [[wildcard-query-notes]]
 ==== Notes
 ===== Allow expensive queries

diff --git a/...apper-extras/src/main/java/org/elasticsearch/index/mapper/SearchAsYouTypeFieldMapper.java b/...apper-extras/src/main/java/org/elasticsearch/index/mapper/SearchAsYouTypeFieldMapper.java
@@ -281,11 +281,11 @@ public Query existsQuery(QueryShardContext context) {
         }
 
         @Override
-        public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, QueryShardContext context) {
+        public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, boolean caseInsensitive, QueryShardContext context) {
             if (prefixField == null || prefixField.termLengthWithinBounds(value.length()) == false) {
-                return super.prefixQuery(value, method, context);
+                return super.prefixQuery(value, method, caseInsensitive, context);
             } else {
-                final Query query = prefixField.prefixQuery(value, method, context);
+                final Query query = prefixField.prefixQuery(value, method, caseInsensitive, context);
                 if (method == null
                     || method == MultiTermQuery.CONSTANT_SCORE_REWRITE
                     || method == MultiTermQuery.CONSTANT_SCORE_BOOLEAN_REWRITE) {
@@ -365,8 +365,11 @@ boolean termLengthWithinBounds(int length) {
         }
 
         @Override
-        public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, QueryShardContext context) {
+        public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, boolean caseInsensitive, QueryShardContext context) {
             if (value.length() >= minChars) {
+                if(caseInsensitive) {
+                    return super.termQueryCaseInsensitive(value, context);
+                }
                 return super.termQuery(value, context);
             }
             List<Automaton> automata = new ArrayList<>();
@@ -507,11 +510,11 @@ public Query existsQuery(QueryShardContext context) {
         }
 
         @Override
-        public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, QueryShardContext context) {
+        public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, boolean caseInsensitive, QueryShardContext context) {
             if (prefixFieldType == null || prefixFieldType.termLengthWithinBounds(value.length()) == false) {
-                return super.prefixQuery(value, method, context);
+                return super.prefixQuery(value, method, caseInsensitive, context);
             } else {
-                final Query query = prefixFieldType.prefixQuery(value, method, context);
+                final Query query = prefixFieldType.prefixQuery(value, method, caseInsensitive, context);
                 if (method == null
                     || method == MultiTermQuery.CONSTANT_SCORE_REWRITE
                     || method == MultiTermQuery.CONSTANT_SCORE_BOOLEAN_REWRITE) {

diff --git a/...ysis-icu/src/main/java/org/elasticsearch/index/mapper/ICUCollationKeywordFieldMapper.java b/...ysis-icu/src/main/java/org/elasticsearch/index/mapper/ICUCollationKeywordFieldMapper.java
@@ -136,13 +136,15 @@ public Query fuzzyQuery(Object value, Fuzziness fuzziness, int prefixLength, int
         }
 
         @Override
-        public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, QueryShardContext context) {
+        public Query prefixQuery(String value, MultiTermQuery.RewriteMethod method, 
+            boolean caseInsensitive, QueryShardContext context) {
             throw new UnsupportedOperationException("[prefix] queries are not supported on [" + CONTENT_TYPE + "] fields.");
         }
 
         @Override
         public Query wildcardQuery(String value,
                                    @Nullable MultiTermQuery.RewriteMethod method,
+                                   boolean caseInsensitive,
                                    QueryShardContext context) {
             throw new UnsupportedOperationException("[wildcard] queries are not supported on [" + CONTENT_TYPE + "] fields.");
         }

diff --git a/server/src/main/java/org/elasticsearch/common/Strings.java b/server/src/main/java/org/elasticsearch/common/Strings.java
@@ -879,4 +879,18 @@ public static String padStart(String s, int minimumLength, char c) {
             return sb.toString();
         }
     }
+
+    public static String toLowercaseAscii(String in) {
+        StringBuilder out = new StringBuilder();
+        Iterator<Integer> iter = in.codePoints().iterator();
+        while (iter.hasNext()) {
+            int codepoint = iter.next();
+            if (codepoint > 128) {
+                out.appendCodePoint(codepoint);
+            } else {
+                out.appendCodePoint(Character.toLowerCase(codepoint));
+            }
+        }
+        return out.toString();
+    }    
 }
diff --git a/server/src/main/java/org/elasticsearch/common/lucene/search/AutomatonQueries.java b/server/src/main/java/org/elasticsearch/common/lucene/search/AutomatonQueries.java
@@ -0,0 +1,152 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.common.lucene.search;
+
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.AutomatonQuery;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.automaton.Automata;
+import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.MinimizationOperations;
+import org.apache.lucene.util.automaton.Operations;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+/**
+ * Helper functions for creating various forms of {@link AutomatonQuery}
+ */
+public class AutomatonQueries  {
+
+
+
+    /** Build an automaton accepting all terms with the specified prefix, ASCII case insensitive. */
+    public static Automaton caseInsensitivePrefix(String s) {
+        List<Automaton> list = new ArrayList<>();
+        Iterator<Integer> iter = s.codePoints().iterator();
+        while (iter.hasNext()) {
+            list.add(toCaseInsensitiveChar(iter.next(), Integer.MAX_VALUE));
+        }
+        list.add(Automata.makeAnyString());
+
+        Automaton a = Operations.concatenate(list);
+        a = MinimizationOperations.minimize(a, Integer.MAX_VALUE);
+        return a;
+    } 
+
+
+    /** Build an automaton query accepting all terms with the specified prefix, ASCII case insensitive. */
+    public static AutomatonQuery caseInsensitivePrefixQuery(Term prefix) {
+        return new AutomatonQuery(prefix, caseInsensitivePrefix(prefix.text()));
+    }    
+
+    /** Build an automaton query matching the specified term, ASCII case insensitive. */
+    public static AutomatonQuery caseInsensitiveTermQuery(Term term) {
+        BytesRef prefix = term.bytes();
+        return new AutomatonQuery(term, toCaseInsensitiveString(prefix,Integer.MAX_VALUE));
+    }    
+
+
+    /** Build an automaton query matching a wildcard pattern, ASCII case insensitive. */
+    public static AutomatonQuery caseInsensitiveWildcardQuery(Term wildcardquery) {
+        return new AutomatonQuery(wildcardquery, toCaseInsensitiveWildcardAutomaton(wildcardquery,Integer.MAX_VALUE));
+    }    
+
+
+    /** String equality with support for wildcards */
+    public static final char WILDCARD_STRING = '*';
+
+    /** Char equality with support for wildcards */
+    public static final char WILDCARD_CHAR = '?';
+
+    /** Escape character */
+    public static final char WILDCARD_ESCAPE = '\\';    
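+    /**
+     * Convert Lucene wildcard syntax into an automaton that ignores ASCII case.
+     * A character escaped with a backslash is matched literally, in its exact case;
+     * a trailing backslash with nothing after it is treated as an ordinary character.
+     */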
+    @SuppressWarnings("fallthrough")
+    public static Automaton toCaseInsensitiveWildcardAutomaton(Term wildcardquery, int maxDeterminizedStates) {
+      List<Automaton> automata = new ArrayList<>();
+
+      String wildcardText = wildcardquery.text();
+
+      for (int i = 0; i < wildcardText.length();) {
+        final int c = wildcardText.codePointAt(i);
+        int length = Character.charCount(c);
+        switch(c) {
+          case WILDCARD_STRING: 
+            automata.add(Automata.makeAnyString());
+            break;
+          case WILDCARD_CHAR:
+            automata.add(Automata.makeAnyChar());
+            break;
+          case WILDCARD_ESCAPE:
+            // add the next codepoint instead, if it exists
+            if (i + length < wildcardText.length()) {
+              final int nextChar = wildcardText.codePointAt(i + length);
+              length += Character.charCount(nextChar);
+              automata.add(Automata.makeChar(nextChar));
+              break;
+            } // else fallthru, lenient parsing with a trailing \
+          default:
+            automata.add(toCaseInsensitiveChar(c, maxDeterminizedStates));
+        }
+        i += length;
+      }
+
+      return Operations.concatenate(automata);
+    }    
+
+    protected static Automaton toCaseInsensitiveString(BytesRef br, int maxDeterminizedStates) {
+        return toCaseInsensitiveString(br.utf8ToString(), maxDeterminizedStates);
+    }
+
+    public static Automaton toCaseInsensitiveString(String s, int maxDeterminizedStates) {
+        List<Automaton> list = new ArrayList<>();
+        Iterator<Integer> iter = s.codePoints().iterator();
+        while (iter.hasNext()) {
+            list.add(toCaseInsensitiveChar(iter.next(), maxDeterminizedStates));
+        }
+
+        Automaton a = Operations.concatenate(list);
+        a = MinimizationOperations.minimize(a, maxDeterminizedStates);
+        return a;
+
+
+    }
+
+    protected static Automaton toCaseInsensitiveChar(int codepoint, int maxDeterminizedStates) {
+        Automaton case1 = Automata.makeChar(codepoint);
+        // For now we only work with ASCII characters
+        if (codepoint > 128) {
+            return case1;
+        }
+        int altCase = Character.isLowerCase(codepoint) ? Character.toUpperCase(codepoint) : Character.toLowerCase(codepoint);
+        Automaton result;
+        if (altCase != codepoint) {
+            result = Operations.union(case1, Automata.makeChar(altCase));
+            result = MinimizationOperations.minimize(result, maxDeterminizedStates);
+        } else {
+            result = case1;
+        }
+        return result;
+    }
+}
diff --git a/server/src/main/java/org/elasticsearch/common/regex/Regex.java b/server/src/main/java/org/elasticsearch/common/regex/Regex.java
@@ -79,15 +79,39 @@ public static Automaton simpleMatchToAutomaton(String... patterns) {
      * Match a String against the given pattern, supporting the following simple
      * pattern styles: "xxx*", "*xxx", "*xxx*" and "xxx*yyy" matches (with an
      * arbitrary number of pattern parts), as well as direct equality.
+     * Matching is case sensitive.
      *
      * @param pattern the pattern to match against
      * @param str     the String to match
      * @return whether the String matches the given pattern
      */
     public static boolean simpleMatch(String pattern, String str) {
+        return simpleMatch(pattern, str, false);
+    }
+
+
+    /**
+     * Match a String against the given pattern, supporting the following simple
+     * pattern styles: "xxx*", "*xxx", "*xxx*" and "xxx*yyy" matches (with an
+     * arbitrary number of pattern parts), as well as direct equality.
+     *
+     * @param pattern the pattern to match against
+     * @param str     the String to match
+     * @param caseInsensitive  true if ASCII case differences should be ignored
+     * @return whether the String matches the given pattern
+     */
+    public static boolean simpleMatch(String pattern, String str, boolean caseInsensitive) {
         if (pattern == null || str == null) {
             return false;
         }
+        if (caseInsensitive) {
+            pattern = Strings.toLowercaseAscii(pattern);
+            str = Strings.toLowercaseAscii(str);
+        }
+        return simpleMatchWithNormalizedStrings(pattern, str);
+    }
+
+    private static boolean simpleMatchWithNormalizedStrings(String pattern, String str) {
         final int firstIndex = pattern.indexOf('*');
         if (firstIndex == -1) {
             return pattern.equals(str);
@@ -102,12 +126,12 @@ public static boolean simpleMatch(String pattern, String str) {
                 return str.regionMatches(str.length() - pattern.length() + 1, pattern, 1, pattern.length() - 1);
             } else if (nextIndex == 1) {
                 // Double wildcard "**" - skipping the first "*"
-                return simpleMatch(pattern.substring(1), str);
+                return simpleMatchWithNormalizedStrings(pattern.substring(1), str);
             }
             final String part = pattern.substring(1, nextIndex);
             int partIndex = str.indexOf(part);
             while (partIndex != -1) {
-                if (simpleMatch(pattern.substring(nextIndex), str.substring(partIndex + part.length()))) {
+                if (simpleMatchWithNormalizedStrings(pattern.substring(nextIndex), str.substring(partIndex + part.length()))) {
                     return true;
                 }
                 partIndex = str.indexOf(part, partIndex + 1);
@@ -116,9 +140,9 @@ public static boolean simpleMatch(String pattern, String str) {
         }
         return str.regionMatches(0, pattern, 0, firstIndex)
             && (firstIndex == pattern.length() - 1 // only wildcard in pattern is at the end, so no need to look at the rest of the string
-                || simpleMatch(pattern.substring(firstIndex), str.substring(firstIndex)));
-    }
-
+                || simpleMatchWithNormalizedStrings(pattern.substring(firstIndex), str.substring(firstIndex)));
+    }    
+    
     /**
      * Match a String against the given patterns, supporting the following simple
      * pattern styles: "xxx*", "*xxx", "*xxx*" and "xxx*yyy" matches (with an