Correct cross_fields docs on how analyzer groups are combined. (#69936)

When performing a multi_match in cross_fields mode, we group fields based on their analyzer and create a blended query per group. Our docs claimed that the group scores were combined through a boolean query, but they are actually combined through a dismax that incorporates the tiebreaker parameter. This commit updates the docs and adds a test verifying the behavior.
elastic · Mar 8, 2021 · 92bcf6a · 92bcf6a
1 parent ef49dde
commit 92bcf6a
Show file tree

Hide file tree

Showing 2 changed files with 40 additions and 14 deletions.
diff --git a/docs/reference/query-dsl/multi-match-query.asciidoc b/docs/reference/query-dsl/multi-match-query.asciidoc
@@ -393,8 +393,8 @@ Also, accepts `analyzer`, `boost`, `operator`, `minimum_should_match`,
 
 The `cross_fields` type can only work in term-centric mode on fields that have
 the same analyzer. Fields with the same analyzer are grouped together as in
-the example above.  If there are multiple groups, they are combined with a
-`bool` query.
+the example above.  If there are multiple groups, the query will use the best
+score from any group.
 
 For instance, if we have a `first` and `last` field which have
 the same analyzer, plus a `first.edge` and `last.edge` which
@@ -435,16 +435,16 @@ Having multiple groups is fine, but when combined with `operator` or
 as `most_fields` or `best_fields`.
 
 You can easily rewrite this query yourself as two separate `cross_fields`
-queries combined with a `bool` query, and apply the `minimum_should_match`
+queries combined with a `dis_max` query, and apply the `minimum_should_match`
 parameter to just one of them:
 
 [source,console]
 --------------------------------------------------
 GET /_search
 {
   "query": {
-    "bool": {
-      "should": [
+    "dis_max": {
+      "queries": [
         {
           "multi_match" : {
             "query":      "Will Smith",
@@ -498,19 +498,17 @@ which will be executed as:
 ===== `tie_breaker`
 
 By default, each per-term `blended` query will use the best score returned by
-any field in a group, then these scores are added together to give the final
-score. The `tie_breaker` parameter can change the default behaviour of the
-per-term `blended` queries. It accepts:
+any field in a group. Then when combining scores across groups, the query uses
+the best score from any group. The `tie_breaker` parameter can change the
+behavior for both of these steps:
 
 [horizontal]
 `0.0`::             Take the single best score out of (eg) `first_name:will`
-                    and `last_name:will` (*default* for all `multi_match`
-                    query types except `bool_prefix` and `most_fields`)
+                    and `last_name:will` (default)
 `1.0`::             Add together the scores for (eg) `first_name:will` and
-                    `last_name:will` (*default* for the `bool_prefix` and
-                    `most_fields` `multi_match` query types)
+                    `last_name:will`
 `0.0 < n < 1.0`::   Take the single best score plus +tie_breaker+ multiplied
-                    by each of the scores from other matching fields.
+                    by each of the scores from other matching fields/groups.
 
 [IMPORTANT]
 [[crossfields-fuzziness]]

diff --git a/server/src/test/java/org/elasticsearch/index/search/MultiMatchQueryParserTests.java b/server/src/test/java/org/elasticsearch/index/search/MultiMatchQueryParserTests.java
@@ -77,6 +77,10 @@ public void setup() throws IOException {
                 "                        \"last\": {\n" +
                 "                            \"type\":\"text\",\n" +
                 "                            \"analyzer\":\"standard\"\n" +
+                "                        }," +
+                "                        \"nickname\": {\n" +
+                "                            \"type\":\"text\",\n" +
+                "                            \"analyzer\":\"whitespace\"\n" +
                 "                        }" +
                 "                   }" +
                 "            }\n" +
@@ -288,7 +292,7 @@ public void testMultiMatchCrossFieldsWithSynonyms() throws IOException {
 
     }
 
-    public void testMultiMatchCrossFieldsWithSynonymsPhrase() throws IOException {
+    public void testCrossFieldsWithSynonymsPhrase() throws IOException {
         SearchExecutionContext searchExecutionContext = indexService.newSearchExecutionContext(
             randomInt(20),
             0,
@@ -335,6 +339,30 @@ public void testMultiMatchCrossFieldsWithSynonymsPhrase() throws IOException {
         assertEquals(expected, query);
     }
 
+    public void testCrossFieldsWithAnalyzerGroups() throws IOException { // checks how a cross_fields query over three fields is assembled into one expected query
+        SearchExecutionContext searchExecutionContext = indexService.newSearchExecutionContext(
+            randomInt(20), 0, null, () -> 0L, null, emptyMap());
+
+        Map<String, Float> fieldNames = new HashMap<>(); // field name -> float weight handed to the parser; all three use 1.0f
+        fieldNames.put("name.first", 1.0f);
+        fieldNames.put("name.last", 1.0f);
+        fieldNames.put("name.nickname", 1.0f); // presumably the `nickname` field added to the setup() mapping with the `whitespace` analyzer -- confirm
+
+        MultiMatchQueryParser parser = new MultiMatchQueryParser(searchExecutionContext);
+        parser.setTieBreaker(0.3f); // the same 0.3f is expected in both the blended query and the outer dismax below
+        Query query = parser.parse(MultiMatchQueryBuilder.Type.CROSS_FIELDS, fieldNames, "Robert", null);
+
+        Term[] terms = new Term[]{ // terms expected for the first/last group; note the lower-cased text
+            new Term("name.first", "robert"),
+            new Term("name.last", "robert")};
+        float[] boosts = new float[] {1.0f, 1.0f}; // one entry per term above
+
+        DisjunctionMaxQuery expected = new DisjunctionMaxQuery(Arrays.asList( // outer dismax with two clauses: blended first/last query + nickname term query
+                BlendedTermQuery.dismaxBlendedQuery(terms, boosts, 0.3f),
+                new TermQuery(new Term("name.nickname", "Robert"))), 0.3f); // nickname term keeps the original casing "Robert"
+        assertEquals(expected, query); // the parsed query must equal the hand-built structure exactly
+    }
+
     public void testKeywordSplitQueriesOnWhitespace() throws IOException {
         IndexService indexService = createIndex("test_keyword", Settings.builder()
             .put("index.analysis.normalizer.my_lowercase.type", "custom")