Refactor nested field handling in FieldFetcher (#97683) (#97897)

The current recursive nested field handling implementation in FieldFetcher can be O(n^2) in the number of nested mappings, whether or not a nested field has been requested. For indexes with a very large number of nested fields, this can mean it takes multiple seconds to build a FieldFetcher, making the fetch phase of queries extremely slow, even if no nested fields are actually asked for. This commit reworks the logic so that building nested fetchers is only O(n log n) in the number of nested mappers; additionally, we only pay this cost for nested fields that have been requested.
elastic · Jul 25, 2023 · 72eb609 · 72eb609
1 parent 92403f7
commit 72eb609
Show file tree

Hide file tree

Showing 7 changed files with 569 additions and 223 deletions.
diff --git a/docs/changelog/97683.yaml b/docs/changelog/97683.yaml
@@ -0,0 +1,5 @@
+pr: 97683
+summary: Refactor nested field handling in `FieldFetcher`
+area: Search
+type: enhancement
+issues: []
diff --git a/server/src/main/java/org/elasticsearch/action/search/FieldsOptionSourceAdapter.java b/server/src/main/java/org/elasticsearch/action/search/FieldsOptionSourceAdapter.java
@@ -7,15 +7,14 @@
  */
 package org.elasticsearch.action.search;
 
-import org.apache.lucene.util.automaton.CharacterRunAutomaton;
 import org.elasticsearch.Version;
 import org.elasticsearch.common.document.DocumentField;
-import org.elasticsearch.common.regex.Regex;
 import org.elasticsearch.search.SearchHit;
 import org.elasticsearch.search.builder.SearchSourceBuilder;
 import org.elasticsearch.search.fetch.subphase.FetchSourceContext;
 import org.elasticsearch.search.fetch.subphase.FieldAndFormat;
 import org.elasticsearch.search.fetch.subphase.FieldFetcher;
+import org.elasticsearch.search.fetch.subphase.UnmappedFieldFetcher;
 import org.elasticsearch.search.lookup.SourceLookup;
 
 import java.io.IOException;
@@ -95,22 +94,9 @@ public void adaptResponse(Version connectionVersion, SearchHit[] hits) {
                 Map<String, DocumentField> documentFields = Collections.emptyMap();
                 try {
                     if (fieldFetcher == null) {
-                        CharacterRunAutomaton unmappedFieldsFetchAutomaton = null;
-                        // We separate the "include_unmapped" field patters with wildcards from the rest in order to use less space in the
-                        // lookup automaton
-                        Map<Boolean, List<String>> partitions = originalSource.fetchFields()
-                            .stream()
-                            .map(ff -> ff.field)
-                            .collect(Collectors.partitioningBy((s -> Regex.isSimpleMatchPattern(s))));
-                        List<String> unmappedWildcardPattern = partitions.get(true);
-                        List<String> unmappedConcreteFields = partitions.get(false);
-                        if (unmappedWildcardPattern.isEmpty() == false) {
-                            unmappedFieldsFetchAutomaton = new CharacterRunAutomaton(
-                                Regex.simpleMatchToAutomaton(unmappedWildcardPattern.toArray(new String[unmappedWildcardPattern.size()])),
-                                100000
-                            );
-                        }
-                        fieldFetcher = new FieldFetcher(Collections.emptyMap(), unmappedFieldsFetchAutomaton, unmappedConcreteFields);
+                        List<String> fieldPatterns = originalSource.fetchFields().stream().map(ff -> ff.field).collect(Collectors.toList());
+                        UnmappedFieldFetcher unmappedFieldFetcher = new UnmappedFieldFetcher(Collections.emptySet(), fieldPatterns);
+                        fieldFetcher = new FieldFetcher(Collections.emptyMap(), unmappedFieldFetcher);
 
                     }
                     documentFields = fieldFetcher.fetch(lookup);

diff --git a/server/src/main/java/org/elasticsearch/index/query/SearchExecutionContext.java b/server/src/main/java/org/elasticsearch/index/query/SearchExecutionContext.java
@@ -64,6 +64,7 @@
 import org.elasticsearch.xcontent.XContentParser;
 
 import java.io.IOException;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
@@ -78,6 +79,7 @@
 import java.util.function.LongSupplier;
 import java.util.function.Predicate;
 import java.util.function.Supplier;
+import java.util.stream.Collectors;
 
 import static java.util.Collections.unmodifiableMap;
 
@@ -724,6 +726,37 @@ public String getNestedParent(String nestedPath) {
         return mappingLookup.getNestedParent(nestedPath);
     }
 
+    /**
+     * Returns the names of the nested mappers that sit directly below the given path,
+     * i.e. nested mappers whose name starts with {@code path + "."} and that are not
+     * themselves below another nested mapper under that path.
+     *
+     * The empty string is treated as the root scope, for which every nested mapper is
+     * a candidate.
+     *
+     * @param path the name of the parent nested scope, or the empty string for the root
+     * @return the immediate nested children of {@code path} in natural (sorted) order;
+     *         empty if there are none
+     */
+    public List<String> getImmediateChildMappers(String path) {
+        List<String> nestedPathNames = mappingLookup.getNestedMappers()
+            .stream()
+            .map(ObjectMapper::name)
+            .sorted()
+            .collect(Collectors.toList());
+        int parentPos = Collections.binarySearch(nestedPathNames, path);
+        // parentPos < -1: path is not in the list and does not sort before every entry
+        // parentPos >= size - 1: path is the last entry, so nothing can follow it
+        if (parentPos < -1 || parentPos >= nestedPathNames.size() - 1) {
+            return Collections.emptyList();
+        }
+        String prefix = "".equals(path) ? "" : path + ".";
+        List<String> childMappers = new ArrayList<>();
+        for (int i = parentPos + 1; i < nestedPathNames.size(); i++) {
+            String candidate = nestedPathNames.get(i);
+            if (candidate.startsWith(prefix) == false) {
+                if (candidate.compareTo(prefix) < 0) {
+                    // Siblings such as "a-b" sort after "a" but before "a.", so they can
+                    // appear ahead of the real descendants; step over them
+                    continue;
+                }
+                // Sorted past every name carrying the prefix, nothing more to find
+                break;
+            }
+            // A candidate is only an immediate child if none of its dotted ancestors below
+            // the prefix has already been accepted. A parent always sorts before its
+            // descendants, so it is guaranteed to have been visited first, and childMappers
+            // is filled in sorted order so it can be binary-searched.
+            boolean belowKnownChild = false;
+            for (int dot = candidate.indexOf('.', prefix.length()); dot >= 0; dot = candidate.indexOf('.', dot + 1)) {
+                if (Collections.binarySearch(childMappers, candidate.substring(0, dot)) >= 0) {
+                    belowKnownChild = true;
+                    break;
+                }
+            }
+            if (belowKnownChild == false) {
+                childMappers.add(candidate);
+            }
+        }
+        return childMappers;
+    }
+
     /**
      * Creates a new {@link NestedDocuments} instance from this context's mapping lookup,
      * the version the index was created with, and the bitset filter cache's
      * bitset-producer function. A fresh instance is built on every call.
      */
     public NestedDocuments getNestedDocuments() {
         return new NestedDocuments(mappingLookup, indexVersionCreated(), bitsetFilterCache::getBitSetProducer);
     }

diff --git a/server/src/main/java/org/elasticsearch/search/NestedUtils.java b/server/src/main/java/org/elasticsearch/search/NestedUtils.java
@@ -0,0 +1,132 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0 and the Server Side Public License, v 1; you may not use this file except
+ * in compliance with, at your election, the Elastic License 2.0 or the Server
+ * Side Public License, v 1.
+ */
+
+package org.elasticsearch.search;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.function.Function;
+
+/**
+ * Utility methods for dealing with nested mappers
+ */
+public final class NestedUtils {
+
+    private NestedUtils() {}
+
+    /**
+     * Partition a set of input objects by the children of a specific nested scope
+     *
+     * The returned map will contain an entry for the scope itself and for all children,
+     * even if some of them receive no inputs. An input is assigned to a child when one
+     * of its dotted path prefixes is that child's name; every other input (including one
+     * whose path is exactly a child's name) is assigned to the scope.
+     *
+     * All children, and all input paths, must begin with the scope.  Inputs keep their
+     * relative order within each partition; sorted children and inputs are accepted
+     * but the result does not depend on the ordering.
+     *
+     * @param scope         the nested scope to base partitions on
+     * @param children      the immediate children of the nested scope
+     * @param inputs        a set of inputs to partition
+     * @param pathFunction  a function to retrieve a path for each input
+     * @param <T>           the type of the inputs
+     * @return              a map of nested paths to lists of inputs; if there are no
+     *                      children this is a single-entry map holding {@code inputs} itself
+     */
+    public static <T> Map<String, List<T>> partitionByChildren(
+        String scope,
+        List<String> children,
+        List<T> inputs,
+        Function<T, String> pathFunction
+    ) {
+        // No immediate nested children, so we can shortcut and just return all inputs
+        // under the current scope
+        if (children.isEmpty()) {
+            return Collections.singletonMap(scope, inputs);
+        }
+
+        // Set up the output map, with one entry for the current scope and one for each
+        // of its children
+        Map<String, List<T>> output = new HashMap<>();
+        List<T> scopeInputs = new ArrayList<>();
+        output.put(scope, scopeInputs);
+        for (String child : children) {
+            output.put(child, new ArrayList<>());
+        }
+
+        // Only dots after the scope's own "scope." prefix can terminate a child name
+        int searchFrom = scope.isEmpty() ? 0 : scope.length() + 1;
+
+        Iterator<T> inputIterator = inputs.iterator();
+        while (inputIterator.hasNext()) {
+            T currentInput = inputIterator.next();
+            String currentInputName = pathFunction.apply(currentInput);
+            assert currentInputName.startsWith(scope);
+
+            // Walk the dotted prefixes of the path from shortest to longest and look each one
+            // up among the children. Immediate children cannot be prefixes of one another, so
+            // the first hit is the owning child. Looking up by prefix rather than merging two
+            // sorted lists keeps the result correct when several scope-level inputs sit between
+            // two children, and when names contain characters that sort before '.'.
+            List<T> target = scopeInputs;
+            for (int dot = currentInputName.indexOf('.', searchFrom); dot >= 0; dot = currentInputName.indexOf('.', dot + 1)) {
+                List<T> childInputs = output.get(currentInputName.substring(0, dot));
+                if (childInputs != null) {
+                    target = childInputs;
+                    break;
+                }
+            }
+            target.add(currentInput);
+        }
+        return output;
+    }
+
+}