SOLR-11831: Skip second grouping step if group.limit is 1 (aka Las Ve…

…gas patch) Summary: In cases where we do grouping and ask for {{group.limit=1}} only it is possible to skip the second grouping step. In our test datasets it improved speed by around 40%. Essentially, in the first grouping step each shard returns the top K groups based on the highest scoring document in each group. The top K groups from each shard are merged in the federator and in the second step we ask all the shards to return the top documents from each of the top ranking groups. If we only want to return the highest scoring document per group we can return the top document id in the first step, merge results in the federator to retain the top K groups and then skip the second grouping step entirely. QueryComponent: interim 'make it compile (somehow)' change (apache#228) add SearchGroupsContainer (apache#230) factor out SearchGroupsResultTransformer.serializeOneSearchGroup method (Christine) Refactor transformToNative adding deserializeOneSearchGroup increase GroupParams.GROUP_SKIP_DISTRIBUTED_SECOND use (see also 6bdf87) Remove error logging in allowSkipSecondGroupingStep Check that withinGroupSort is a prefix of groupSort SkipSecondStepSearchGroupShardResponseProcessor.addSearchGroupToShards now leaves ShardDoc.fields null factor out TopGroupsShardResponseProcessor.fillResultIds method Add regression test group.main=true and group.format=simple Improve GroupingSpecification validation adding validate() method SkipSecondStepSearchResultResultTransformer.serializeOneSearchGroup tweaks
diegoceccarelli · Dec 4, 2019 · 370b51b · 370b51b
1 parent f71c2c8
commit 370b51b
Show file tree

Hide file tree

Showing 11 changed files with 394 additions and 24 deletions.
diff --git a/lucene/grouping/src/java/org/apache/lucene/search/grouping/FirstPassGroupingCollector.java b/lucene/grouping/src/java/org/apache/lucene/search/grouping/FirstPassGroupingCollector.java
@@ -139,10 +139,18 @@ public Collection<SearchGroup<T>> getTopGroups(int groupOffset) throws IOExcepti
       // System.out.println("  group=" + (group.groupValue == null ? "null" : group.groupValue.toString()));
       SearchGroup<T> searchGroup = new SearchGroup<>();
       searchGroup.groupValue = group.groupValue;
+      // We pass this around so that we can get the corresponding solr id when serializing the search group to send to the federator
+      searchGroup.topDocLuceneId = group.topDoc;
       searchGroup.sortValues = new Object[sortFieldCount];
       for(int sortFieldIDX=0;sortFieldIDX<sortFieldCount;sortFieldIDX++) {
         searchGroup.sortValues[sortFieldIDX] = comparators[sortFieldIDX].value(group.comparatorSlot);
       }
+      // TODO: It should be possible to extend this to handle more than one FieldComparator and other types
+      if (sortFieldCount > 0 && comparators[0] instanceof FieldComparator.RelevanceComparator ){
+        searchGroup.topDocScore = (Float)comparators[0].value(group.comparatorSlot);
+      } else {
+        searchGroup.topDocScore = -1;
+      }
       result.add(searchGroup);
     }
     //System.out.println("  return " + result.size() + " groups");

diff --git a/lucene/grouping/src/java/org/apache/lucene/search/grouping/SearchGroup.java b/lucene/grouping/src/java/org/apache/lucene/search/grouping/SearchGroup.java
@@ -48,9 +48,25 @@ public class SearchGroup<T> {
    * been passed to {@link FirstPassGroupingCollector#getTopGroups} */
   public Object[] sortValues;
 
+  /** The top doc of this group: we track the Lucene id,
+   * the Solr id and the score of the document */
+  public Object topDocSolrId;
+  public float topDocScore;
+
+  /** The topDocLuceneId will be null at the federator level because it is unique only at the shard level.
+   * It is used by the shard to get the corresponding solr id when serializing the search group to send to the federator
+   */
+  public int topDocLuceneId;
+
   @Override
   public String toString() {
-    return("SearchGroup(groupValue=" + groupValue + " sortValues=" + Arrays.toString(sortValues) + ")");
+    return "SearchGroup{" +
+        "groupValue=" + groupValue +
+        ", sortValues=" + Arrays.toString(sortValues) +
+        ", topDocSolrId=" + topDocSolrId +
+        ", topDocScore=" + topDocScore +
+        ", topDocLuceneId=" + topDocLuceneId +
+        '}';
   }
 
   @Override
@@ -113,6 +129,11 @@ private static class MergedGroup<T> {
     public boolean processed;
     public boolean inQueue;
 
+    /** The top doc of this group:
+     * the Solr id and the score of the document */
+    public float topDocScore;
+    public Object topDocSolrId;
+
     public MergedGroup(T groupValue) {
       this.groupValue = groupValue;
     }
@@ -225,6 +246,8 @@ private void updateNextGroup(int topN, ShardIter<T> shard) {
           // Start a new group:
           //System.out.println("      new");
           mergedGroup = new MergedGroup<>(group.groupValue);
+          mergedGroup.topDocSolrId = group.topDocSolrId;
+          mergedGroup.topDocScore = group.topDocScore;
           mergedGroup.minShardIndex = shard.shardIndex;
           assert group.sortValues != null;
           mergedGroup.topValues = group.sortValues;
@@ -262,6 +285,8 @@ private void updateNextGroup(int topN, ShardIter<T> shard) {
             if (mergedGroup.inQueue) {
               queue.remove(mergedGroup);
             }
+            mergedGroup.topDocScore = group.topDocScore;
+            mergedGroup.topDocSolrId = group.topDocSolrId;
             mergedGroup.topValues = group.sortValues;
             mergedGroup.minShardIndex = shard.shardIndex;
             queue.add(mergedGroup);
@@ -308,6 +333,8 @@ public Collection<SearchGroup<T>> merge(List<Collection<SearchGroup<T>>> shards,
           final SearchGroup<T> newGroup = new SearchGroup<>();
           newGroup.groupValue = group.groupValue;
           newGroup.sortValues = group.topValues;
+          newGroup.topDocSolrId = group.topDocSolrId;
+          newGroup.topDocScore = group.topDocScore;
           newTopGroups.add(newGroup);
           if (newTopGroups.size() == topN) {
             break;

diff --git a/solr/core/src/java/org/apache/solr/handler/component/QueryComponent.java b/solr/core/src/java/org/apache/solr/handler/component/QueryComponent.java
@@ -99,6 +99,7 @@
 import org.apache.solr.search.grouping.distributed.requestfactory.StoredFieldsShardRequestFactory;
 import org.apache.solr.search.grouping.distributed.requestfactory.TopGroupsShardRequestFactory;
 import org.apache.solr.search.grouping.distributed.responseprocessor.SearchGroupShardResponseProcessor;
+import org.apache.solr.search.grouping.distributed.responseprocessor.SkipSecondStepSearchGroupShardResponseProcessor;
 import org.apache.solr.search.grouping.distributed.responseprocessor.StoredFieldsShardResponseProcessor;
 import org.apache.solr.search.grouping.distributed.responseprocessor.TopGroupsShardResponseProcessor;
 import org.apache.solr.search.grouping.distributed.shardresultserializer.SearchGroupsResultTransformer;
@@ -305,17 +306,18 @@ protected void prepareGrouping(ResponseBuilder rb) throws IOException {
     groupingSpec.setNeedScore((rb.getFieldFlags() & SolrIndexSearcher.GET_SCORES) != 0);
     groupingSpec.setTruncateGroups(params.getBool(GroupParams.GROUP_TRUNCATE, false));
 
-    // when group.format=grouped then, validate group.offset
-    // for group.main=true and group.format=simple, start value is used instead of group.offset
-    // and start is already validate above for negative values
-    if (!(groupingSpec.isMain() || groupingSpec.getResponseFormat() == Grouping.Format.simple) &&
-        groupingSpec.getWithinGroupSortSpec().getOffset() < 0) {
-      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "'group.offset' parameter cannot be negative");
+    if (params.getBool(GroupParams.GROUP_SKIP_DISTRIBUTED_SECOND, GroupParams.GROUP_SKIP_DISTRIBUTED_SECOND_DEFAULT)) {
+      // skip second step is enabled
+      groupingSpec.setSkipSecondGroupingStep(true);
+      // check if reranking is enabled
+      if (rb.getRankQuery() != null) {
+        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
+            GroupParams.GROUP_SKIP_DISTRIBUTED_SECOND+" does not support reranking parameter "+CommonParams.RQ);
+      }
     }
+    groupingSpec.validate();
   }
 
-
-
   /**
    * Actually run the query
    */
@@ -547,7 +549,9 @@ protected int groupedDistributedProcess(ResponseBuilder rb) {
     } else if (rb.stage < ResponseBuilder.STAGE_EXECUTE_QUERY) {
       nextStage = ResponseBuilder.STAGE_EXECUTE_QUERY;
     } else if (rb.stage == ResponseBuilder.STAGE_EXECUTE_QUERY) {
-      shardRequestFactory = new TopGroupsShardRequestFactory();
+      if (!rb.getGroupingSpec().isSkipSecondGroupingStep()) {
+        shardRequestFactory = new TopGroupsShardRequestFactory();
+      }
       nextStage = ResponseBuilder.STAGE_GET_FIELDS;
     } else if (rb.stage < ResponseBuilder.STAGE_GET_FIELDS) {
       nextStage = ResponseBuilder.STAGE_GET_FIELDS;
@@ -593,10 +597,18 @@ public void handleResponses(ResponseBuilder rb, ShardRequest sreq) {
     }
   }
 
+  protected SearchGroupShardResponseProcessor newSearchGroupShardResponseProcessor(ResponseBuilder rb) {
+    if (rb.getGroupingSpec().isSkipSecondGroupingStep()) {
+      return new SkipSecondStepSearchGroupShardResponseProcessor();
+    } else {
+      return new SearchGroupShardResponseProcessor();
+    }
+  }
+
   protected void handleGroupedResponses(ResponseBuilder rb, ShardRequest sreq) {
     ShardResponseProcessor responseProcessor = null;
     if ((sreq.purpose & ShardRequest.PURPOSE_GET_TOP_GROUPS) != 0) {
-      responseProcessor = new SearchGroupShardResponseProcessor();
+      responseProcessor = newSearchGroupShardResponseProcessor(rb);
     } else if ((sreq.purpose & ShardRequest.PURPOSE_GET_TOP_IDS) != 0) {
       responseProcessor = new TopGroupsShardResponseProcessor();
     } else if ((sreq.purpose & ShardRequest.PURPOSE_GET_FIELDS) != 0) {
@@ -1287,6 +1299,14 @@ private boolean doProcessSearchByIds(ResponseBuilder rb) throws IOException {
     return true;
   }
 
+  protected SearchGroupsResultTransformer newSearchGroupsResultTransformer(ResponseBuilder rb, SolrIndexSearcher searcher) {
+    if (rb.getGroupingSpec().isSkipSecondGroupingStep()) {
+      return new SearchGroupsResultTransformer.SkipSecondStepSearchResultResultTransformer(searcher);
+    } else {
+      return new SearchGroupsResultTransformer(searcher);
+    }
+  }
+
   private void doProcessGroupedDistributedSearchFirstPhase(ResponseBuilder rb, QueryCommand cmd, QueryResult result) throws IOException {
 
     GroupingSpecification groupingSpec = rb.getGroupingSpec();
@@ -1316,7 +1336,7 @@ private void doProcessGroupedDistributedSearchFirstPhase(ResponseBuilder rb, Que
 
     CommandHandler commandHandler = topsGroupsActionBuilder.build();
     commandHandler.execute();
-    SearchGroupsResultTransformer serializer = new SearchGroupsResultTransformer(searcher);
+    SearchGroupsResultTransformer serializer = newSearchGroupsResultTransformer(rb, searcher);
 
     rsp.add("firstPhase", commandHandler.processResult(result, serializer));
     rsp.add("totalHitCount", commandHandler.getTotalHitCount());

diff --git a/solr/core/src/java/org/apache/solr/search/grouping/GroupingSpecification.java b/solr/core/src/java/org/apache/solr/search/grouping/GroupingSpecification.java
@@ -16,6 +16,11 @@
  */
 package org.apache.solr.search.grouping;
 
+import java.util.Arrays;
+import java.util.Collections;
+import org.apache.lucene.search.SortField;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.params.GroupParams;
 import org.apache.solr.search.Grouping;
 import org.apache.solr.search.SortSpec;
 
@@ -36,6 +41,57 @@ public class GroupingSpecification {
   private Grouping.Format responseFormat;
   private boolean needScore;
   private boolean truncateGroups;
+  /* This is an optimization to skip the second grouping step when groupLimit is 1. The second
+  * grouping step retrieves the top K documents for each group. This is not necessary when only one
+  * document per group is required because in the first step every shard sends back the group score given
+  * by its top document.
+  */
+  private boolean skipSecondGroupingStep;
+
+  /**
+   * Validates the current GropingSpecification.
+   * It will throw a SolrException the grouping specification is not valid, otherwise
+   * it will return without side effects.
+   */
+  public void validate(){
+    if (isSkipSecondGroupingStep()) {
+      validateSkipSecondGroupingStep(withinGroupSortSpec, groupSortSpec);
+    }
+
+    // when group.format=grouped then, validate group.offset
+    // for group.main=true and group.format=simple, start value is used instead of group.offset
+    // and start is already validate above for negative values
+    if (!(main || responseFormat == Grouping.Format.simple) &&
+       withinGroupSortSpec.getOffset() < 0) {
+      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "'group.offset' parameter cannot be negative");
+    }
+  }
+
+  private void validateSkipSecondGroupingStep(final SortSpec withinGroupSpecification, final SortSpec groupSort) {
+    // Only possible if we only want one doc per group
+    final int limit =  withinGroupSpecification.getCount();
+    final int offset = withinGroupSpecification.getOffset();
+    if (limit != 1) {
+      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
+          GroupParams.GROUP_SKIP_DISTRIBUTED_SECOND + " does not support " +
+          GroupParams.GROUP_LIMIT + " != 1 ("+GroupParams.GROUP_LIMIT+" is "+limit+")");
+    }
+    if (offset != 0){
+      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
+          GroupParams.GROUP_SKIP_DISTRIBUTED_SECOND + " does not support " + GroupParams.GROUP_OFFSET + " != 0 (" +
+              GroupParams.GROUP_OFFSET + " is "+offset + ")");
+    }
+
+    final SortField[] withinGroupSortFields = withinGroupSpecification.getSort().getSort();
+    final SortField[] groupSortFields = groupSort.getSort().getSort();
+
+    // Within group sort must be the same as group sort because if we skip second step no sorting within group will be done.
+    // This checks if withinGroupSortFields is a prefix of groupSortFields
+    if (Collections.indexOfSubList(Arrays.asList(groupSortFields), Arrays.asList(withinGroupSortFields)) != 0) {
+      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
+          GroupParams.GROUP_SKIP_DISTRIBUTED_SECOND + " does not allow the given within/global sort group configuration");
+    }
+  }
 
   public String[] getFields() {
     return fields;
@@ -129,4 +185,8 @@ public void setWithinGroupSortSpec(SortSpec withinGroupSortSpec) {
     this.withinGroupSortSpec = withinGroupSortSpec;
   }
 
+  public boolean isSkipSecondGroupingStep() { return skipSecondGroupingStep; }
+
+  public void setSkipSecondGroupingStep(boolean skipSecondGroupingStep) { this.skipSecondGroupingStep = skipSecondGroupingStep; }
+
 }
diff --git a/solr/core/src/java/org/apache/solr/search/grouping/distributed/command/GroupConverter.java b/solr/core/src/java/org/apache/solr/search/grouping/distributed/command/GroupConverter.java
@@ -53,6 +53,8 @@ static Collection<SearchGroup<BytesRef>> fromMutable(SchemaField field, Collecti
     for (SearchGroup<MutableValue> original : values) {
       SearchGroup<BytesRef> converted = new SearchGroup<>();
       converted.sortValues = original.sortValues;
+      converted.topDocLuceneId = original.topDocLuceneId;
+      converted.topDocScore = original.topDocScore;
       if (original.groupValue.exists) {
         BytesRefBuilder binary = new BytesRefBuilder();
         fieldType.readableToIndexed(Utils.OBJECT_TO_STRING.apply(original.groupValue.toObject()), binary);

diff --git a/...ouping/distributed/responseprocessor/SkipSecondStepSearchGroupShardResponseProcessor.java b/...ouping/distributed/responseprocessor/SkipSecondStepSearchGroupShardResponseProcessor.java
@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.search.grouping.distributed.responseprocessor;
+
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.search.Sort;
+import org.apache.lucene.search.TotalHits;
+import org.apache.lucene.search.grouping.GroupDocs;
+import org.apache.lucene.search.grouping.SearchGroup;
+import org.apache.lucene.search.grouping.TopGroups;
+import org.apache.lucene.util.BytesRef;
+import org.apache.solr.handler.component.ResponseBuilder;
+import org.apache.solr.handler.component.ShardDoc;
+import org.apache.solr.handler.component.ShardRequest;
+import org.apache.solr.handler.component.ShardResponse;
+import org.apache.solr.search.SolrIndexSearcher;
+import org.apache.solr.search.grouping.GroupingSpecification;
+import org.apache.solr.search.grouping.distributed.shardresultserializer.SearchGroupsResultTransformer;
+
+public class SkipSecondStepSearchGroupShardResponseProcessor extends SearchGroupShardResponseProcessor {
+
+  @Override
+  protected SearchGroupsResultTransformer newSearchGroupsResultTransformer(SolrIndexSearcher solrIndexSearcher) {
+    return new SearchGroupsResultTransformer.SkipSecondStepSearchResultResultTransformer(solrIndexSearcher);
+  }
+
+  @Override
+  protected SearchGroupsContainer newSearchGroupsContainer(ResponseBuilder rb) {
+    return new SkipSecondStepSearchGroupsContainer(rb.getGroupingSpec().getFields());
+  }
+
+  @Override
+  public void process(ResponseBuilder rb, ShardRequest shardRequest) {
+    super.process(rb, shardRequest);
+    TopGroupsShardResponseProcessor.fillResultIds(rb);
+  }
+
+  protected static class SkipSecondStepSearchGroupsContainer extends SearchGroupsContainer {
+
+    private final Map<Object, String> docIdToShard = new HashMap<>();
+
+    public SkipSecondStepSearchGroupsContainer(String[] fields) {
+      super(fields);
+    }
+
+    @Override
+    public void addSearchGroups(ShardResponse srsp, String field, Collection<SearchGroup<BytesRef>> searchGroups) {
+      super.addSearchGroups(srsp, field, searchGroups);
+      for (SearchGroup<BytesRef> searchGroup : searchGroups) {
+        assert(srsp.getShard() != null);
+        docIdToShard.put(searchGroup.topDocSolrId, srsp.getShard());
+      }
+    }
+
+    @Override
+    public void addMergedSearchGroups(ResponseBuilder rb, String groupField, Collection<SearchGroup<BytesRef>> mergedTopGroups ) {
+      // TODO: add comment or javadoc re: why this method is overridden as a no-op
+    }
+
+    @Override
+    public void addSearchGroupToShards(ResponseBuilder rb, String groupField, Collection<SearchGroup<BytesRef>> mergedTopGroups) {
+      super.addSearchGroupToShards(rb, groupField, mergedTopGroups);
+
+      final GroupingSpecification groupingSpecification = rb.getGroupingSpec();
+      final Sort groupSort = groupingSpecification.getGroupSortSpec().getSort();
+
+      GroupDocs<BytesRef>[] groups = new GroupDocs[mergedTopGroups.size()];
+
+      // This is the max score found in any document on any group
+      float maxScore = 0;
+      int index = 0;
+
+      for (SearchGroup<BytesRef> group : mergedTopGroups) {
+        maxScore = Math.max(maxScore, group.topDocScore);
+        final String shard = docIdToShard.get(group.topDocSolrId);
+        assert(shard != null);
+        final ShardDoc sdoc = new ShardDoc();
+        sdoc.score = group.topDocScore;
+        sdoc.id = group.topDocSolrId;
+        sdoc.shard = shard;
+
+        groups[index++] = new GroupDocs<>(group.topDocScore,
+            group.topDocScore,
+            new TotalHits(1, TotalHits.Relation.EQUAL_TO), /* we don't know the actual number of hits in the group- we set it to 1 as we only keep track of the top doc */
+            new ShardDoc[] { sdoc }, /* only top doc */
+            group.groupValue,
+            group.sortValues);
+      }
+      TopGroups<BytesRef> topMergedGroups = new TopGroups<BytesRef>(groupSort.getSort(),
+          rb.getGroupingSpec().getWithinGroupSortSpec().getSort().getSort(),
+          0, /*Set totalHitCount to 0 as we can't computed it as is */
+          0, /*Set totalGroupedHitCount to 0 as we can't computed it as is*/
+          groups,
+          maxScore);
+      rb.mergedTopGroups.put(groupField, topMergedGroups);
+    }
+  }
+
+}