Skip to content

Commit

Permalink
SOLR-11831: Skip second grouping step if group.limit is 1 (aka Las Ve…
Browse files Browse the repository at this point in the history
…gas patch)

Summary:
In cases where we do grouping and ask for  {{group.limit=1}} only it is possible to skip the second grouping step. In our test datasets it improved speed by around 40%.

Essentially, in the first grouping step each shard returns the top K groups based on the highest scoring document in each group. The top K groups from each shard are merged in the federator and in the second step we ask all the shards to return the top documents from each of the top ranking groups.

If we only want to return the highest scoring document per group we can return the top document id in the first step, merge results in the federator to retain the top K groups and then skip the second grouping step entirely.

QueryComponent: interim 'make it compile (somehow)' change (apache#228)

add SearchGroupsContainer (apache#230)

factor out SearchGroupsResultTransformer.serializeOneSearchGroup method (Christine)

Refactor transformToNative adding deserializeOneSearchGroup

 increase GroupParams.GROUP_SKIP_DISTRIBUTED_SECOND use (see also 6bdf87)

Remove error logging in allowSkipSecondGroupingStep

Check that withinGroupSort is a prefix of groupSort

SkipSecondStepSearchGroupShardResponseProcessor.addSearchGroupToShards now leaves ShardDoc.fields null

factor out TopGroupsShardResponseProcessor.fillResultIds method

Add regression test group.main=true and group.format=simple

Improve GroupingSpecification validation adding validate() method

SkipSecondStepSearchResultResultTransformer.serializeOneSearchGroup tweaks
  • Loading branch information
Malvina Josephidou authored and diegoceccarelli committed Dec 4, 2019
1 parent f71c2c8 commit 370b51b
Show file tree
Hide file tree
Showing 11 changed files with 394 additions and 24 deletions.
Expand Up @@ -139,10 +139,18 @@ public Collection<SearchGroup<T>> getTopGroups(int groupOffset) throws IOExcepti
// System.out.println(" group=" + (group.groupValue == null ? "null" : group.groupValue.toString()));
SearchGroup<T> searchGroup = new SearchGroup<>();
searchGroup.groupValue = group.groupValue;
// We pass this around so that we can get the corresponding solr id when serializing the search group to send to the federator
searchGroup.topDocLuceneId = group.topDoc;
searchGroup.sortValues = new Object[sortFieldCount];
for(int sortFieldIDX=0;sortFieldIDX<sortFieldCount;sortFieldIDX++) {
searchGroup.sortValues[sortFieldIDX] = comparators[sortFieldIDX].value(group.comparatorSlot);
}
// TODO: It should be possible to extend this to handle more than one FieldComparator and other types
if (sortFieldCount > 0 && comparators[0] instanceof FieldComparator.RelevanceComparator ){
searchGroup.topDocScore = (Float)comparators[0].value(group.comparatorSlot);
} else {
searchGroup.topDocScore = -1;
}
result.add(searchGroup);
}
//System.out.println(" return " + result.size() + " groups");
Expand Down
Expand Up @@ -48,9 +48,25 @@ public class SearchGroup<T> {
* been passed to {@link FirstPassGroupingCollector#getTopGroups} */
public Object[] sortValues;

/** The top doc of this group: we track the Lucene id,
* the Solr id and the score of the document */
public Object topDocSolrId;
public float topDocScore;

/** The topDocLuceneId will be null at the federator level because it is unique only at the shard level.
* It is used by the shard to get the corresponding solr id when serializing the search group to send to the federator
*/
public int topDocLuceneId;

@Override
public String toString() {
return("SearchGroup(groupValue=" + groupValue + " sortValues=" + Arrays.toString(sortValues) + ")");
return "SearchGroup{" +
"groupValue=" + groupValue +
", sortValues=" + Arrays.toString(sortValues) +
", topDocSolrId=" + topDocSolrId +
", topDocScore=" + topDocScore +
", topDocLuceneId=" + topDocLuceneId +
'}';
}

@Override
Expand Down Expand Up @@ -113,6 +129,11 @@ private static class MergedGroup<T> {
public boolean processed;
public boolean inQueue;

/** The top doc of this group:
* the Solr id and the score of the document */
public float topDocScore;
public Object topDocSolrId;

public MergedGroup(T groupValue) {
this.groupValue = groupValue;
}
Expand Down Expand Up @@ -225,6 +246,8 @@ private void updateNextGroup(int topN, ShardIter<T> shard) {
// Start a new group:
//System.out.println(" new");
mergedGroup = new MergedGroup<>(group.groupValue);
mergedGroup.topDocSolrId = group.topDocSolrId;
mergedGroup.topDocScore = group.topDocScore;
mergedGroup.minShardIndex = shard.shardIndex;
assert group.sortValues != null;
mergedGroup.topValues = group.sortValues;
Expand Down Expand Up @@ -262,6 +285,8 @@ private void updateNextGroup(int topN, ShardIter<T> shard) {
if (mergedGroup.inQueue) {
queue.remove(mergedGroup);
}
mergedGroup.topDocScore = group.topDocScore;
mergedGroup.topDocSolrId = group.topDocSolrId;
mergedGroup.topValues = group.sortValues;
mergedGroup.minShardIndex = shard.shardIndex;
queue.add(mergedGroup);
Expand Down Expand Up @@ -308,6 +333,8 @@ public Collection<SearchGroup<T>> merge(List<Collection<SearchGroup<T>>> shards,
final SearchGroup<T> newGroup = new SearchGroup<>();
newGroup.groupValue = group.groupValue;
newGroup.sortValues = group.topValues;
newGroup.topDocSolrId = group.topDocSolrId;
newGroup.topDocScore = group.topDocScore;
newTopGroups.add(newGroup);
if (newTopGroups.size() == topN) {
break;
Expand Down
Expand Up @@ -99,6 +99,7 @@
import org.apache.solr.search.grouping.distributed.requestfactory.StoredFieldsShardRequestFactory;
import org.apache.solr.search.grouping.distributed.requestfactory.TopGroupsShardRequestFactory;
import org.apache.solr.search.grouping.distributed.responseprocessor.SearchGroupShardResponseProcessor;
import org.apache.solr.search.grouping.distributed.responseprocessor.SkipSecondStepSearchGroupShardResponseProcessor;
import org.apache.solr.search.grouping.distributed.responseprocessor.StoredFieldsShardResponseProcessor;
import org.apache.solr.search.grouping.distributed.responseprocessor.TopGroupsShardResponseProcessor;
import org.apache.solr.search.grouping.distributed.shardresultserializer.SearchGroupsResultTransformer;
Expand Down Expand Up @@ -305,17 +306,18 @@ protected void prepareGrouping(ResponseBuilder rb) throws IOException {
groupingSpec.setNeedScore((rb.getFieldFlags() & SolrIndexSearcher.GET_SCORES) != 0);
groupingSpec.setTruncateGroups(params.getBool(GroupParams.GROUP_TRUNCATE, false));

// when group.format=grouped then, validate group.offset
// for group.main=true and group.format=simple, start value is used instead of group.offset
// and start is already validate above for negative values
if (!(groupingSpec.isMain() || groupingSpec.getResponseFormat() == Grouping.Format.simple) &&
groupingSpec.getWithinGroupSortSpec().getOffset() < 0) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "'group.offset' parameter cannot be negative");
if (params.getBool(GroupParams.GROUP_SKIP_DISTRIBUTED_SECOND, GroupParams.GROUP_SKIP_DISTRIBUTED_SECOND_DEFAULT)) {
// skip second step is enabled
groupingSpec.setSkipSecondGroupingStep(true);
// check if reranking is enabled
if (rb.getRankQuery() != null) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
GroupParams.GROUP_SKIP_DISTRIBUTED_SECOND+" does not support reranking parameter "+CommonParams.RQ);
}
}
groupingSpec.validate();
}



/**
* Actually run the query
*/
Expand Down Expand Up @@ -547,7 +549,9 @@ protected int groupedDistributedProcess(ResponseBuilder rb) {
} else if (rb.stage < ResponseBuilder.STAGE_EXECUTE_QUERY) {
nextStage = ResponseBuilder.STAGE_EXECUTE_QUERY;
} else if (rb.stage == ResponseBuilder.STAGE_EXECUTE_QUERY) {
shardRequestFactory = new TopGroupsShardRequestFactory();
if (!rb.getGroupingSpec().isSkipSecondGroupingStep()) {
shardRequestFactory = new TopGroupsShardRequestFactory();
}
nextStage = ResponseBuilder.STAGE_GET_FIELDS;
} else if (rb.stage < ResponseBuilder.STAGE_GET_FIELDS) {
nextStage = ResponseBuilder.STAGE_GET_FIELDS;
Expand Down Expand Up @@ -593,10 +597,18 @@ public void handleResponses(ResponseBuilder rb, ShardRequest sreq) {
}
}

protected SearchGroupShardResponseProcessor newSearchGroupShardResponseProcessor(ResponseBuilder rb) {
if (rb.getGroupingSpec().isSkipSecondGroupingStep()) {
return new SkipSecondStepSearchGroupShardResponseProcessor();
} else {
return new SearchGroupShardResponseProcessor();
}
}

protected void handleGroupedResponses(ResponseBuilder rb, ShardRequest sreq) {
ShardResponseProcessor responseProcessor = null;
if ((sreq.purpose & ShardRequest.PURPOSE_GET_TOP_GROUPS) != 0) {
responseProcessor = new SearchGroupShardResponseProcessor();
responseProcessor = newSearchGroupShardResponseProcessor(rb);
} else if ((sreq.purpose & ShardRequest.PURPOSE_GET_TOP_IDS) != 0) {
responseProcessor = new TopGroupsShardResponseProcessor();
} else if ((sreq.purpose & ShardRequest.PURPOSE_GET_FIELDS) != 0) {
Expand Down Expand Up @@ -1287,6 +1299,14 @@ private boolean doProcessSearchByIds(ResponseBuilder rb) throws IOException {
return true;
}

protected SearchGroupsResultTransformer newSearchGroupsResultTransformer(ResponseBuilder rb, SolrIndexSearcher searcher) {
if (rb.getGroupingSpec().isSkipSecondGroupingStep()) {
return new SearchGroupsResultTransformer.SkipSecondStepSearchResultResultTransformer(searcher);
} else {
return new SearchGroupsResultTransformer(searcher);
}
}

private void doProcessGroupedDistributedSearchFirstPhase(ResponseBuilder rb, QueryCommand cmd, QueryResult result) throws IOException {

GroupingSpecification groupingSpec = rb.getGroupingSpec();
Expand Down Expand Up @@ -1316,7 +1336,7 @@ private void doProcessGroupedDistributedSearchFirstPhase(ResponseBuilder rb, Que

CommandHandler commandHandler = topsGroupsActionBuilder.build();
commandHandler.execute();
SearchGroupsResultTransformer serializer = new SearchGroupsResultTransformer(searcher);
SearchGroupsResultTransformer serializer = newSearchGroupsResultTransformer(rb, searcher);

rsp.add("firstPhase", commandHandler.processResult(result, serializer));
rsp.add("totalHitCount", commandHandler.getTotalHitCount());
Expand Down
Expand Up @@ -16,6 +16,11 @@
*/
package org.apache.solr.search.grouping;

import java.util.Arrays;
import java.util.Collections;
import org.apache.lucene.search.SortField;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.GroupParams;
import org.apache.solr.search.Grouping;
import org.apache.solr.search.SortSpec;

Expand All @@ -36,6 +41,57 @@ public class GroupingSpecification {
private Grouping.Format responseFormat;
private boolean needScore;
private boolean truncateGroups;
/* This is an optimization to skip the second grouping step when groupLimit is 1. The second
* grouping step retrieves the top K documents for each group. This is not necessary when only one
* document per group is required because in the first step every shard sends back the group score given
* by its top document.
*/
private boolean skipSecondGroupingStep;

/**
* Validates the current GropingSpecification.
* It will throw a SolrException the grouping specification is not valid, otherwise
* it will return without side effects.
*/
public void validate(){
if (isSkipSecondGroupingStep()) {
validateSkipSecondGroupingStep(withinGroupSortSpec, groupSortSpec);
}

// when group.format=grouped then, validate group.offset
// for group.main=true and group.format=simple, start value is used instead of group.offset
// and start is already validate above for negative values
if (!(main || responseFormat == Grouping.Format.simple) &&
withinGroupSortSpec.getOffset() < 0) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "'group.offset' parameter cannot be negative");
}
}

private void validateSkipSecondGroupingStep(final SortSpec withinGroupSpecification, final SortSpec groupSort) {
// Only possible if we only want one doc per group
final int limit = withinGroupSpecification.getCount();
final int offset = withinGroupSpecification.getOffset();
if (limit != 1) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
GroupParams.GROUP_SKIP_DISTRIBUTED_SECOND + " does not support " +
GroupParams.GROUP_LIMIT + " != 1 ("+GroupParams.GROUP_LIMIT+" is "+limit+")");
}
if (offset != 0){
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
GroupParams.GROUP_SKIP_DISTRIBUTED_SECOND + " does not support " + GroupParams.GROUP_OFFSET + " != 0 (" +
GroupParams.GROUP_OFFSET + " is "+offset + ")");
}

final SortField[] withinGroupSortFields = withinGroupSpecification.getSort().getSort();
final SortField[] groupSortFields = groupSort.getSort().getSort();

// Within group sort must be the same as group sort because if we skip second step no sorting within group will be done.
// This checks if withinGroupSortFields is a prefix of groupSortFields
if (Collections.indexOfSubList(Arrays.asList(groupSortFields), Arrays.asList(withinGroupSortFields)) != 0) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
GroupParams.GROUP_SKIP_DISTRIBUTED_SECOND + " does not allow the given within/global sort group configuration");
}
}

public String[] getFields() {
return fields;
Expand Down Expand Up @@ -129,4 +185,8 @@ public void setWithinGroupSortSpec(SortSpec withinGroupSortSpec) {
this.withinGroupSortSpec = withinGroupSortSpec;
}

public boolean isSkipSecondGroupingStep() { return skipSecondGroupingStep; }

public void setSkipSecondGroupingStep(boolean skipSecondGroupingStep) { this.skipSecondGroupingStep = skipSecondGroupingStep; }

}
Expand Up @@ -53,6 +53,8 @@ static Collection<SearchGroup<BytesRef>> fromMutable(SchemaField field, Collecti
for (SearchGroup<MutableValue> original : values) {
SearchGroup<BytesRef> converted = new SearchGroup<>();
converted.sortValues = original.sortValues;
converted.topDocLuceneId = original.topDocLuceneId;
converted.topDocScore = original.topDocScore;
if (original.groupValue.exists) {
BytesRefBuilder binary = new BytesRefBuilder();
fieldType.readableToIndexed(Utils.OBJECT_TO_STRING.apply(original.groupValue.toObject()), binary);
Expand Down
@@ -0,0 +1,116 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.search.grouping.distributed.responseprocessor;

import java.util.Collection;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TotalHits;
import org.apache.lucene.search.grouping.GroupDocs;
import org.apache.lucene.search.grouping.SearchGroup;
import org.apache.lucene.search.grouping.TopGroups;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.handler.component.ResponseBuilder;
import org.apache.solr.handler.component.ShardDoc;
import org.apache.solr.handler.component.ShardRequest;
import org.apache.solr.handler.component.ShardResponse;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.search.grouping.GroupingSpecification;
import org.apache.solr.search.grouping.distributed.shardresultserializer.SearchGroupsResultTransformer;

public class SkipSecondStepSearchGroupShardResponseProcessor extends SearchGroupShardResponseProcessor {

@Override
protected SearchGroupsResultTransformer newSearchGroupsResultTransformer(SolrIndexSearcher solrIndexSearcher) {
return new SearchGroupsResultTransformer.SkipSecondStepSearchResultResultTransformer(solrIndexSearcher);
}

@Override
protected SearchGroupsContainer newSearchGroupsContainer(ResponseBuilder rb) {
return new SkipSecondStepSearchGroupsContainer(rb.getGroupingSpec().getFields());
}

@Override
public void process(ResponseBuilder rb, ShardRequest shardRequest) {
super.process(rb, shardRequest);
TopGroupsShardResponseProcessor.fillResultIds(rb);
}

protected static class SkipSecondStepSearchGroupsContainer extends SearchGroupsContainer {

private final Map<Object, String> docIdToShard = new HashMap<>();

public SkipSecondStepSearchGroupsContainer(String[] fields) {
super(fields);
}

@Override
public void addSearchGroups(ShardResponse srsp, String field, Collection<SearchGroup<BytesRef>> searchGroups) {
super.addSearchGroups(srsp, field, searchGroups);
for (SearchGroup<BytesRef> searchGroup : searchGroups) {
assert(srsp.getShard() != null);
docIdToShard.put(searchGroup.topDocSolrId, srsp.getShard());
}
}

@Override
public void addMergedSearchGroups(ResponseBuilder rb, String groupField, Collection<SearchGroup<BytesRef>> mergedTopGroups ) {
// TODO: add comment or javadoc re: why this method is overridden as a no-op
}

@Override
public void addSearchGroupToShards(ResponseBuilder rb, String groupField, Collection<SearchGroup<BytesRef>> mergedTopGroups) {
super.addSearchGroupToShards(rb, groupField, mergedTopGroups);

final GroupingSpecification groupingSpecification = rb.getGroupingSpec();
final Sort groupSort = groupingSpecification.getGroupSortSpec().getSort();

GroupDocs<BytesRef>[] groups = new GroupDocs[mergedTopGroups.size()];

// This is the max score found in any document on any group
float maxScore = 0;
int index = 0;

for (SearchGroup<BytesRef> group : mergedTopGroups) {
maxScore = Math.max(maxScore, group.topDocScore);
final String shard = docIdToShard.get(group.topDocSolrId);
assert(shard != null);
final ShardDoc sdoc = new ShardDoc();
sdoc.score = group.topDocScore;
sdoc.id = group.topDocSolrId;
sdoc.shard = shard;

groups[index++] = new GroupDocs<>(group.topDocScore,
group.topDocScore,
new TotalHits(1, TotalHits.Relation.EQUAL_TO), /* we don't know the actual number of hits in the group- we set it to 1 as we only keep track of the top doc */
new ShardDoc[] { sdoc }, /* only top doc */
group.groupValue,
group.sortValues);
}
TopGroups<BytesRef> topMergedGroups = new TopGroups<BytesRef>(groupSort.getSort(),
rb.getGroupingSpec().getWithinGroupSortSpec().getSort().getSort(),
0, /*Set totalHitCount to 0 as we can't computed it as is */
0, /*Set totalGroupedHitCount to 0 as we can't computed it as is*/
groups,
maxScore);
rb.mergedTopGroups.put(groupField, topMergedGroups);
}
}

}

0 comments on commit 370b51b

Please sign in to comment.