Speed counting filters/range/date_histogram aggs (#81322)
Lucene has a `Weight#count` method which will return a count of matching
documents if it can do so in constant time. We have similar code in the
`filters` aggregator and it seems silly to maintain two copies of the
same thing. Better yet, Lucene knows how to take the constant time count
path in a few cases that aggs don't and we get to pick those up. Namely,
we can now get a constant time count for `match_all` even when there are
deleted documents or document level security. We can also go the
constant time count path on cached queries now, which is a lovely speed
bump.

Co-authored-by: Adrien Grand <jpountz@gmail.com> (seriously, he really rescued this thing)
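
As a rough illustration of the Lucene API this commit leans on (a minimal sketch, not code from this change; the class and method names are invented for the example):

import java.io.IOException;

import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;

class ConstantTimeCountSketch {
    /**
     * Counts matches in one segment, preferring the constant-time path.
     * Weight#count returns the exact number of matching documents when
     * Lucene can compute it in constant time (for example from index
     * statistics) and -1 when it cannot.
     */
    static int countSegment(IndexSearcher searcher, Query query, LeafReaderContext ctx) throws IOException {
        Weight weight = searcher.createWeight(searcher.rewrite(query), ScoreMode.COMPLETE_NO_SCORES, 1f);
        int count = weight.count(ctx);
        if (count != -1) {
            return count; // constant-time answer, no docs were iterated
        }
        // Otherwise a caller would fall back to iterating matches with a
        // BulkScorer and a collector; that fallback is elided here.
        throw new UnsupportedOperationException("fallback elided in this sketch");
    }
}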
nik9000 committed Jun 8, 2022
1 parent 8a68649 commit 51d1a33
Showing 13 changed files with 259 additions and 221 deletions.
docs/changelog/81322.yaml (5 additions, 0 deletions)
@@ -0,0 +1,5 @@
+pr: 81322
+summary: Speed counting filters/range/date_histogram aggs
+area: Aggregations
+type: enhancement
+issues: []
@@ -698,7 +698,7 @@ public void testFilterByFilter() throws InterruptedException, IOException {
"filters",
matchesList().item(
matchesMap().entry("query", "*:*")
.entry("results_from_metadata", 0)
.entry("segments_counted_in_constant_time", 0)
.entry("specialized_for", "match_all")
)
)
@@ -141,6 +141,12 @@ public Explanation explain(LeafReaderContext context, int doc) throws IOException {
     return in.explain(context, doc);
 }

+@Override
+public int count(LeafReaderContext context) throws IOException {
+    shardKeyMap.add(context.reader());
+    return in.count(context);
+}
+
 @Override
 public Scorer scorer(LeafReaderContext context) throws IOException {
     shardKeyMap.add(context.reader());

This file was deleted.

@@ -12,7 +12,6 @@
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.MatchAllDocsQuery;
 import org.apache.lucene.search.Query;
-import org.apache.lucene.util.Bits;

 import java.io.IOException;
 import java.util.function.BiConsumer;
@@ -22,8 +21,6 @@
  * Filter that matches every document.
  */
 class MatchAllQueryToFilterAdapter extends QueryToFilterAdapter<MatchAllDocsQuery> {
-    private int resultsFromMetadata;
-
     MatchAllQueryToFilterAdapter(IndexSearcher searcher, String key, MatchAllDocsQuery query) {
         super(searcher, key, query);
     }
@@ -38,19 +35,9 @@ IntPredicate matchingDocIds(LeafReaderContext ctx) throws IOException {
     return l -> true;
 }

-@Override
-long count(LeafReaderContext ctx, FiltersAggregator.Counter counter, Bits live) throws IOException {
-    if (countCanUseMetadata(counter, live)) {
-        resultsFromMetadata++;
-        return ctx.reader().maxDoc(); // TODO we could use numDocs even if live is not null because it provides an accurate count.
-    }
-    return super.count(ctx, counter, live);
-}
-
 @Override
 void collectDebugInfo(BiConsumer<String, Object> add) {
     super.collectDebugInfo(add);
     add.accept("specialized_for", "match_all");
-    add.accept("results_from_metadata", resultsFromMetadata);
 }
 }
@@ -12,7 +12,6 @@
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.MatchNoDocsQuery;
 import org.apache.lucene.search.Query;
-import org.apache.lucene.util.Bits;

 import java.io.IOException;
 import java.util.function.BiConsumer;
@@ -36,11 +35,6 @@ IntPredicate matchingDocIds(LeafReaderContext ctx) throws IOException {
     return l -> false;
 }

-@Override
-long count(LeafReaderContext ctx, FiltersAggregator.Counter counter, Bits live) throws IOException {
-    return 0;
-}
-
 @Override
 void collectDebugInfo(BiConsumer<String, Object> add) {
     super.collectDebugInfo(add);
@@ -103,6 +103,19 @@ public boolean isCacheable(LeafReaderContext ctx) {
     return true;
 }

+@Override
+public int count(LeafReaderContext context) throws IOException {
+    PointValues points = context.reader().getPointValues(field);
+    if (points == null) {
+        return 0;
+    }
+    if (points.size() == points.getDocCount()) {
+        // Each doc that has points has exactly one point.
+        return singleValuedSegmentWeight().count(context);
+    }
+    return multiValuedSegmentWeight().count(context);
+}
+
 @Override
 public Scorer scorer(LeafReaderContext context) throws IOException {
     ScorerSupplier scorerSupplier = scorerSupplier(context);
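
(Aside, not part of the diff: PointValues#size is the total number of indexed points in the segment, while PointValues#getDocCount is the number of documents with at least one point. The two are equal exactly when no document contributes more than one point for the field, which is what makes the cheaper single-valued weight, and its count, safe to use here.)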
@@ -8,14 +8,12 @@

 package org.elasticsearch.search.aggregations.bucket.filter;

-import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.sandbox.search.IndexSortSortedNumericDocValuesRangeQuery;
 import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.BulkScorer;
 import org.apache.lucene.search.ConstantScoreQuery;
-import org.apache.lucene.search.DocValuesFieldExistsQuery;
 import org.apache.lucene.search.IndexOrDocValuesQuery;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.LeafCollector;
@@ -24,7 +22,6 @@
 import org.apache.lucene.search.PointRangeQuery;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.ScoreMode;
-import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.Weight;
 import org.apache.lucene.util.Bits;
 import org.elasticsearch.common.io.stream.StreamOutput;
@@ -60,12 +57,6 @@ public static QueryToFilterAdapter<?> build(IndexSearcher searcher, String key,
      */
     query = ((ConstantScoreQuery) query).getQuery();
 }
-if (query instanceof TermQuery) {
-    return new TermQueryToFilterAdapter(searcher, key, (TermQuery) query);
-}
-if (query instanceof DocValuesFieldExistsQuery) {
-    return new DocValuesFieldExistsAdapter(searcher, key, (DocValuesFieldExistsQuery) query);
-}
 if (query instanceof MatchAllDocsQuery) {
     return new MatchAllQueryToFilterAdapter(searcher, key, (MatchAllDocsQuery) query);
 }
@@ -83,6 +74,7 @@ public static QueryToFilterAdapter<?> build(IndexSearcher searcher, String key,
  * {@link #weight()} to build it when needed.
  */
 private Weight weight;
+protected int segmentsCountedInConstantTime;

 QueryToFilterAdapter(IndexSearcher searcher, String key, Q query) {
     this.searcher = searcher;
@@ -124,31 +116,6 @@ protected final IndexSearcher searcher() {
     return searcher;
 }

-/**
- * Would using index metadata like {@link IndexReader#docFreq}
- * or {@link IndexReader#maxDoc} to count the number of matching documents
- * produce the same answer as collecting the results with a sequence like
- * {@code searcher.collect(counter); return counter.readAndReset();}?
- */
-protected static boolean countCanUseMetadata(FiltersAggregator.Counter counter, Bits live) {
-    if (live != null) {
-        /*
-         * We can only use metadata if all of the documents in the reader
-         * are visible. This is signalled by a null `live` bits. The
-         * name `live` is traditional because most of the time a non-null
-         * `live` bits means that there are deleted documents. But `live`
-         * might also be non-null if document level security is enabled.
-         */
-        return false;
-    }
-    /*
-     * We can only use metadata if we're not using the special docCount
-     * field. Otherwise we wouldn't know how many documents each Lucene
-     * document represents.
-     */
-    return counter.docCount.alwaysOne();
-}
-
 /**
  * Make a filter that matches this filter and the provided query.
  * <p>
@@ -223,6 +190,24 @@ IntPredicate matchingDocIds(LeafReaderContext ctx) throws IOException {
  * Count the number of documents that match this filter in a leaf.
  */
 long count(LeafReaderContext ctx, FiltersAggregator.Counter counter, Bits live) throws IOException {
+    /*
+     * weight().count will return the count of matches for ctx if it can
+     * do so in constant time, otherwise -1. The Weight is responsible for
+     * all of the cases where it can't return an accurate count *except*
+     * the doc_count field. That thing is ours, not Lucene's.
+     *
+     * For example, TermQuery will return -1 if there are deleted docs,
+     * otherwise it'll return the number of documents with the term from
+     * the term statistics. MatchAllDocsQuery will call `ctx.reader().numDocs()`
+     * to get the number of live docs.
+     */
+    if (counter.docCount.alwaysOne()) {
+        int count = weight().count(ctx);
+        if (count != -1) {
+            segmentsCountedInConstantTime++;
+            return count;
+        }
+    }
     BulkScorer scorer = weight().bulkScorer(ctx);
     if (scorer == null) {
         // No hits in this segment.
@@ -257,6 +242,7 @@ void collect(LeafReaderContext ctx, LeafCollector collector, Bits live) throws IOException {
  */
 void collectDebugInfo(BiConsumer<String, Object> add) {
     add.accept("query", query.toString());
+    add.accept("segments_counted_in_constant_time", segmentsCountedInConstantTime);
 }

 private Weight weight() throws IOException {

This file was deleted.
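
Worth spelling out, since it is the one case the new code keeps away from Lucene: with the _doc_count field a single Lucene document can represent many pre-aggregated documents, so a constant-time count of Lucene documents would under-count. A sketch of the idea (illustration only; matchingDocIds and docCounts are invented names, not the commit's API):

import java.util.Map;

class DocCountSketch {
    /**
     * Why the constant-time shortcut is guarded by counter.docCount.alwaysOne():
     * Weight#count counts Lucene documents, but with _doc_count each match may
     * stand in for several documents, so the matches must be walked and summed.
     */
    static long countWithDocCount(int[] matchingDocIds, Map<Integer, Long> docCounts) {
        long total = 0;
        for (int docId : matchingDocIds) {
            // Documents without an explicit _doc_count represent one document.
            total += docCounts.getOrDefault(docId, 1L);
        }
        return total;
    }
}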
