Speed counting filters/range/date_histogram aggs (#81322)
Lucene has a `Weight#count` method which will return a count of matching
documents if it can do so in constant time. We have similar code in the
`filters` aggregator and it seems silly to maintain two copies of the
same thing. Better yet, Lucene knows how to take the constant time count
path in a few cases that aggs don't and we get to pick those up. Namely,
we can now get a constant time count for `match_all` even when there are
deleted documents or document level security. We can also go the
constant time count path on cached queries now, which is a lovely speed
bump.

Co-authored-by: Adrien Grand <jpountz@gmail.com> (seriously, he really rescued this thing)
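
As a rough illustration of the Lucene API this commit leans on (a minimal sketch, not code from this change; the class and method names are invented for the example):

import java.io.IOException;

import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;

class ConstantTimeCountSketch {
    /**
     * Counts matches in one segment, preferring the constant-time path.
     * Weight#count returns the exact number of matching documents when
     * Lucene can compute it in constant time (for example from index
     * statistics) and -1 when it cannot.
     */
    static int countSegment(IndexSearcher searcher, Query query, LeafReaderContext ctx) throws IOException {
        Weight weight = searcher.createWeight(searcher.rewrite(query), ScoreMode.COMPLETE_NO_SCORES, 1f);
        int count = weight.count(ctx);
        if (count != -1) {
            return count; // constant-time answer, no docs were iterated
        }
        // Otherwise a caller would fall back to iterating matches with a
        // BulkScorer and a collector; that fallback is elided here.
        throw new UnsupportedOperationException("fallback elided in this sketch");
    }
}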
nik9000 committed Jun 8, 2022
1 parent 8a68649 commit 51d1a33
Showing 13 changed files with 259 additions and 221 deletions.
docs/changelog/81322.yaml (5 additions, 0 deletions)
@@ -0,0 +1,5 @@
+pr: 81322
+summary: Speed counting filters/range/date_histogram aggs
+area: Aggregations
+type: enhancement
+issues: []
@@ -698,7 +698,7 @@ public void testFilterByFilter() throws InterruptedException, IOException {
"filters",
matchesList().item(
matchesMap().entry("query", "*:*")
.entry("results_from_metadata", 0)
.entry("segments_counted_in_constant_time", 0)
.entry("specialized_for", "match_all")
)
)
@@ -141,6 +141,12 @@ public Explanation explain(LeafReaderContext context, int doc) throws IOException {
     return in.explain(context, doc);
 }

+@Override
+public int count(LeafReaderContext context) throws IOException {
+    shardKeyMap.add(context.reader());
+    return in.count(context);
+}
+
 @Override
 public Scorer scorer(LeafReaderContext context) throws IOException {
     shardKeyMap.add(context.reader());

This file was deleted.

@@ -12,7 +12,6 @@
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.MatchAllDocsQuery;
 import org.apache.lucene.search.Query;
-import org.apache.lucene.util.Bits;

 import java.io.IOException;
 import java.util.function.BiConsumer;
@@ -22,8 +21,6 @@
  * Filter that matches every document.
  */
 class MatchAllQueryToFilterAdapter extends QueryToFilterAdapter<MatchAllDocsQuery> {
-    private int resultsFromMetadata;
-
     MatchAllQueryToFilterAdapter(IndexSearcher searcher, String key, MatchAllDocsQuery query) {
         super(searcher, key, query);
     }
@@ -38,19 +35,9 @@ IntPredicate matchingDocIds(LeafReaderContext ctx) throws IOException {
     return l -> true;
 }

-@Override
-long count(LeafReaderContext ctx, FiltersAggregator.Counter counter, Bits live) throws IOException {
-    if (countCanUseMetadata(counter, live)) {
-        resultsFromMetadata++;
-        return ctx.reader().maxDoc(); // TODO we could use numDocs even if live is not null because it provides an accurate count.
-    }
-    return super.count(ctx, counter, live);
-}
-
 @Override
 void collectDebugInfo(BiConsumer<String, Object> add) {
     super.collectDebugInfo(add);
     add.accept("specialized_for", "match_all");
-    add.accept("results_from_metadata", resultsFromMetadata);
 }
 }
@@ -12,7 +12,6 @@
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.MatchNoDocsQuery;
 import org.apache.lucene.search.Query;
-import org.apache.lucene.util.Bits;

 import java.io.IOException;
 import java.util.function.BiConsumer;
@@ -36,11 +35,6 @@ IntPredicate matchingDocIds(LeafReaderContext ctx) throws IOException {
     return l -> false;
 }

-@Override
-long count(LeafReaderContext ctx, FiltersAggregator.Counter counter, Bits live) throws IOException {
-    return 0;
-}
-
 @Override
 void collectDebugInfo(BiConsumer<String, Object> add) {
     super.collectDebugInfo(add);
@@ -103,6 +103,19 @@ public boolean isCacheable(LeafReaderContext ctx) {
     return true;
 }

+@Override
+public int count(LeafReaderContext context) throws IOException {
+    PointValues points = context.reader().getPointValues(field);
+    if (points == null) {
+        return 0;
+    }
+    if (points.size() == points.getDocCount()) {
+        // Each doc that has points has exactly one point.
+        return singleValuedSegmentWeight().count(context);
+    }
+    return multiValuedSegmentWeight().count(context);
+}
+
 @Override
 public Scorer scorer(LeafReaderContext context) throws IOException {
     ScorerSupplier scorerSupplier = scorerSupplier(context);
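
(Aside, not part of the diff: PointValues#size is the total number of indexed points in the segment, while PointValues#getDocCount is the number of documents with at least one point. The two are equal exactly when no document contributes more than one point for the field, which is what makes the cheaper single-valued weight, and its count, safe to use here.)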
@@ -8,14 +8,12 @@

 package org.elasticsearch.search.aggregations.bucket.filter;

-import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.sandbox.search.IndexSortSortedNumericDocValuesRangeQuery;
 import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.BulkScorer;
 import org.apache.lucene.search.ConstantScoreQuery;
-import org.apache.lucene.search.DocValuesFieldExistsQuery;
 import org.apache.lucene.search.IndexOrDocValuesQuery;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.LeafCollector;
@@ -24,7 +22,6 @@
 import org.apache.lucene.search.PointRangeQuery;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.ScoreMode;
-import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.Weight;
 import org.apache.lucene.util.Bits;
 import org.elasticsearch.common.io.stream.StreamOutput;
@@ -60,12 +57,6 @@ public static QueryToFilterAdapter<?> build(IndexSearcher searcher, String key,
      */
     query = ((ConstantScoreQuery) query).getQuery();
 }
-if (query instanceof TermQuery) {
-    return new TermQueryToFilterAdapter(searcher, key, (TermQuery) query);
-}
-if (query instanceof DocValuesFieldExistsQuery) {
-    return new DocValuesFieldExistsAdapter(searcher, key, (DocValuesFieldExistsQuery) query);
-}
 if (query instanceof MatchAllDocsQuery) {
     return new MatchAllQueryToFilterAdapter(searcher, key, (MatchAllDocsQuery) query);
 }
@@ -83,6 +74,7 @@ public static QueryToFilterAdapter<?> build(IndexSearcher searcher, String key,
  * {@link #weight()} to build it when needed.
  */
 private Weight weight;
+protected int segmentsCountedInConstantTime;

 QueryToFilterAdapter(IndexSearcher searcher, String key, Q query) {
     this.searcher = searcher;
@@ -124,31 +116,6 @@ protected final IndexSearcher searcher() {
     return searcher;
 }

-/**
- * Would using index metadata like {@link IndexReader#docFreq}
- * or {@link IndexReader#maxDoc} to count the number of matching documents
- * produce the same answer as collecting the results with a sequence like
- * {@code searcher.collect(counter); return counter.readAndReset();}?
- */
-protected static boolean countCanUseMetadata(FiltersAggregator.Counter counter, Bits live) {
-    if (live != null) {
-        /*
-         * We can only use metadata if all of the documents in the reader
-         * are visible. This is signalled by a null `live` bits. The
-         * name `live` is traditional because most of the time a non-null
-         * `live` bits means that there are deleted documents. But `live`
-         * might also be non-null if document level security is enabled.
-         */
-        return false;
-    }
-    /*
-     * We can only use metadata if we're not using the special docCount
-     * field. Otherwise we wouldn't know how many documents each Lucene
-     * document represents.
-     */
-    return counter.docCount.alwaysOne();
-}
-
 /**
  * Make a filter that matches this filter and the provided query.
  * <p>
@@ -223,6 +190,24 @@ IntPredicate matchingDocIds(LeafReaderContext ctx) throws IOException {
  * Count the number of documents that match this filter in a leaf.
  */
 long count(LeafReaderContext ctx, FiltersAggregator.Counter counter, Bits live) throws IOException {
+    /*
+     * weight().count will return the count of matches for ctx if it can
+     * do so in constant time, otherwise -1. The Weight is responsible for
+     * all of the cases where it can't return an accurate count *except*
+     * the doc_count field. That thing is ours, not Lucene's.
+     *
+     * For example, TermQuery will return -1 if there are deleted docs,
+     * otherwise it'll return the number of documents with the term from
+     * the term statistics. MatchAllDocsQuery will call `ctx.reader().numDocs()`
+     * to get the number of live docs.
+     */
+    if (counter.docCount.alwaysOne()) {
+        int count = weight().count(ctx);
+        if (count != -1) {
+            segmentsCountedInConstantTime++;
+            return count;
+        }
+    }
     BulkScorer scorer = weight().bulkScorer(ctx);
     if (scorer == null) {
         // No hits in this segment.
@@ -257,6 +242,7 @@ void collect(LeafReaderContext ctx, LeafCollector collector, Bits live) throws IOException {
  */
 void collectDebugInfo(BiConsumer<String, Object> add) {
     add.accept("query", query.toString());
+    add.accept("segments_counted_in_constant_time", segmentsCountedInConstantTime);
 }

 private Weight weight() throws IOException {

This file was deleted.
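
Worth spelling out, since it is the one case the new code keeps away from Lucene: with the _doc_count field a single Lucene document can represent many pre-aggregated documents, so a constant-time count of Lucene documents would under-count. A sketch of the idea (illustration only; matchingDocIds and docCounts are invented names, not the commit's API):

import java.util.Map;

class DocCountSketch {
    /**
     * Why the constant-time shortcut is guarded by counter.docCount.alwaysOne():
     * Weight#count counts Lucene documents, but with _doc_count each match may
     * stand in for several documents, so the matches must be walked and summed.
     */
    static long countWithDocCount(int[] matchingDocIds, Map<Integer, Long> docCounts) {
        long total = 0;
        for (int docId : matchingDocIds) {
            // Documents without an explicit _doc_count represent one document.
            total += docCounts.getOrDefault(docId, 1L);
        }
        return total;
    }
}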
