From 238e405455354bfc226c1c4e1ad3b902739fdbfb Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Thu, 23 Oct 2014 19:19:36 +0200 Subject: [PATCH] Aggregations: Return the sum of the doc counts of other buckets. This commit adds a new field to the response of the terms aggregation called `sum_other_doc_count` which is equal to the sum of the doc counts of the buckets that did not make it to the list of top buckets. It is typically useful to have a sector called eg. `other` when using terms aggregations to build pie charts. Example query and response: ```json GET test/_search?search_type=count { "aggs": { "colors": { "terms": { "field": "color", "size": 3 } } } } ``` ```json { [...], "aggregations": { "colors": { "doc_count_error_upper_bound": 0, "sum_other_doc_count": 4, "buckets": [ { "key": "blue", "doc_count": 65 }, { "key": "red", "doc_count": 14 }, { "key": "brown", "doc_count": 3 } ] } } } ``` Close #8213 --- .../bucket/terms-aggregation.asciidoc | 10 ++- .../terms/AbstractStringTermsAggregator.java | 2 +- .../bucket/terms/DoubleTerms.java | 15 +++- .../bucket/terms/DoubleTermsAggregator.java | 2 +- .../GlobalOrdinalsStringTermsAggregator.java | 5 +- .../bucket/terms/InternalTerms.java | 21 ++++- .../aggregations/bucket/terms/LongTerms.java | 15 +++- .../bucket/terms/LongTermsAggregator.java | 7 +- .../bucket/terms/StringTerms.java | 15 +++- .../bucket/terms/StringTermsAggregator.java | 10 +-- .../aggregations/bucket/terms/Terms.java | 6 ++ .../bucket/terms/UnmappedTerms.java | 5 +- .../bucket/AbstractTermsTests.java | 82 +++++++++++++++++++ .../aggregations/bucket/DoubleTermsTests.java | 6 +- .../aggregations/bucket/LongTermsTests.java | 6 +- .../aggregations/bucket/MinDocCountTests.java | 6 +- ...ignificantTermsSignificanceScoreTests.java | 4 +- .../aggregations/bucket/StringTermsTests.java | 13 +-- 18 files changed, 187 insertions(+), 43 deletions(-) create mode 100644 src/test/java/org/elasticsearch/search/aggregations/bucket/AbstractTermsTests.java diff --git a/docs/reference/search/aggregations/bucket/terms-aggregation.asciidoc b/docs/reference/search/aggregations/bucket/terms-aggregation.asciidoc index f828f8884af2c..96d43be4c88a8 100644 --- a/docs/reference/search/aggregations/bucket/terms-aggregation.asciidoc +++ b/docs/reference/search/aggregations/bucket/terms-aggregation.asciidoc @@ -25,7 +25,9 @@ Response: "aggregations" : { "genders" : { - "buckets" : [ + "doc_count_error_upper_bound": 0, <1> + "sum_other_doc_count": 0, <2> + "buckets" : [ <3> { "key" : "male", "doc_count" : 10 @@ -40,6 +42,10 @@ Response: } -------------------------------------------------- +<1> an upper bound of the error on the document counts for each term, see <> +<2> when there are lots of unique terms, elasticsearch only returns the top terms; this number is the sum of the document counts for all buckets that are not part of the response +<3> the list of the top buckets, the meaning of `top` being defined by the <> + By default, the `terms` aggregation will return the buckets for the top ten terms ordered by the `doc_count`. One can change this default behaviour by setting the `size` parameter. @@ -52,6 +58,7 @@ This means that if the number of unique terms is greater than `size`, the return (it could be that the term counts are slightly off and it could even be that a term that should have been in the top size buckets was not returned). If set to `0`, the `size` will be set to `Integer.MAX_VALUE`. +[[search-aggregations-bucket-terms-aggregation-approximate-counts]] ==== Document counts are approximate As described above, the document counts (and the results of any sub aggregations) in the terms aggregation are not always @@ -226,6 +233,7 @@ does not return a particular term which appears in the results from another shar aggregation is either sorted by a sub aggregation or in order of ascending document count, the error in the document counts cannot be determined and is given a value of -1 to indicate this. +[[search-aggregations-bucket-terms-aggregation-order]] ==== Order The order of the buckets can be customized by setting the `order` parameter. By default, the buckets are ordered by diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractStringTermsAggregator.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractStringTermsAggregator.java index 571f2627a7a35..3ea1bad365f58 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractStringTermsAggregator.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/AbstractStringTermsAggregator.java @@ -45,7 +45,7 @@ public boolean shouldCollect() { @Override public InternalAggregation buildEmptyAggregation() { - return new StringTerms(name, order, bucketCountThresholds.getRequiredSize(), bucketCountThresholds.getShardSize(), bucketCountThresholds.getMinDocCount(), Collections.emptyList(), showTermDocCountError, 0); + return new StringTerms(name, order, bucketCountThresholds.getRequiredSize(), bucketCountThresholds.getShardSize(), bucketCountThresholds.getMinDocCount(), Collections.emptyList(), showTermDocCountError, 0, 0); } } diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/DoubleTerms.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/DoubleTerms.java index be8313e6cd769..6034970a774e0 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/DoubleTerms.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/DoubleTerms.java @@ -98,8 +98,8 @@ Bucket newBucket(long docCount, InternalAggregations aggs, long docCountError) { DoubleTerms() {} // for serialization - public DoubleTerms(String name, InternalOrder order, @Nullable ValueFormatter formatter, int requiredSize, int shardSize, long minDocCount, List buckets, boolean showTermDocCountError, long docCountError) { - super(name, order, requiredSize, shardSize, minDocCount, buckets, showTermDocCountError, docCountError); + public DoubleTerms(String name, InternalOrder order, @Nullable ValueFormatter formatter, int requiredSize, int shardSize, long minDocCount, List buckets, boolean showTermDocCountError, long docCountError, long otherDocCount) { + super(name, order, requiredSize, shardSize, minDocCount, buckets, showTermDocCountError, docCountError, otherDocCount); this.formatter = formatter; } @@ -109,8 +109,8 @@ public Type type() { } @Override - protected InternalTerms newAggregation(String name, List buckets, boolean showTermDocCountError, long docCountError) { - return new DoubleTerms(name, order, formatter, requiredSize, shardSize, minDocCount, buckets, showTermDocCountError, docCountError); + protected InternalTerms newAggregation(String name, List buckets, boolean showTermDocCountError, long docCountError, long otherDocCount) { + return new DoubleTerms(name, order, formatter, requiredSize, shardSize, minDocCount, buckets, showTermDocCountError, docCountError, otherDocCount); } @Override @@ -132,6 +132,9 @@ public void readFrom(StreamInput in) throws IOException { this.showTermDocCountError = false; } this.minDocCount = in.readVLong(); + if (in.getVersion().onOrAfter(Version.V_1_4_0)) { + this.otherDocCount = in.readVLong(); + } int size = in.readVInt(); List buckets = new ArrayList<>(size); for (int i = 0; i < size; i++) { @@ -162,6 +165,9 @@ public void writeTo(StreamOutput out) throws IOException { out.writeBoolean(showTermDocCountError); } out.writeVLong(minDocCount); + if (out.getVersion().onOrAfter(Version.V_1_4_0)) { + out.writeVLong(otherDocCount); + } out.writeVInt(buckets.size()); for (InternalTerms.Bucket bucket : buckets) { out.writeDouble(((Bucket) bucket).term); @@ -176,6 +182,7 @@ public void writeTo(StreamOutput out) throws IOException { @Override public XContentBuilder doXContentBody(XContentBuilder builder, Params params) throws IOException { builder.field(InternalTerms.DOC_COUNT_ERROR_UPPER_BOUND_FIELD_NAME, docCountError); + builder.field(SUM_OF_OTHER_DOC_COUNTS, otherDocCount); builder.startArray(CommonFields.BUCKETS); for (InternalTerms.Bucket bucket : buckets) { builder.startObject(); diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/DoubleTermsAggregator.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/DoubleTermsAggregator.java index 94cdcb30f0745..9e106df8f55e0 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/DoubleTermsAggregator.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/DoubleTermsAggregator.java @@ -69,7 +69,7 @@ private static DoubleTerms convertToDouble(LongTerms terms) { for (int i = 0; i < buckets.length; ++i) { buckets[i] = convertToDouble(buckets[i]); } - return new DoubleTerms(terms.getName(), terms.order, terms.formatter, terms.requiredSize, terms.shardSize, terms.minDocCount, Arrays.asList(buckets), terms.showTermDocCountError, terms.docCountError); + return new DoubleTerms(terms.getName(), terms.order, terms.formatter, terms.requiredSize, terms.shardSize, terms.minDocCount, Arrays.asList(buckets), terms.showTermDocCountError, terms.docCountError, terms.otherDocCount); } } diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java index 5a06ee835c0c9..c5a2340f0a9ff 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java @@ -150,6 +150,7 @@ public InternalAggregation buildAggregation(long owningBucketOrdinal) { } else { size = (int) Math.min(maxBucketOrd(), bucketCountThresholds.getShardSize()); } + long otherDocCount = 0; BucketPriorityQueue ordered = new BucketPriorityQueue(size, order.comparator(this)); OrdBucket spare = new OrdBucket(-1, 0, null, showTermDocCountError, 0); for (long globalTermOrd = 0; globalTermOrd < globalOrds.getValueCount(); ++globalTermOrd) { @@ -161,6 +162,7 @@ public InternalAggregation buildAggregation(long owningBucketOrdinal) { if (bucketCountThresholds.getMinDocCount() > 0 && bucketDocCount == 0) { continue; } + otherDocCount += bucketDocCount; spare.globalOrd = globalTermOrd; spare.bucketOrd = bucketOrd; spare.docCount = bucketDocCount; @@ -182,6 +184,7 @@ public InternalAggregation buildAggregation(long owningBucketOrdinal) { copy(globalOrds.lookupOrd(bucket.globalOrd), scratch); list[i] = new StringTerms.Bucket(scratch, bucket.docCount, null, showTermDocCountError, 0); list[i].bucketOrd = bucket.bucketOrd; + otherDocCount -= list[i].docCount; } //replay any deferred collections runDeferredCollections(survivingBucketOrds); @@ -193,7 +196,7 @@ public InternalAggregation buildAggregation(long owningBucketOrdinal) { bucket.docCountError = 0; } - return new StringTerms(name, order, bucketCountThresholds.getRequiredSize(), bucketCountThresholds.getShardSize(), bucketCountThresholds.getMinDocCount(), Arrays.asList(list), showTermDocCountError, 0); + return new StringTerms(name, order, bucketCountThresholds.getRequiredSize(), bucketCountThresholds.getShardSize(), bucketCountThresholds.getMinDocCount(), Arrays.asList(list), showTermDocCountError, 0, otherDocCount); } /** This is used internally only, just for compare using global ordinal instead of term bytes in the PQ */ diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalTerms.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalTerms.java index af57d89fc41c8..df706de3d0e02 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalTerms.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/InternalTerms.java @@ -37,6 +37,7 @@ public abstract class InternalTerms extends InternalAggregation implements Terms, ToXContent, Streamable { protected static final String DOC_COUNT_ERROR_UPPER_BOUND_FIELD_NAME = "doc_count_error_upper_bound"; + protected static final String SUM_OF_OTHER_DOC_COUNTS = "sum_other_doc_count"; public static abstract class Bucket extends Terms.Bucket { @@ -104,10 +105,11 @@ public Bucket reduce(List buckets, ReduceContext context) { protected Map bucketMap; protected long docCountError; protected boolean showTermDocCountError; + protected long otherDocCount; protected InternalTerms() {} // for serialization - protected InternalTerms(String name, InternalOrder order, int requiredSize, int shardSize, long minDocCount, List buckets, boolean showTermDocCountError, long docCountError) { + protected InternalTerms(String name, InternalOrder order, int requiredSize, int shardSize, long minDocCount, List buckets, boolean showTermDocCountError, long docCountError, long otherDocCount) { super(name); this.order = order; this.requiredSize = requiredSize; @@ -116,6 +118,7 @@ protected InternalTerms(String name, InternalOrder order, int requiredSize, int this.buckets = buckets; this.showTermDocCountError = showTermDocCountError; this.docCountError = docCountError; + this.otherDocCount = otherDocCount; } @Override @@ -139,14 +142,21 @@ public long getDocCountError() { return docCountError; } + @Override + public long getSumOfOtherDocCounts() { + return otherDocCount; + } + @Override public InternalAggregation reduce(ReduceContext reduceContext) { List aggregations = reduceContext.aggregations(); Multimap buckets = ArrayListMultimap.create(); long sumDocCountError = 0; + long otherDocCount = 0; for (InternalAggregation aggregation : aggregations) { InternalTerms terms = (InternalTerms) aggregation; + otherDocCount += terms.getSumOfOtherDocCounts(); final long thisAggDocCountError; if (terms.buckets.size() < this.shardSize || this.order == InternalOrder.TERM_ASC || this.order == InternalOrder.TERM_DESC) { thisAggDocCountError = 0; @@ -182,7 +192,10 @@ public InternalAggregation reduce(ReduceContext reduceContext) { } } if (b.docCount >= minDocCount) { - ordered.insertWithOverflow(b); + Terms.Bucket removed = ordered.insertWithOverflow(b); + if (removed != null) { + otherDocCount += removed.getDocCount(); + } } } Bucket[] list = new Bucket[ordered.size()]; @@ -195,9 +208,9 @@ public InternalAggregation reduce(ReduceContext reduceContext) { } else { docCountError = aggregations.size() == 1 ? 0 : sumDocCountError; } - return newAggregation(name, Arrays.asList(list), showTermDocCountError, docCountError); + return newAggregation(name, Arrays.asList(list), showTermDocCountError, docCountError, otherDocCount); } - protected abstract InternalTerms newAggregation(String name, List buckets, boolean showTermDocCountError, long docCountError); + protected abstract InternalTerms newAggregation(String name, List buckets, boolean showTermDocCountError, long docCountError, long otherDocCount); } diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongTerms.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongTerms.java index 0b05bd91b089a..6a47a577cc634 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongTerms.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongTerms.java @@ -99,8 +99,8 @@ Bucket newBucket(long docCount, InternalAggregations aggs, long docCountError) { LongTerms() {} // for serialization - public LongTerms(String name, InternalOrder order, @Nullable ValueFormatter formatter, int requiredSize, int shardSize, long minDocCount, List buckets, boolean showTermDocCountError, long docCountError) { - super(name, order, requiredSize, shardSize, minDocCount, buckets, showTermDocCountError, docCountError); + public LongTerms(String name, InternalOrder order, @Nullable ValueFormatter formatter, int requiredSize, int shardSize, long minDocCount, List buckets, boolean showTermDocCountError, long docCountError, long otherDocCount) { + super(name, order, requiredSize, shardSize, minDocCount, buckets, showTermDocCountError, docCountError, otherDocCount); this.formatter = formatter; } @@ -110,8 +110,8 @@ public Type type() { } @Override - protected InternalTerms newAggregation(String name, List buckets, boolean showTermDocCountError, long docCountError) { - return new LongTerms(name, order, formatter, requiredSize, shardSize, minDocCount, buckets, showTermDocCountError, docCountError); + protected InternalTerms newAggregation(String name, List buckets, boolean showTermDocCountError, long docCountError, long otherDocCount) { + return new LongTerms(name, order, formatter, requiredSize, shardSize, minDocCount, buckets, showTermDocCountError, docCountError, otherDocCount); } @Override @@ -133,6 +133,9 @@ public void readFrom(StreamInput in) throws IOException { this.showTermDocCountError = false; } this.minDocCount = in.readVLong(); + if (in.getVersion().onOrAfter(Version.V_1_4_0)) { + this.otherDocCount = in.readVLong(); + } int size = in.readVInt(); List buckets = new ArrayList<>(size); for (int i = 0; i < size; i++) { @@ -163,6 +166,9 @@ public void writeTo(StreamOutput out) throws IOException { out.writeBoolean(showTermDocCountError); } out.writeVLong(minDocCount); + if (out.getVersion().onOrAfter(Version.V_1_4_0)) { + out.writeVLong(otherDocCount); + } out.writeVInt(buckets.size()); for (InternalTerms.Bucket bucket : buckets) { out.writeLong(((Bucket) bucket).term); @@ -177,6 +183,7 @@ public void writeTo(StreamOutput out) throws IOException { @Override public XContentBuilder doXContentBody(XContentBuilder builder, Params params) throws IOException { builder.field(InternalTerms.DOC_COUNT_ERROR_UPPER_BOUND_FIELD_NAME, docCountError); + builder.field(SUM_OF_OTHER_DOC_COUNTS, otherDocCount); builder.startArray(CommonFields.BUCKETS); for (InternalTerms.Bucket bucket : buckets) { builder.startObject(); diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongTermsAggregator.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongTermsAggregator.java index cceabacd86845..c8d89940f57b6 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongTermsAggregator.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/LongTermsAggregator.java @@ -115,6 +115,7 @@ public InternalAggregation buildAggregation(long owningBucketOrdinal) { final int size = (int) Math.min(bucketOrds.size(), bucketCountThresholds.getShardSize()); + long otherDocCount = 0; BucketPriorityQueue ordered = new BucketPriorityQueue(size, order.comparator(this)); LongTerms.Bucket spare = null; for (long i = 0; i < bucketOrds.size(); i++) { @@ -123,6 +124,7 @@ public InternalAggregation buildAggregation(long owningBucketOrdinal) { } spare.term = bucketOrds.get(i); spare.docCount = bucketDocCount(i); + otherDocCount += spare.docCount; spare.bucketOrd = i; if (bucketCountThresholds.getShardMinDocCount() <= spare.docCount) { spare = (LongTerms.Bucket) ordered.insertWithOverflow(spare); @@ -136,6 +138,7 @@ public InternalAggregation buildAggregation(long owningBucketOrdinal) { final LongTerms.Bucket bucket = (LongTerms.Bucket) ordered.pop(); survivingBucketOrds[i] = bucket.bucketOrd; list[i] = bucket; + otherDocCount -= bucket.docCount; } runDeferredCollections(survivingBucketOrds); @@ -146,13 +149,13 @@ public InternalAggregation buildAggregation(long owningBucketOrdinal) { list[i].docCountError = 0; } - return new LongTerms(name, order, formatter, bucketCountThresholds.getRequiredSize(), bucketCountThresholds.getShardSize(), bucketCountThresholds.getMinDocCount(), Arrays.asList(list), showTermDocCountError, 0); + return new LongTerms(name, order, formatter, bucketCountThresholds.getRequiredSize(), bucketCountThresholds.getShardSize(), bucketCountThresholds.getMinDocCount(), Arrays.asList(list), showTermDocCountError, 0, otherDocCount); } @Override public InternalAggregation buildEmptyAggregation() { - return new LongTerms(name, order, formatter, bucketCountThresholds.getRequiredSize(), bucketCountThresholds.getShardSize(), bucketCountThresholds.getMinDocCount(), Collections.emptyList(), showTermDocCountError, 0); + return new LongTerms(name, order, formatter, bucketCountThresholds.getRequiredSize(), bucketCountThresholds.getShardSize(), bucketCountThresholds.getMinDocCount(), Collections.emptyList(), showTermDocCountError, 0, 0); } @Override diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringTerms.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringTerms.java index b046511d65e68..2999f19fc5955 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringTerms.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringTerms.java @@ -98,8 +98,8 @@ Bucket newBucket(long docCount, InternalAggregations aggs, long docCountError) { StringTerms() {} // for serialization - public StringTerms(String name, InternalOrder order, int requiredSize, int shardSize, long minDocCount, List buckets, boolean showTermDocCountError, long docCountError) { - super(name, order, requiredSize, shardSize, minDocCount, buckets, showTermDocCountError, docCountError); + public StringTerms(String name, InternalOrder order, int requiredSize, int shardSize, long minDocCount, List buckets, boolean showTermDocCountError, long docCountError, long otherDocCount) { + super(name, order, requiredSize, shardSize, minDocCount, buckets, showTermDocCountError, docCountError, otherDocCount); } @Override @@ -108,8 +108,8 @@ public Type type() { } @Override - protected InternalTerms newAggregation(String name, List buckets, boolean showTermDocCountError, long docCountError) { - return new StringTerms(name, order, requiredSize, shardSize, minDocCount, buckets, showTermDocCountError, docCountError); + protected InternalTerms newAggregation(String name, List buckets, boolean showTermDocCountError, long docCountError, long otherDocCount) { + return new StringTerms(name, order, requiredSize, shardSize, minDocCount, buckets, showTermDocCountError, docCountError, otherDocCount); } @Override @@ -130,6 +130,9 @@ public void readFrom(StreamInput in) throws IOException { this.showTermDocCountError = false; } this.minDocCount = in.readVLong(); + if (in.getVersion().onOrAfter(Version.V_1_4_0)) { + this.otherDocCount = in.readVLong(); + } int size = in.readVInt(); List buckets = new ArrayList<>(size); for (int i = 0; i < size; i++) { @@ -159,6 +162,9 @@ public void writeTo(StreamOutput out) throws IOException { out.writeBoolean(showTermDocCountError); } out.writeVLong(minDocCount); + if (out.getVersion().onOrAfter(Version.V_1_4_0)) { + out.writeVLong(otherDocCount); + } out.writeVInt(buckets.size()); for (InternalTerms.Bucket bucket : buckets) { out.writeBytesRef(((Bucket) bucket).termBytes); @@ -173,6 +179,7 @@ public void writeTo(StreamOutput out) throws IOException { @Override public XContentBuilder doXContentBody(XContentBuilder builder, Params params) throws IOException { builder.field(InternalTerms.DOC_COUNT_ERROR_UPPER_BOUND_FIELD_NAME, docCountError); + builder.field(SUM_OF_OTHER_DOC_COUNTS, otherDocCount); builder.startArray(CommonFields.BUCKETS); for (InternalTerms.Bucket bucket : buckets) { builder.startObject(); diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringTermsAggregator.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringTermsAggregator.java index 5f6f683ae69ab..ce82a2839b158 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringTermsAggregator.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/StringTermsAggregator.java @@ -120,6 +120,7 @@ public InternalAggregation buildAggregation(long owningBucketOrdinal) { final int size = (int) Math.min(bucketOrds.size(), bucketCountThresholds.getShardSize()); + long otherDocCount = 0; BucketPriorityQueue ordered = new BucketPriorityQueue(size, order.comparator(this)); StringTerms.Bucket spare = null; for (int i = 0; i < bucketOrds.size(); i++) { @@ -128,6 +129,7 @@ public InternalAggregation buildAggregation(long owningBucketOrdinal) { } bucketOrds.get(i, spare.termBytes); spare.docCount = bucketDocCount(i); + otherDocCount += spare.docCount; spare.bucketOrd = i; if (bucketCountThresholds.getShardMinDocCount() <= spare.docCount) { spare = (StringTerms.Bucket) ordered.insertWithOverflow(spare); @@ -141,6 +143,7 @@ public InternalAggregation buildAggregation(long owningBucketOrdinal) { final StringTerms.Bucket bucket = (StringTerms.Bucket) ordered.pop(); survivingBucketOrds[i] = bucket.bucketOrd; list[i] = bucket; + otherDocCount -= bucket.docCount; } // replay any deferred collections runDeferredCollections(survivingBucketOrds); @@ -153,12 +156,7 @@ public InternalAggregation buildAggregation(long owningBucketOrdinal) { bucket.docCountError = 0; } - return new StringTerms(name, order, bucketCountThresholds.getRequiredSize(), bucketCountThresholds.getShardSize(), bucketCountThresholds.getMinDocCount(), Arrays.asList(list), showTermDocCountError, 0); - } - - @Override - public InternalAggregation buildEmptyAggregation() { - return new StringTerms(name, order, bucketCountThresholds.getRequiredSize(), bucketCountThresholds.getShardSize(), bucketCountThresholds.getMinDocCount(), Collections.emptyList(), showTermDocCountError, 0); + return new StringTerms(name, order, bucketCountThresholds.getRequiredSize(), bucketCountThresholds.getShardSize(), bucketCountThresholds.getMinDocCount(), Arrays.asList(list), showTermDocCountError, 0, otherDocCount); } @Override diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/Terms.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/Terms.java index 236c083f7c2c2..a3116519800d8 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/Terms.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/Terms.java @@ -85,6 +85,12 @@ static abstract class Bucket implements MultiBucketsAggregation.Bucket { */ long getDocCountError(); + /** + * Return the sum of the document counts of all buckets that did not make + * it to the top buckets. + */ + long getSumOfOtherDocCounts(); + /** * Determines the order by which the term buckets will be sorted */ diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/UnmappedTerms.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/UnmappedTerms.java index a09faef45ccff..ef260855daf3f 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/UnmappedTerms.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/UnmappedTerms.java @@ -55,7 +55,7 @@ public static void registerStreams() { UnmappedTerms() {} // for serialization public UnmappedTerms(String name, InternalOrder order, int requiredSize, int shardSize, long minDocCount) { - super(name, order, requiredSize, shardSize, minDocCount, BUCKETS, false, 0); + super(name, order, requiredSize, shardSize, minDocCount, BUCKETS, false, 0, 0); } @Override @@ -93,13 +93,14 @@ public InternalAggregation reduce(ReduceContext reduceContext) { } @Override - protected InternalTerms newAggregation(String name, List buckets, boolean showTermDocCountError, long docCountError) { + protected InternalTerms newAggregation(String name, List buckets, boolean showTermDocCountError, long docCountError, long otherDocCount) { throw new UnsupportedOperationException("How did you get there?"); } @Override public XContentBuilder doXContentBody(XContentBuilder builder, Params params) throws IOException { builder.field(InternalTerms.DOC_COUNT_ERROR_UPPER_BOUND_FIELD_NAME, docCountError); + builder.field(SUM_OF_OTHER_DOC_COUNTS, 0); builder.startArray(CommonFields.BUCKETS).endArray(); return builder; } diff --git a/src/test/java/org/elasticsearch/search/aggregations/bucket/AbstractTermsTests.java b/src/test/java/org/elasticsearch/search/aggregations/bucket/AbstractTermsTests.java new file mode 100644 index 0000000000000..239479564664b --- /dev/null +++ b/src/test/java/org/elasticsearch/search/aggregations/bucket/AbstractTermsTests.java @@ -0,0 +1,82 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.search.aggregations.bucket; + +import org.elasticsearch.action.search.SearchResponse; +import org.elasticsearch.search.aggregations.Aggregator.SubAggCollectionMode; +import org.elasticsearch.search.aggregations.bucket.terms.Terms; +import org.elasticsearch.search.aggregations.bucket.terms.TermsAggregatorFactory.ExecutionMode; +import org.elasticsearch.test.ElasticsearchIntegrationTest; +import org.junit.Ignore; + +import static org.elasticsearch.search.aggregations.AggregationBuilders.terms; +import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertSearchResponse; + +@Ignore +public abstract class AbstractTermsTests extends ElasticsearchIntegrationTest { + + public String randomExecutionHint() { + return randomBoolean() ? null : randomFrom(ExecutionMode.values()).toString(); + } + + private static long sumOfDocCounts(Terms terms) { + long sumOfDocCounts = terms.getSumOfOtherDocCounts(); + for (Terms.Bucket b : terms.getBuckets()) { + sumOfDocCounts += b.getDocCount(); + } + return sumOfDocCounts; + } + + public void testOtherDocCount(String... fieldNames) { + for (String fieldName : fieldNames) { + SearchResponse allTerms = client().prepareSearch("idx") + .addAggregation(terms("terms") + .executionHint(randomExecutionHint()) + .field(fieldName) + .size(0) + .collectMode(randomFrom(SubAggCollectionMode.values()))) + .get(); + assertSearchResponse(allTerms); + + Terms terms = allTerms.getAggregations().get("terms"); + assertEquals(0, terms.getSumOfOtherDocCounts()); // size is 0 + final long sumOfDocCounts = sumOfDocCounts(terms); + final int totalNumTerms = terms.getBuckets().size(); + + for (int size = 1; size < totalNumTerms + 2; size += randomIntBetween(1, 5)) { + for (int shardSize = size; shardSize <= totalNumTerms + 2; shardSize += randomIntBetween(1, 5)) { + SearchResponse resp = client().prepareSearch("idx") + .addAggregation(terms("terms") + .executionHint(randomExecutionHint()) + .field(fieldName) + .size(size) + .shardSize(shardSize) + .collectMode(randomFrom(SubAggCollectionMode.values()))) + .get(); + assertSearchResponse(resp); + terms = resp.getAggregations().get("terms"); + assertEquals(Math.min(size, totalNumTerms), terms.getBuckets().size()); + assertEquals(sumOfDocCounts, sumOfDocCounts(terms)); + } + } + } + } + +} diff --git a/src/test/java/org/elasticsearch/search/aggregations/bucket/DoubleTermsTests.java b/src/test/java/org/elasticsearch/search/aggregations/bucket/DoubleTermsTests.java index 886e7abf0ce05..ab239f67e0ea1 100644 --- a/src/test/java/org/elasticsearch/search/aggregations/bucket/DoubleTermsTests.java +++ b/src/test/java/org/elasticsearch/search/aggregations/bucket/DoubleTermsTests.java @@ -54,7 +54,7 @@ * */ @ElasticsearchIntegrationTest.SuiteScopeTest -public class DoubleTermsTests extends ElasticsearchIntegrationTest { +public class DoubleTermsTests extends AbstractTermsTests { private static final int NUM_DOCS = 5; // TODO: randomize the size? private static final String SINGLE_VALUED_FIELD_NAME = "d_value"; @@ -1046,4 +1046,8 @@ public void script_Score() { } } + @Test + public void otherDocCount() { + testOtherDocCount(SINGLE_VALUED_FIELD_NAME, MULTI_VALUED_FIELD_NAME); + } } diff --git a/src/test/java/org/elasticsearch/search/aggregations/bucket/LongTermsTests.java b/src/test/java/org/elasticsearch/search/aggregations/bucket/LongTermsTests.java index a09d33f25c5b2..a2361b81a566d 100644 --- a/src/test/java/org/elasticsearch/search/aggregations/bucket/LongTermsTests.java +++ b/src/test/java/org/elasticsearch/search/aggregations/bucket/LongTermsTests.java @@ -52,7 +52,7 @@ * */ @ElasticsearchIntegrationTest.SuiteScopeTest -public class LongTermsTests extends ElasticsearchIntegrationTest { +public class LongTermsTests extends AbstractTermsTests { private static final int NUM_DOCS = 5; // TODO randomize the size? private static final String SINGLE_VALUED_FIELD_NAME = "l_value"; @@ -1020,4 +1020,8 @@ public void singleValuedField_OrderedByMultiValueExtendedStatsAsc() throws Excep } + @Test + public void otherDocCount() { + testOtherDocCount(SINGLE_VALUED_FIELD_NAME, MULTI_VALUED_FIELD_NAME); + } } diff --git a/src/test/java/org/elasticsearch/search/aggregations/bucket/MinDocCountTests.java b/src/test/java/org/elasticsearch/search/aggregations/bucket/MinDocCountTests.java index 817dcbc70f154..8dc40a2527998 100644 --- a/src/test/java/org/elasticsearch/search/aggregations/bucket/MinDocCountTests.java +++ b/src/test/java/org/elasticsearch/search/aggregations/bucket/MinDocCountTests.java @@ -49,7 +49,7 @@ @ElasticsearchIntegrationTest.SuiteScopeTest @TestLogging("action.admin.indices.refresh:TRACE,action.search.type:TRACE,cluster.service:TRACE") -public class MinDocCountTests extends ElasticsearchIntegrationTest { +public class MinDocCountTests extends AbstractTermsTests { private static final QueryBuilder QUERY = QueryBuilders.termQuery("match", true); @@ -275,7 +275,7 @@ private void testMinDocCountOnTerms(String field, Script script, Terms.Order ord .setQuery(QUERY) .addAggregation(script.apply(terms("terms"), field) .collectMode(randomFrom(SubAggCollectionMode.values())) - .executionHint(StringTermsTests.randomExecutionHint()) + .executionHint(randomExecutionHint()) .order(order) .size(cardinality + randomInt(10)) .minDocCount(0)) @@ -292,7 +292,7 @@ private void testMinDocCountOnTerms(String field, Script script, Terms.Order ord .setQuery(QUERY) .addAggregation(script.apply(terms("terms"), field) .collectMode(randomFrom(SubAggCollectionMode.values())) - .executionHint(StringTermsTests.randomExecutionHint()) + .executionHint(randomExecutionHint()) .order(order) .size(size) .include(include) diff --git a/src/test/java/org/elasticsearch/search/aggregations/bucket/SignificantTermsSignificanceScoreTests.java b/src/test/java/org/elasticsearch/search/aggregations/bucket/SignificantTermsSignificanceScoreTests.java index 17dedfab20da6..525fde3d800da 100644 --- a/src/test/java/org/elasticsearch/search/aggregations/bucket/SignificantTermsSignificanceScoreTests.java +++ b/src/test/java/org/elasticsearch/search/aggregations/bucket/SignificantTermsSignificanceScoreTests.java @@ -255,9 +255,9 @@ public void testXContentResponse() throws Exception { classes.toXContent(responseBuilder, null); String result = null; if (type.equals("long")) { - result = "\"class\"{\"doc_count_error_upper_bound\":0,\"buckets\":[{\"key\":\"0\",\"doc_count\":4,\"sig_terms\":{\"doc_count\":4,\"buckets\":[{\"key\":0,\"key_as_string\":\"0\",\"doc_count\":4,\"score\":0.39999999999999997,\"bg_count\":5}]}},{\"key\":\"1\",\"doc_count\":3,\"sig_terms\":{\"doc_count\":3,\"buckets\":[{\"key\":1,\"key_as_string\":\"1\",\"doc_count\":3,\"score\":0.75,\"bg_count\":4}]}}]}"; + result = "\"class\"{\"doc_count_error_upper_bound\":0,\"sum_other_doc_count\":0,\"buckets\":[{\"key\":\"0\",\"doc_count\":4,\"sig_terms\":{\"doc_count\":4,\"buckets\":[{\"key\":0,\"key_as_string\":\"0\",\"doc_count\":4,\"score\":0.39999999999999997,\"bg_count\":5}]}},{\"key\":\"1\",\"doc_count\":3,\"sig_terms\":{\"doc_count\":3,\"buckets\":[{\"key\":1,\"key_as_string\":\"1\",\"doc_count\":3,\"score\":0.75,\"bg_count\":4}]}}]}"; } else { - result = "\"class\"{\"doc_count_error_upper_bound\":0,\"buckets\":[{\"key\":\"0\",\"doc_count\":4,\"sig_terms\":{\"doc_count\":4,\"buckets\":[{\"key\":\"0\",\"doc_count\":4,\"score\":0.39999999999999997,\"bg_count\":5}]}},{\"key\":\"1\",\"doc_count\":3,\"sig_terms\":{\"doc_count\":3,\"buckets\":[{\"key\":\"1\",\"doc_count\":3,\"score\":0.75,\"bg_count\":4}]}}]}"; + result = "\"class\"{\"doc_count_error_upper_bound\":0,\"sum_other_doc_count\":0,\"buckets\":[{\"key\":\"0\",\"doc_count\":4,\"sig_terms\":{\"doc_count\":4,\"buckets\":[{\"key\":\"0\",\"doc_count\":4,\"score\":0.39999999999999997,\"bg_count\":5}]}},{\"key\":\"1\",\"doc_count\":3,\"sig_terms\":{\"doc_count\":3,\"buckets\":[{\"key\":\"1\",\"doc_count\":3,\"score\":0.75,\"bg_count\":4}]}}]}"; } assertThat(responseBuilder.string(), equalTo(result)); diff --git a/src/test/java/org/elasticsearch/search/aggregations/bucket/StringTermsTests.java b/src/test/java/org/elasticsearch/search/aggregations/bucket/StringTermsTests.java index 137181434b607..71af6097d3226 100644 --- a/src/test/java/org/elasticsearch/search/aggregations/bucket/StringTermsTests.java +++ b/src/test/java/org/elasticsearch/search/aggregations/bucket/StringTermsTests.java @@ -56,15 +56,11 @@ * */ @ElasticsearchIntegrationTest.SuiteScopeTest -public class StringTermsTests extends ElasticsearchIntegrationTest { +public class StringTermsTests extends AbstractTermsTests { private static final String SINGLE_VALUED_FIELD_NAME = "s_value"; private static final String MULTI_VALUED_FIELD_NAME = "s_values"; - public static String randomExecutionHint() { - return randomBoolean() ? null : randomFrom(ExecutionMode.values()).toString(); - } - @Override public void setupSuiteScopeCluster() throws Exception { createIndex("idx"); @@ -133,7 +129,7 @@ public void singleValueField() throws Exception { .collectMode(randomFrom(SubAggCollectionMode.values()))) .execute().actionGet(); - assertSearchResponse(response);System.out.println(response); + assertSearchResponse(response); Terms terms = response.getAggregations().get("terms"); assertThat(terms, notNullValue()); @@ -1399,4 +1395,9 @@ public void indexMetaField() throws Exception { terms = response.getAggregations().get("terms"); assertEquals(5L, terms.getBucketByKey("i").getDocCount()); } + + @Test + public void otherDocCount() { + testOtherDocCount(SINGLE_VALUED_FIELD_NAME, MULTI_VALUED_FIELD_NAME); + } }