Skip to content

Commit

Permalink
Add histogram field type support to boxplot aggs (#52265)
Browse files Browse the repository at this point in the history
Add support for the histogram field type to boxplot aggs.

Closes #52233
Relates to #33112
  • Loading branch information
imotov committed Feb 13, 2020
1 parent 24e36ba commit 0898df4
Show file tree
Hide file tree
Showing 8 changed files with 76 additions and 27 deletions.
1 change: 1 addition & 0 deletions docs/reference/aggregations/metrics.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ include::metrics/valuecount-aggregation.asciidoc[]

include::metrics/median-absolute-deviation-aggregation.asciidoc[]

include::metrics/boxplot-aggregation.asciidoc[]



Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
=== Boxplot Aggregation

A `boxplot` metrics aggregation that computes boxplot of numeric values extracted from the aggregated documents.
These values can be extracted either from specific numeric fields in the documents, or be generated by a provided script.
These values can be generated by a provided script or extracted from specific numeric or
<<histogram,histogram fields>> in the documents.

The `boxplot` aggregation returns essential information for making a https://en.wikipedia.org/wiki/Box_plot[box plot]: minimum, maximum
median, first quartile (25th percentile) and third quartile (75th percentile) values.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,7 @@ GET latency/_search

<1> Compression controls memory usage and approximation error

// tag::[t-digest]
// tag::t-digest[]
The TDigest algorithm uses a number of "nodes" to approximate percentiles -- the
more nodes available, the higher the accuracy (and large memory footprint) proportional
to the volume of data. The `compression` parameter limits the maximum number of
Expand All @@ -301,7 +301,7 @@ A "node" uses roughly 32 bytes of memory, so under worst-case scenarios (large a
of data which arrives sorted and in-order) the default settings will produce a
TDigest roughly 64KB in size. In practice data tends to be more random and
the TDigest will use less memory.
// tag::[t-digest]
// end::t-digest[]

==== HDR Histogram

Expand Down
1 change: 1 addition & 0 deletions docs/reference/mapping/types/histogram.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ following aggregations and queries:

* <<search-aggregations-metrics-percentile-aggregation,percentiles>> aggregation
* <<search-aggregations-metrics-percentile-rank-aggregation,percentile ranks>> aggregation
* <<search-aggregations-metrics-boxplot-aggregation,boxplot>> aggregation
* <<query-dsl-exists-query,exists>> query

[[mapping-types-histogram-building-histogram]]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,15 @@

import static org.elasticsearch.search.aggregations.metrics.PercentilesMethod.COMPRESSION_FIELD;

public class BoxplotAggregationBuilder extends ValuesSourceAggregationBuilder.LeafOnly<ValuesSource.Numeric,
public class BoxplotAggregationBuilder extends ValuesSourceAggregationBuilder.LeafOnly<ValuesSource,
BoxplotAggregationBuilder> {
public static final String NAME = "boxplot";

private static final ObjectParser<BoxplotAggregationBuilder, Void> PARSER;

static {
PARSER = new ObjectParser<>(BoxplotAggregationBuilder.NAME);
ValuesSourceParserHelper.declareNumericFields(PARSER, true, true, false);
ValuesSourceParserHelper.declareAnyFields(PARSER, true, true);
PARSER.declareDouble(BoxplotAggregationBuilder::compression, COMPRESSION_FIELD);
}

Expand Down Expand Up @@ -98,7 +98,7 @@ public double compression() {

@Override
protected BoxplotAggregatorFactory innerBuild(QueryShardContext queryShardContext,
ValuesSourceConfig<ValuesSource.Numeric> config,
ValuesSourceConfig<ValuesSource> config,
AggregatorFactory parent,
AggregatorFactories.Builder subFactoriesBuilder) throws IOException {
return new BoxplotAggregatorFactory(name, config, compression, queryShardContext, parent, subFactoriesBuilder, metaData);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
import org.elasticsearch.common.lease.Releasables;
import org.elasticsearch.common.util.BigArrays;
import org.elasticsearch.common.util.ObjectArray;
import org.elasticsearch.index.fielddata.HistogramValue;
import org.elasticsearch.index.fielddata.HistogramValues;
import org.elasticsearch.index.fielddata.SortedNumericDoubleValues;
import org.elasticsearch.search.DocValueFormat;
import org.elasticsearch.search.aggregations.Aggregator;
Expand All @@ -29,12 +31,12 @@

public class BoxplotAggregator extends NumericMetricsAggregator.MultiValue {

private final ValuesSource.Numeric valuesSource;
private final ValuesSource valuesSource;
private final DocValueFormat format;
protected ObjectArray<TDigestState> states;
protected final double compression;

BoxplotAggregator(String name, ValuesSource.Numeric valuesSource, DocValueFormat formatter, double compression,
BoxplotAggregator(String name, ValuesSource valuesSource, DocValueFormat formatter, double compression,
SearchContext context, Aggregator parent, List<PipelineAggregator> pipelineAggregators,
Map<String, Object> metaData) throws IOException {
super(name, context, parent, pipelineAggregators, metaData);
Expand All @@ -58,23 +60,38 @@ public LeafBucketCollector getLeafCollector(LeafReaderContext ctx,
return LeafBucketCollector.NO_OP_COLLECTOR;
}
final BigArrays bigArrays = context.bigArrays();
final SortedNumericDoubleValues values = valuesSource.doubleValues(ctx);
return new LeafBucketCollectorBase(sub, values) {
@Override
public void collect(int doc, long bucket) throws IOException {
states = bigArrays.grow(states, bucket + 1);

if (values.advanceExact(doc)) {
if (valuesSource instanceof ValuesSource.Histogram) {
final HistogramValues values = ((ValuesSource.Histogram)valuesSource).getHistogramValues(ctx);
return new LeafBucketCollectorBase(sub, values) {
@Override
public void collect(int doc, long bucket) throws IOException {
TDigestState state = getExistingOrNewHistogram(bigArrays, bucket);
if (values.advanceExact(doc)) {
final int valueCount = values.docValueCount();
for (int i = 0; i < valueCount; i++) {
state.add(values.nextValue());
final HistogramValue sketch = values.histogram();
while(sketch.next()) {
state.add(sketch.value(), sketch.count());
}
}
}
}
};
};
} else {
final SortedNumericDoubleValues values = ((ValuesSource.Numeric)valuesSource).doubleValues(ctx);
return new LeafBucketCollectorBase(sub, values) {
@Override
public void collect(int doc, long bucket) throws IOException {
states = bigArrays.grow(states, bucket + 1);
if (values.advanceExact(doc)) {
TDigestState state = getExistingOrNewHistogram(bigArrays, bucket);
if (values.advanceExact(doc)) {
final int valueCount = values.docValueCount();
for (int i = 0; i < valueCount; i++) {
state.add(values.nextValue());
}
}
}
}
};
}
}

private TDigestState getExistingOrNewHistogram(final BigArrays bigArrays, long bucket) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,12 @@
import java.util.List;
import java.util.Map;

public class BoxplotAggregatorFactory extends ValuesSourceAggregatorFactory<ValuesSource.Numeric> {
public class BoxplotAggregatorFactory extends ValuesSourceAggregatorFactory<ValuesSource> {

private final double compression;

BoxplotAggregatorFactory(String name,
ValuesSourceConfig<ValuesSource.Numeric> config,
ValuesSourceConfig<ValuesSource> config,
double compression,
QueryShardContext queryShardContext,
AggregatorFactory parent,
Expand All @@ -46,7 +46,7 @@ protected Aggregator createUnmapped(SearchContext searchContext,
}

@Override
protected Aggregator doCreateInternal(ValuesSource.Numeric valuesSource,
protected Aggregator doCreateInternal(ValuesSource valuesSource,
SearchContext searchContext,
Aggregator parent,
boolean collectsFromSingleBucket,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
import org.elasticsearch.search.aggregations.metrics.TDigestState;
import org.elasticsearch.test.ESSingleNodeTestCase;
import org.elasticsearch.xpack.analytics.AnalyticsPlugin;
import org.elasticsearch.xpack.analytics.boxplot.Boxplot;
import org.elasticsearch.xpack.analytics.boxplot.BoxplotAggregationBuilder;
import org.elasticsearch.xpack.core.LocalStateCompositeXPackPlugin;

import java.util.ArrayList;
Expand Down Expand Up @@ -131,8 +133,7 @@ public void testHDRHistogram() throws Exception {
}
}

public void testTDigestHistogram() throws Exception {

private void setupTDigestHistogram(int compression) throws Exception {
XContentBuilder xContentBuilder = XContentFactory.jsonBuilder()
.startObject()
.startObject("_doc")
Expand Down Expand Up @@ -170,8 +171,6 @@ public void testTDigestHistogram() throws Exception {
PutMappingRequest request2 = new PutMappingRequest("pre_agg").source(xContentBuilder2);
client().admin().indices().putMapping(request2).actionGet();


int compression = TestUtil.nextInt(random(), 200, 300);
TDigestState histogram = new TDigestState(compression);
BulkRequest bulkRequest = new BulkRequest();

Expand Down Expand Up @@ -218,6 +217,11 @@ public void testTDigestHistogram() throws Exception {

response = client().prepareSearch("pre_agg").get();
assertEquals(numDocs / frq, response.getHits().getTotalHits().value);
}

public void testTDigestHistogram() throws Exception {
int compression = TestUtil.nextInt(random(), 200, 300);
setupTDigestHistogram(compression);

PercentilesAggregationBuilder builder =
AggregationBuilders.percentiles("agg").field("inner.data").method(PercentilesMethod.TDIGEST)
Expand All @@ -236,6 +240,31 @@ public void testTDigestHistogram() throws Exception {
}
}

public void testBoxplotHistogram() throws Exception {
int compression = TestUtil.nextInt(random(), 200, 300);
setupTDigestHistogram(compression);
BoxplotAggregationBuilder bpBuilder = new BoxplotAggregationBuilder("agg").field("inner.data").compression(compression);

SearchResponse bpResponseRaw = client().prepareSearch("raw").addAggregation(bpBuilder).get();
SearchResponse bpResponsePreAgg = client().prepareSearch("pre_agg").addAggregation(bpBuilder).get();
SearchResponse bpResponseBoth = client().prepareSearch("raw", "pre_agg").addAggregation(bpBuilder).get();

Boxplot bpRaw = bpResponseRaw.getAggregations().get("agg");
Boxplot bpPreAgg = bpResponsePreAgg.getAggregations().get("agg");
Boxplot bpBoth = bpResponseBoth.getAggregations().get("agg");
assertEquals(bpRaw.getMax(), bpPreAgg.getMax(), 0.0);
assertEquals(bpRaw.getMax(), bpBoth.getMax(), 0.0);
assertEquals(bpRaw.getMin(), bpPreAgg.getMin(), 0.0);
assertEquals(bpRaw.getMin(), bpBoth.getMin(), 0.0);

assertEquals(bpRaw.getQ1(), bpPreAgg.getQ1(), 1.0);
assertEquals(bpRaw.getQ1(), bpBoth.getQ1(), 1.0);
assertEquals(bpRaw.getQ2(), bpPreAgg.getQ2(), 1.0);
assertEquals(bpRaw.getQ2(), bpBoth.getQ2(), 1.0);
assertEquals(bpRaw.getQ3(), bpPreAgg.getQ3(), 1.0);
assertEquals(bpRaw.getQ3(), bpBoth.getQ3(), 1.0);
}

@Override
protected Collection<Class<? extends Plugin>> getPlugins() {
List<Class<? extends Plugin>> plugins = new ArrayList<>(super.getPlugins());
Expand Down

0 comments on commit 0898df4

Please sign in to comment.