From 5c6cc23d352f291f216cdb4fd58da9cf53f7ea1e Mon Sep 17 00:00:00 2001 From: Russ Cam Date: Thu, 6 Feb 2020 11:38:15 +1000 Subject: [PATCH] Add StringStats aggregation This commit adds support for the string_stats aggregation introduced in Elasticsearch 7.6.0. It is a metric aggregation but does not implement IMetricAggregation because the type of the Missing field is a string and not a double value. Missing is implemented as object as the hierarchy of metric aggregations will be changed as part of #4332, and Missing will be an object. The documentation for StringStats indicates that the distribution is returned in descending probability order, but it is modelled as a JSON object. Following internal discussion, this is modelled as a dictionary on the response as it is considered this modelling will not diminish functionality. Closes #4369 --- src/Nest/Aggregations/AggregateDictionary.cs | 2 + src/Nest/Aggregations/AggregateFormatter.cs | 61 +++++++++- src/Nest/Aggregations/AggregationContainer.cs | 13 +++ .../StringStats/StringStatsAggregate.cs | 40 +++++++ .../StringStats/StringStatsAggregation.cs | 104 ++++++++++++++++++ .../Visitor/AggregationVisitor.cs | 4 + .../StringStatsAggregationUsageTests.cs | 92 ++++++++++++++++ 7 files changed, 312 insertions(+), 4 deletions(-) create mode 100644 src/Nest/Aggregations/Metric/StringStats/StringStatsAggregate.cs create mode 100644 src/Nest/Aggregations/Metric/StringStats/StringStatsAggregation.cs create mode 100644 tests/Tests/Aggregations/Metric/StringStats/StringStatsAggregationUsageTests.cs diff --git a/src/Nest/Aggregations/AggregateDictionary.cs b/src/Nest/Aggregations/AggregateDictionary.cs index 4ce1fc131b..defa6e149c 100644 --- a/src/Nest/Aggregations/AggregateDictionary.cs +++ b/src/Nest/Aggregations/AggregateDictionary.cs @@ -78,6 +78,8 @@ public ScriptedMetricAggregate ScriptedMetric(string key) public StatsAggregate Stats(string key) => TryGet(key); + public StringStatsAggregate StringStats(string key) => 
TryGet(key); + public StatsAggregate StatsBucket(string key) => TryGet(key); public ExtendedStatsAggregate ExtendedStats(string key) => TryGet(key); diff --git a/src/Nest/Aggregations/AggregateFormatter.cs b/src/Nest/Aggregations/AggregateFormatter.cs index d5d1bbfa72..406802df35 100644 --- a/src/Nest/Aggregations/AggregateFormatter.cs +++ b/src/Nest/Aggregations/AggregateFormatter.cs @@ -36,6 +36,7 @@ internal class AggregateFormatter : IJsonFormatter private static readonly byte[] KeysField = JsonWriter.GetEncodedPropertyNameWithoutQuotation(Parser.Keys); private static readonly byte[] MetaField = JsonWriter.GetEncodedPropertyNameWithoutQuotation(Parser.Meta); + private static readonly byte[] MinLengthField = JsonWriter.GetEncodedPropertyNameWithoutQuotation(Parser.MinLength); private static readonly AutomataDictionary RootFields = new AutomataDictionary { @@ -133,7 +134,7 @@ private IAggregate ReadAggregate(ref JsonReader reader, IJsonFormatterResolver f aggregate = GetMultiBucketAggregate(ref reader, formatterResolver, ref propertyName, meta); break; case 5: - aggregate = GetStatsAggregate(ref reader, meta); + aggregate = GetStatsAggregate(ref reader, formatterResolver, meta); break; case 6: aggregate = GetSingleBucketAggregate(ref reader, formatterResolver, meta); @@ -397,7 +398,52 @@ private IAggregate GetSingleBucketAggregate(ref JsonReader reader, IJsonFormatte return new SingleBucketAggregate(subAggregates) { DocCount = docCount, Meta = meta }; } - private IAggregate GetStatsAggregate(ref JsonReader reader, IReadOnlyDictionary meta) + private IAggregate GetStringStatsAggregate(ref JsonReader reader, IJsonFormatterResolver formatterResolver, + IReadOnlyDictionary meta, long count + ) + { + // string stats aggregation + var minLength = reader.ReadInt32(); + reader.ReadNext(); // , + reader.ReadNext(); // "max_length" + reader.ReadNext(); // : + var maxLength = reader.ReadInt32(); + reader.ReadNext(); // , + reader.ReadNext(); // "avg_length" + 
reader.ReadNext(); // : + var avgLength = reader.ReadDouble(); + reader.ReadNext(); // , + reader.ReadNext(); // "entropy" + reader.ReadNext(); // : + var entropy = reader.ReadDouble(); + + var aggregate = new StringStatsAggregate + { + Meta = meta, + Count = count, + MinLength = minLength, + MaxLength = maxLength, + AverageLength = avgLength, + Entropy = entropy + }; + + if (reader.ReadIsValueSeparator()) + { + reader.ReadNext(); // "distribution" + reader.ReadNext(); // : + var distribution = formatterResolver + .GetFormatter>() + .Deserialize(ref reader, formatterResolver); + + // only set distribution if present, leaving empty dictionary when absent + aggregate.Distribution = distribution; + } + + return aggregate; + } + + private IAggregate GetStatsAggregate(ref JsonReader reader, IJsonFormatterResolver formatterResolver, IReadOnlyDictionary meta + ) { var count = reader.ReadNullableLong().GetValueOrDefault(0); @@ -405,8 +451,14 @@ private IAggregate GetStatsAggregate(ref JsonReader reader, IReadOnlyDictionary< return new GeoCentroidAggregate { Count = count, Meta = meta }; reader.ReadNext(); // , - reader.ReadNext(); // "min" - reader.ReadNext(); // : + + var property = reader.ReadPropertyNameSegmentRaw(); + + // string stats aggregation + if (property.EqualsBytes(MinLengthField)) + return GetStringStatsAggregate(ref reader, formatterResolver, meta, count); + + // stats or extended stats aggregation var min = reader.ReadNullableDouble(); reader.ReadNext(); // , reader.ReadNext(); // "max" @@ -930,6 +982,7 @@ private static class Parser public const string Location = "location"; public const string MaxScore = "max_score"; public const string Meta = "meta"; + public const string MinLength = "min_length"; public const string Score = "score"; diff --git a/src/Nest/Aggregations/AggregationContainer.cs b/src/Nest/Aggregations/AggregationContainer.cs index b9b68496bd..e342a76c42 100644 --- a/src/Nest/Aggregations/AggregationContainer.cs +++ 
b/src/Nest/Aggregations/AggregationContainer.cs @@ -259,6 +259,9 @@ public interface IAggregationContainer [DataMember(Name = "median_absolute_deviation")] IMedianAbsoluteDeviationAggregation MedianAbsoluteDeviation { get; set; } + [DataMember(Name = "string_stats")] + IStringStatsAggregation StringStats { get; set; } + void Accept(IAggregationVisitor visitor); } @@ -377,6 +380,8 @@ public class AggregationContainer : IAggregationContainer public IMedianAbsoluteDeviationAggregation MedianAbsoluteDeviation { get; set; } + public IStringStatsAggregation StringStats { get; set; } + public void Accept(IAggregationVisitor visitor) { if (visitor.Scope == AggregationVisitorScope.Unknown) visitor.Scope = AggregationVisitorScope.Aggregation; @@ -526,6 +531,8 @@ public class AggregationContainerDescriptor : DescriptorBase _SetInnerAggregation(name, selector, (a, d) => a.MedianAbsoluteDeviation = d); + /// + public AggregationContainerDescriptor StringStats(string name, + Func, IStringStatsAggregation> selector + ) => + _SetInnerAggregation(name, selector, (a, d) => a.StringStats = d); + /// /// Fluent methods do not assign to properties on `this` directly but on IAggregationContainers inside /// `this.Aggregations[string, IContainer] diff --git a/src/Nest/Aggregations/Metric/StringStats/StringStatsAggregate.cs b/src/Nest/Aggregations/Metric/StringStats/StringStatsAggregate.cs new file mode 100644 index 0000000000..f1387d4872 --- /dev/null +++ b/src/Nest/Aggregations/Metric/StringStats/StringStatsAggregate.cs @@ -0,0 +1,40 @@ +using System.Collections.Generic; +using Elasticsearch.Net; + +namespace Nest +{ + public class StringStatsAggregate : MetricAggregateBase + { + /// + /// The average length computed over all terms. + /// + public double AverageLength { get; set; } + + /// + /// The number of non-empty fields counted. + /// + public long Count { get; set; } + + /// + /// The length of the longest term. 
+ /// + public int MaxLength { get; set; } + + /// + /// The length of the shortest term. + /// + public int MinLength { get; set; } + + /// + /// The Shannon Entropy value computed over all terms collected by the aggregation. + /// Shannon entropy quantifies the amount of information contained in the field. + /// It is a very useful metric for measuring a wide range of properties of a data set, such as diversity, similarity, randomness etc. + /// + public double Entropy { get; set; } + + /// + /// The probability of each character appearing in all terms. + /// + public IReadOnlyDictionary Distribution { get; set; } = EmptyReadOnly.Dictionary; + } +} diff --git a/src/Nest/Aggregations/Metric/StringStats/StringStatsAggregation.cs b/src/Nest/Aggregations/Metric/StringStats/StringStatsAggregation.cs new file mode 100644 index 0000000000..1d4661b451 --- /dev/null +++ b/src/Nest/Aggregations/Metric/StringStats/StringStatsAggregation.cs @@ -0,0 +1,104 @@ +using System; +using System.Collections.Generic; +using System.Linq.Expressions; +using System.Runtime.Serialization; +using Elasticsearch.Net.Utf8Json; + +namespace Nest +{ + /// + /// A multi-value metrics aggregation that computes statistics over string values extracted from the aggregated documents. + /// These values can be retrieved either from specific keyword fields in the documents or can be generated by a provided script. 
+ /// + /// Available in Elasticsearch 7.6.0+ with at least basic license level + /// + [InterfaceDataContract] + [ReadAs(typeof(StringStatsAggregation))] + public interface IStringStatsAggregation : IAggregation + { + /// + /// The field to perform the aggregation on + /// + [DataMember(Name = "field")] + Field Field { get; set; } + + /// + /// A value to use for documents missing a value for the field + /// + [DataMember(Name = "missing")] + object Missing { get; set; } + + /// + /// Compute the string stats based on a script + /// + [DataMember(Name = "script")] + IScript Script { get; set; } + + /// + /// Include the probability distribution for all characters in the response. + /// + [DataMember(Name = "show_distribution")] + bool? ShowDistribution { get; set; } + } + + /// + public class StringStatsAggregation : AggregationBase, IStringStatsAggregation + { + internal StringStatsAggregation() { } + + public StringStatsAggregation(string name, Field field) : base(name) => Field = field; + + internal override void WrapInContainer(AggregationContainer c) => c.StringStats = this; + + /// + public Field Field { get; set; } + + /// + public object Missing { get; set; } + + /// + public IScript Script { get; set; } + + /// + public bool? ShowDistribution { get; set; } + } + + /// + public class StringStatsAggregationDescriptor + : DescriptorBase, IStringStatsAggregation>, IStringStatsAggregation + where T : class + { + Field IStringStatsAggregation.Field { get; set; } + IDictionary IAggregation.Meta { get; set; } + object IStringStatsAggregation.Missing { get; set; } + string IAggregation.Name { get; set; } + + IScript IStringStatsAggregation.Script { get; set; } + + bool? 
IStringStatsAggregation.ShowDistribution { get; set; } + + /// + public StringStatsAggregationDescriptor Field(Field field) => Assign(field, (a, v) => a.Field = v); + + /// + public StringStatsAggregationDescriptor Field(Expression> field) => Assign(field, (a, v) => a.Field = v); + + /// + public StringStatsAggregationDescriptor Script(string script) => Assign((InlineScript)script, (a, v) => a.Script = v); + + /// + public StringStatsAggregationDescriptor Script(Func scriptSelector) => + Assign(scriptSelector, (a, v) => a.Script = v?.Invoke(new ScriptDescriptor())); + + /// + public StringStatsAggregationDescriptor Missing(object missing) => Assign(missing, (a, v) => a.Missing = v); + + /// + public StringStatsAggregationDescriptor Meta(Func, FluentDictionary> selector) => + Assign(selector, (a, v) => a.Meta = v?.Invoke(new FluentDictionary())); + + /// + public StringStatsAggregationDescriptor ShowDistribution(bool? showDistribution = true) => + Assign(showDistribution, (a, v) => a.ShowDistribution = v); + } +} diff --git a/src/Nest/Aggregations/Visitor/AggregationVisitor.cs b/src/Nest/Aggregations/Visitor/AggregationVisitor.cs index d6e2bdc202..83a49e95b8 100644 --- a/src/Nest/Aggregations/Visitor/AggregationVisitor.cs +++ b/src/Nest/Aggregations/Visitor/AggregationVisitor.cs @@ -127,6 +127,8 @@ public interface IAggregationVisitor void Visit(ICompositeAggregation aggregation); void Visit(IMedianAbsoluteDeviationAggregation aggregation); + + void Visit(IStringStatsAggregation aggregation); } public class AggregationVisitor : IAggregationVisitor @@ -239,6 +241,8 @@ public class AggregationVisitor : IAggregationVisitor public virtual void Visit(IMedianAbsoluteDeviationAggregation aggregation) { } + public virtual void Visit(IStringStatsAggregation aggregation) { } + public virtual void Visit(IAggregation aggregation) { } public virtual void Visit(IAggregationContainer aggregationContainer) { } diff --git 
a/tests/Tests/Aggregations/Metric/StringStats/StringStatsAggregationUsageTests.cs b/tests/Tests/Aggregations/Metric/StringStats/StringStatsAggregationUsageTests.cs new file mode 100644 index 0000000000..1b5b6f46d7 --- /dev/null +++ b/tests/Tests/Aggregations/Metric/StringStats/StringStatsAggregationUsageTests.cs @@ -0,0 +1,92 @@ +using System; +using Elastic.Xunit.XunitPlumbing; +using FluentAssertions; +using Nest; +using Tests.Core.Extensions; +using Tests.Core.ManagedElasticsearch.Clusters; +using Tests.Domain; +using Tests.Framework.EndpointTests.TestState; +using static Nest.Infer; + +namespace Tests.Aggregations.Metric.StringStats +{ + [SkipVersion("<7.6.0", "Available in 7.6.0 with at least basic license level")] + public class StringStatsAggregationUsageTests : AggregationUsageTestBase + { + public StringStatsAggregationUsageTests(ReadOnlyCluster i, EndpointUsage usage) : base(i, usage) { } + + protected override object AggregationJson => new + { + name_stats = new + { + string_stats = new + { + field = "name" + } + } + }; + + protected override Func, IAggregationContainer> FluentAggs => a => a + .StringStats("name_stats", st => st + .Field(p => p.Name) + ); + + protected override AggregationDictionary InitializerAggs => + new StringStatsAggregation("name_stats", Field(p => p.Name)); + + protected override void ExpectResponse(ISearchResponse response) + { + response.ShouldBeValid(); + var commitStats = response.Aggregations.StringStats("name_stats"); + commitStats.Should().NotBeNull(); + commitStats.AverageLength.Should().BeGreaterThan(0); + commitStats.MaxLength.Should().BeGreaterThan(0); + commitStats.MinLength.Should().BeGreaterThan(0); + commitStats.Count.Should().BeGreaterThan(0); + commitStats.Distribution.Should().NotBeNull().And.BeEmpty(); + } + } + + // hide + [SkipVersion("<7.6.0", "Available in 7.6.0 with at least basic license level")] + public class StringStatsWithDistributionAggregationUsageTests : AggregationUsageTestBase + { + public 
StringStatsWithDistributionAggregationUsageTests(ReadOnlyCluster i, EndpointUsage usage) : base(i, usage) { } + + protected override object AggregationJson => new + { + name_stats = new + { + string_stats = new + { + field = "name", + show_distribution = true + } + } + }; + + protected override Func, IAggregationContainer> FluentAggs => a => a + .StringStats("name_stats", st => st + .Field(p => p.Name) + .ShowDistribution() + ); + + protected override AggregationDictionary InitializerAggs => + new StringStatsAggregation("name_stats", Field(p => p.Name)) + { + ShowDistribution = true + }; + + protected override void ExpectResponse(ISearchResponse response) + { + response.ShouldBeValid(); + var commitStats = response.Aggregations.StringStats("name_stats"); + commitStats.Should().NotBeNull(); + commitStats.AverageLength.Should().BeGreaterThan(0); + commitStats.MaxLength.Should().BeGreaterThan(0); + commitStats.MinLength.Should().BeGreaterThan(0); + commitStats.Count.Should().BeGreaterThan(0); + commitStats.Distribution.Should().NotBeNull().And.NotBeEmpty(); + } + } +}