Skip to content

Commit

Permalink
Add StringStats aggregation (#4370)
Browse files Browse the repository at this point in the history
This commit adds support for the string_stats aggregation introduced in
Elasticsearch 7.6.0. It is a metric aggregation but does not implement
IMetricAggregation, because its Missing field takes a string rather than a
double value. Missing is therefore typed as object: the hierarchy of
metric aggregations will be changed as part of #4332, at which point Missing
will be an object.

The documentation for StringStats indicates that the distribution is returned
in descending probability order, but it is modelled as a JSON object. Following
internal discussion, it is exposed as a dictionary on the response, as this
modelling is not considered to diminish functionality.

Closes #4369

(cherry picked from commit cc45fb1)
  • Loading branch information
russcam committed Feb 10, 2020
1 parent a50f8a1 commit 9917ff6
Show file tree
Hide file tree
Showing 7 changed files with 312 additions and 4 deletions.
2 changes: 2 additions & 0 deletions src/Nest/Aggregations/AggregateDictionary.cs
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ public ScriptedMetricAggregate ScriptedMetric(string key)

/// <summary>
/// Looks up <paramref name="key"/> through <c>TryGet</c>, typed as a <see cref="StatsAggregate"/>.
/// </summary>
public StatsAggregate Stats(string key)
{
	return TryGet<StatsAggregate>(key);
}

/// <summary>
/// Looks up <paramref name="key"/> through <c>TryGet</c>, typed as a <see cref="StringStatsAggregate"/>.
/// </summary>
public StringStatsAggregate StringStats(string key)
{
	return TryGet<StringStatsAggregate>(key);
}

/// <summary>
/// Looks up <paramref name="key"/> through <c>TryGet</c>; a stats bucket result is typed as a
/// <see cref="StatsAggregate"/>.
/// </summary>
public StatsAggregate StatsBucket(string key)
{
	return TryGet<StatsAggregate>(key);
}

/// <summary>
/// Looks up <paramref name="key"/> through <c>TryGet</c>, typed as an <see cref="ExtendedStatsAggregate"/>.
/// </summary>
public ExtendedStatsAggregate ExtendedStats(string key)
{
	return TryGet<ExtendedStatsAggregate>(key);
}
Expand Down
61 changes: 57 additions & 4 deletions src/Nest/Aggregations/AggregateFormatter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ internal class AggregateFormatter : IJsonFormatter<IAggregate>

// Property names from Parser, encoded once to bytes by JsonWriter.GetEncodedPropertyNameWithoutQuotation.
// MinLengthField is compared (EqualsBytes) against a raw property-name segment read from the response
// to recognise a string_stats result; the other fields are presumably matched the same way.
private static readonly byte[] KeysField = JsonWriter.GetEncodedPropertyNameWithoutQuotation(Parser.Keys);
private static readonly byte[] MetaField = JsonWriter.GetEncodedPropertyNameWithoutQuotation(Parser.Meta);
private static readonly byte[] MinLengthField = JsonWriter.GetEncodedPropertyNameWithoutQuotation(Parser.MinLength);

private static readonly AutomataDictionary RootFields = new AutomataDictionary
{
Expand Down Expand Up @@ -133,7 +134,7 @@ private IAggregate ReadAggregate(ref JsonReader reader, IJsonFormatterResolver f
aggregate = GetMultiBucketAggregate(ref reader, formatterResolver, ref propertyName, meta);
break;
case 5:
aggregate = GetStatsAggregate(ref reader, meta);
aggregate = GetStatsAggregate(ref reader, formatterResolver, meta);
break;
case 6:
aggregate = GetSingleBucketAggregate(ref reader, formatterResolver, meta);
Expand Down Expand Up @@ -397,16 +398,67 @@ private IAggregate GetSingleBucketAggregate(ref JsonReader reader, IJsonFormatte
return new SingleBucketAggregate(subAggregates) { DocCount = docCount, Meta = meta };
}

private IAggregate GetStatsAggregate(ref JsonReader reader, IReadOnlyDictionary<string, object> meta)
/// <summary>
/// Reads the remainder of a <c>string_stats</c> aggregation response into a <see cref="StringStatsAggregate"/>.
/// </summary>
/// <remarks>
/// The values are read positionally, in the order <c>min_length</c>, <c>max_length</c>, <c>avg_length</c>,
/// <c>entropy</c> and, when present, <c>distribution</c>. The reader is expected to be positioned on the
/// <c>min_length</c> value when this method is called.
/// </remarks>
/// <param name="reader">The JSON reader, advanced past each value that is consumed.</param>
/// <param name="formatterResolver">Resolver used to obtain the dictionary formatter for <c>distribution</c>.</param>
/// <param name="meta">Metadata assigned to the resulting aggregate.</param>
/// <param name="count">The already-read <c>count</c> value assigned to the resulting aggregate.</param>
/// <returns>A populated <see cref="StringStatsAggregate"/>.</returns>
private IAggregate GetStringStatsAggregate(ref JsonReader reader, IJsonFormatterResolver formatterResolver,
	IReadOnlyDictionary<string, object> meta, long count
)
{
	// string stats aggregation
	// The numeric values are read null-tolerantly, mirroring the ReadNullable* reads used for stats
	// responses: a JSON null becomes 0 instead of a parse exception.
	// NOTE(review): null is expected when no values were counted - confirm against server output.
	var minLength = reader.ReadIsNull() ? 0 : reader.ReadInt32();
	reader.ReadNext(); // ,
	reader.ReadNext(); // "max_length"
	reader.ReadNext(); // :
	var maxLength = reader.ReadIsNull() ? 0 : reader.ReadInt32();
	reader.ReadNext(); // ,
	reader.ReadNext(); // "avg_length"
	reader.ReadNext(); // :
	var avgLength = reader.ReadNullableDouble().GetValueOrDefault(0);
	reader.ReadNext(); // ,
	reader.ReadNext(); // "entropy"
	reader.ReadNext(); // :
	var entropy = reader.ReadNullableDouble().GetValueOrDefault(0);

	var aggregate = new StringStatsAggregate
	{
		Meta = meta,
		Count = count,
		MinLength = minLength,
		MaxLength = maxLength,
		AverageLength = avgLength,
		Entropy = entropy
	};

	if (reader.ReadIsValueSeparator())
	{
		reader.ReadNext(); // "distribution"
		reader.ReadNext(); // :
		var distribution = formatterResolver
			.GetFormatter<IReadOnlyDictionary<string, double>>()
			.Deserialize(ref reader, formatterResolver);

		// Only overwrite the default when a dictionary was actually read, so that an absent or null
		// distribution leaves the aggregate's empty dictionary in place.
		if (distribution != null)
			aggregate.Distribution = distribution;
	}

	return aggregate;
}

private IAggregate GetStatsAggregate(ref JsonReader reader, IJsonFormatterResolver formatterResolver, IReadOnlyDictionary<string, object> meta
)
{
var count = reader.ReadNullableLong().GetValueOrDefault(0);

if (reader.GetCurrentJsonToken() == JsonToken.EndObject)
return new GeoCentroidAggregate { Count = count, Meta = meta };

reader.ReadNext(); // ,
reader.ReadNext(); // "min"
reader.ReadNext(); // :

var property = reader.ReadPropertyNameSegmentRaw();

// string stats aggregation
if (property.EqualsBytes(MinLengthField))
return GetStringStatsAggregate(ref reader, formatterResolver, meta, count);

// stats or extended stats aggregation
var min = reader.ReadNullableDouble();
reader.ReadNext(); // ,
reader.ReadNext(); // "max"
Expand Down Expand Up @@ -930,6 +982,7 @@ private static class Parser
public const string Location = "location";
public const string MaxScore = "max_score";
public const string Meta = "meta";
// Encoded into MinLengthField; GetStatsAggregate compares the property read after "count" against it
// to decide whether the response is a string_stats aggregation.
public const string MinLength = "min_length";

public const string Score = "score";

Expand Down
13 changes: 13 additions & 0 deletions src/Nest/Aggregations/AggregationContainer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,9 @@ public interface IAggregationContainer
/// <summary>
/// The median absolute deviation aggregation of this container, under the name <c>median_absolute_deviation</c>.
/// </summary>
[DataMember(Name = "median_absolute_deviation")]
IMedianAbsoluteDeviationAggregation MedianAbsoluteDeviation { get; set; }

/// <summary>
/// The string stats aggregation of this container, under the name <c>string_stats</c>.
/// </summary>
[DataMember(Name = "string_stats")]
IStringStatsAggregation StringStats { get; set; }

/// <summary>
/// Accepts <paramref name="visitor"/> for this container.
/// </summary>
void Accept(IAggregationVisitor visitor);
}

Expand Down Expand Up @@ -377,6 +380,8 @@ public class AggregationContainer : IAggregationContainer

/// <inheritdoc cref="IAggregationContainer.MedianAbsoluteDeviation"/>
public IMedianAbsoluteDeviationAggregation MedianAbsoluteDeviation { get; set; }

/// <inheritdoc cref="IAggregationContainer.StringStats"/>
public IStringStatsAggregation StringStats { get; set; }

public void Accept(IAggregationVisitor visitor)
{
if (visitor.Scope == AggregationVisitorScope.Unknown) visitor.Scope = AggregationVisitorScope.Aggregation;
Expand Down Expand Up @@ -526,6 +531,8 @@ public class AggregationContainerDescriptor<T> : DescriptorBase<AggregationConta

// Explicit interface implementations: reachable only through IAggregationContainer,
// not through the descriptor's own public surface.
IMedianAbsoluteDeviationAggregation IAggregationContainer.MedianAbsoluteDeviation { get; set; }

IStringStatsAggregation IAggregationContainer.StringStats { get; set; }

public void Accept(IAggregationVisitor visitor)
{
if (visitor.Scope == AggregationVisitorScope.Unknown) visitor.Scope = AggregationVisitorScope.Aggregation;
Expand Down Expand Up @@ -818,6 +825,12 @@ Func<MedianAbsoluteDeviationAggregationDescriptor<T>, IMedianAbsoluteDeviationAg
) =>
_SetInnerAggregation(name, selector, (a, d) => a.MedianAbsoluteDeviation = d);

/// <inheritdoc cref="IStringStatsAggregation"/>
/// <param name="name">The name passed to <c>_SetInnerAggregation</c> for this aggregation.</param>
/// <param name="selector">Configures the string stats aggregation through its descriptor.</param>
public AggregationContainerDescriptor<T> StringStats(string name,
	Func<StringStatsAggregationDescriptor<T>, IStringStatsAggregation> selector
)
{
	// The built aggregation is stored on the container's StringStats slot.
	return _SetInnerAggregation(name, selector, (container, aggregation) => container.StringStats = aggregation);
}

/// <summary>
/// Fluent methods do not assign to properties on `this` directly but on IAggregationContainers inside
/// `this.Aggregations[string, IContainer]
Expand Down
40 changes: 40 additions & 0 deletions src/Nest/Aggregations/Metric/StringStats/StringStatsAggregate.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
using System.Collections.Generic;
using Elasticsearch.Net;

namespace Nest
{
/// <summary>
/// The result of a string stats aggregation: a count, length statistics, entropy and an
/// optional per-character distribution.
/// </summary>
public class StringStatsAggregate : MetricAggregateBase
{
	/// <summary>
	/// How many non-empty fields were counted.
	/// </summary>
	public long Count { get; set; }

	/// <summary>
	/// Length of the shortest term.
	/// </summary>
	public int MinLength { get; set; }

	/// <summary>
	/// Length of the longest term.
	/// </summary>
	public int MaxLength { get; set; }

	/// <summary>
	/// Average term length, computed over all terms.
	/// </summary>
	public double AverageLength { get; set; }

	/// <summary>
	/// Shannon entropy computed over all terms collected by the aggregation. Shannon entropy
	/// quantifies the amount of information contained in the field, which makes it useful for
	/// measuring properties of a data set such as diversity, similarity and randomness.
	/// </summary>
	public double Entropy { get; set; }

	/// <summary>
	/// Probability of each character appearing across all terms. Defaults to an empty dictionary.
	/// </summary>
	public IReadOnlyDictionary<string, double> Distribution { get; set; } = EmptyReadOnly<string, double>.Dictionary;
}
}
104 changes: 104 additions & 0 deletions src/Nest/Aggregations/Metric/StringStats/StringStatsAggregation.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
using System;
using System.Collections.Generic;
using System.Linq.Expressions;
using System.Runtime.Serialization;
using Elasticsearch.Net.Utf8Json;

namespace Nest
{
/// <summary>
/// A multi-value metrics aggregation that computes statistics over string values extracted from the aggregated documents.
/// These values can be retrieved either from specific keyword fields in the documents or can be generated by a provided script.
/// <para />
/// Available in Elasticsearch 7.6.0+ with at least basic license level
/// </summary>
/// <remarks>
/// Deserialized as <see cref="StringStatsAggregation"/>, per the <c>ReadAs</c> attribute.
/// </remarks>
[InterfaceDataContract]
[ReadAs(typeof(StringStatsAggregation))]
public interface IStringStatsAggregation : IAggregation
{
/// <summary>
/// The field to perform the aggregation on. Sent as <c>field</c>.
/// </summary>
[DataMember(Name = "field")]
Field Field { get; set; }

/// <summary>
/// A value to use for documents missing a value for the field. Sent as <c>missing</c>.
/// Typed as <see cref="object"/> rather than a number, since the value for this aggregation is a string.
/// </summary>
[DataMember(Name = "missing")]
object Missing { get; set; }

/// <summary>
/// Compute the string stats based on a script. Sent as <c>script</c>.
/// </summary>
[DataMember(Name = "script")]
IScript Script { get; set; }

/// <summary>
/// Include the probability distribution for all characters in the response. Sent as <c>show_distribution</c>.
/// </summary>
[DataMember(Name = "show_distribution")]
bool? ShowDistribution { get; set; }
}

/// <inheritdoc cref="IStringStatsAggregation"/>
public class StringStatsAggregation : AggregationBase, IStringStatsAggregation
{
	// Parameterless constructor, internal to the assembly.
	internal StringStatsAggregation() { }

	/// <summary>
	/// Creates a string stats aggregation called <paramref name="name"/> over <paramref name="field"/>.
	/// </summary>
	public StringStatsAggregation(string name, Field field) : base(name)
	{
		Field = field;
	}

	/// <inheritdoc />
	public Field Field { get; set; }

	/// <inheritdoc />
	public object Missing { get; set; }

	/// <inheritdoc />
	public IScript Script { get; set; }

	/// <inheritdoc />
	public bool? ShowDistribution { get; set; }

	// Registers this instance on the container's StringStats slot.
	internal override void WrapInContainer(AggregationContainer c)
	{
		c.StringStats = this;
	}
}

/// <inheritdoc cref="IStringStatsAggregation"/>
public class StringStatsAggregationDescriptor<T>
	: DescriptorBase<StringStatsAggregationDescriptor<T>, IStringStatsAggregation>, IStringStatsAggregation
	where T : class
{
	// State is held in explicit interface implementations; the public fluent methods below write to it.
	Field IStringStatsAggregation.Field { get; set; }
	IDictionary<string, object> IAggregation.Meta { get; set; }
	object IStringStatsAggregation.Missing { get; set; }
	string IAggregation.Name { get; set; }

	IScript IStringStatsAggregation.Script { get; set; }

	bool? IStringStatsAggregation.ShowDistribution { get; set; }

	/// <inheritdoc cref="IStringStatsAggregation.Field"/>
	public StringStatsAggregationDescriptor<T> Field(Field field)
	{
		return Assign(field, (aggregation, value) => aggregation.Field = value);
	}

	/// <inheritdoc cref="IStringStatsAggregation.Field"/>
	public StringStatsAggregationDescriptor<T> Field<TValue>(Expression<Func<T, TValue>> field)
	{
		return Assign(field, (aggregation, value) => aggregation.Field = value);
	}

	/// <inheritdoc cref="IStringStatsAggregation.Script"/>
	public StringStatsAggregationDescriptor<T> Script(string script)
	{
		// The string is converted to an InlineScript before being assigned.
		return Assign((InlineScript)script, (aggregation, value) => aggregation.Script = value);
	}

	/// <inheritdoc cref="IStringStatsAggregation.Script"/>
	public StringStatsAggregationDescriptor<T> Script(Func<ScriptDescriptor, IScript> scriptSelector)
	{
		// A null selector yields a null script.
		return Assign(scriptSelector, (aggregation, value) => aggregation.Script = value?.Invoke(new ScriptDescriptor()));
	}

	/// <inheritdoc cref="IStringStatsAggregation.Missing"/>
	public StringStatsAggregationDescriptor<T> Missing(object missing)
	{
		return Assign(missing, (aggregation, value) => aggregation.Missing = value);
	}

	/// <inheritdoc cref="IAggregation.Meta"/>
	public StringStatsAggregationDescriptor<T> Meta(Func<FluentDictionary<string, object>, FluentDictionary<string, object>> selector)
	{
		// A null selector yields null metadata.
		return Assign(selector, (aggregation, value) => aggregation.Meta = value?.Invoke(new FluentDictionary<string, object>()));
	}

	/// <inheritdoc cref="IStringStatsAggregation.ShowDistribution"/>
	public StringStatsAggregationDescriptor<T> ShowDistribution(bool? showDistribution = true)
	{
		return Assign(showDistribution, (aggregation, value) => aggregation.ShowDistribution = value);
	}
}
}
4 changes: 4 additions & 0 deletions src/Nest/Aggregations/Visitor/AggregationVisitor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,8 @@ public interface IAggregationVisitor
/// <summary>
/// Visits a composite aggregation.
/// </summary>
void Visit(ICompositeAggregation aggregation);

/// <summary>
/// Visits a median absolute deviation aggregation.
/// </summary>
void Visit(IMedianAbsoluteDeviationAggregation aggregation);

/// <summary>
/// Visits a string stats aggregation.
/// </summary>
void Visit(IStringStatsAggregation aggregation);
}

public class AggregationVisitor : IAggregationVisitor
Expand Down Expand Up @@ -239,6 +241,8 @@ public virtual void Visit(ICompositeAggregation aggregation) { }

// The Visit overloads below are no-ops; each is virtual, so a derived visitor can override
// only the overloads it needs.
public virtual void Visit(IMedianAbsoluteDeviationAggregation aggregation) { }

public virtual void Visit(IStringStatsAggregation aggregation) { }

public virtual void Visit(IAggregation aggregation) { }

public virtual void Visit(IAggregationContainer aggregationContainer) { }
Expand Down
Loading

0 comments on commit 9917ff6

Please sign in to comment.