Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 6 additions & 8 deletions src/explorer.api/Explorers/BoolColumnExplorer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -27,32 +27,30 @@ public override async Task Explore(CancellationToken cancellationToken)
new DistinctColumnValues(TableName, ColumnName),
cancellationToken);

var (totalValueCount, suppressedValueCount) = distinctValuesQ.ResultRows.CountTotalAndSuppressed();
var counts = distinctValuesQ.ResultRows.CountTotalAndSuppressed();

PublishMetric(new UntypedMetric(name: "distinct.suppressed_count", metric: suppressedValueCount));
PublishMetric(new UntypedMetric(name: "distinct.suppressed_count", metric: counts.SuppressedCount));

// This shouldn't happen, but check anyway.
if (totalValueCount == 0)
if (counts.TotalCount == 0)
{
throw new Exception(
$"Total value count for {TableName}, {ColumnName} is zero.");
}

PublishMetric(new UntypedMetric(name: "distinct.total_count", metric: totalValueCount));

var suppressedValueRatio = (double)suppressedValueCount / totalValueCount;
PublishMetric(new UntypedMetric(name: "distinct.total_count", metric: counts.TotalCount));

var distinctValueCounts =
from row in distinctValuesQ.ResultRows
where !row.DistinctData.IsSuppressed
where row.DistinctData.HasValue
orderby row.Count descending
select new
{
row.DistinctData.Value,
row.Count,
};

PublishMetric(new UntypedMetric(name: "distinct.values", metric: distinctValueCounts));
PublishMetric(new UntypedMetric(name: "distinct.top_values", metric: distinctValueCounts.Take(10)));
}
}
}
12 changes: 5 additions & 7 deletions src/explorer.api/Explorers/CategoricalColumnExplorer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -27,24 +27,22 @@ public override async Task Explore(CancellationToken cancellationToken)
new DistinctColumnValues(TableName, ColumnName),
cancellationToken);

var (totalValueCount, suppressedValueCount) = distinctValuesQ.ResultRows.CountTotalAndSuppressed();
var counts = distinctValuesQ.ResultRows.CountTotalAndSuppressed();

PublishMetric(new UntypedMetric(name: "distinct.suppressed_count", metric: suppressedValueCount));
PublishMetric(new UntypedMetric(name: "distinct.suppressed_count", metric: counts.SuppressedCount));

// This shouldn't happen, but check anyway.
if (totalValueCount == 0)
if (counts.TotalCount == 0)
{
throw new Exception(
$"Total value count for {TableName}, {ColumnName} is zero.");
}

PublishMetric(new UntypedMetric(name: "distinct.total_count", metric: totalValueCount));

var suppressedValueRatio = (double)suppressedValueCount / totalValueCount;
PublishMetric(new UntypedMetric(name: "distinct.total_count", metric: counts.TotalCount));

var distinctValueCounts =
from row in distinctValuesQ.ResultRows
where !row.DistinctData.IsSuppressed && !row.IsNull
where row.DistinctData.HasValue
orderby row.Count descending
select new
{
Expand Down
16 changes: 6 additions & 10 deletions src/explorer.api/Explorers/DatetimeColumnExplorer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -146,17 +146,15 @@ private void ProcessLinearBuckets(
var valueCounts = group
.Select(row => new AircloakValueCount<DateTime>(row.GroupingValue, row.Count, row.CountNoise));

var (totalCount, suppressedCount) = valueCounts.CountTotalAndSuppressed();
var counts = valueCounts.CountTotalAndSuppressed();

var suppressedRatio = (double)suppressedCount / totalCount;

if (suppressedRatio > SuppressedRatioThreshold)
if (counts.SuppressedCountRatio > SuppressedRatioThreshold)
{
break;
}

PublishMetric(new UntypedMetric(name: $"dates_linear.{label}", metric: DatetimeMetric(
totalCount, suppressedCount, valueCounts)));
counts.TotalCount, counts.SuppressedCount, valueCounts)));
}
}

Expand Down Expand Up @@ -189,17 +187,15 @@ private void ProcessCyclicalBuckets(
var valueCounts = group
.Select(row => new AircloakValueCount<int>(row.GroupingValue, row.Count, row.CountNoise));

var (totalCount, suppressedCount) = valueCounts.CountTotalAndSuppressed();

var suppressedRatio = (double)suppressedCount / totalCount;
var counts = valueCounts.CountTotalAndSuppressed();

if (suppressedRatio > SuppressedRatioThreshold)
if (counts.SuppressedCountRatio > SuppressedRatioThreshold)
{
break;
}

PublishMetric(new UntypedMetric(name: $"dates_cyclical.{label}", metric: DatetimeMetric(
totalCount, suppressedCount, valueCounts)));
counts.TotalCount, counts.SuppressedCount, valueCounts)));
}
}

Expand Down
4 changes: 2 additions & 2 deletions src/explorer.api/Explorers/EmailColumnExplorer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@ public override async Task Explore(CancellationToken cancellationToken)
new TextColumnTrim(TableName, ColumnName, TextColumnTrimType.Both, EmailAddressChars),
cancellationToken);

var (totalValueCount, suppressedValueCount) = emailCheckQ.ResultRows.CountTotalAndSuppressed();
var counts = emailCheckQ.ResultRows.CountTotalAndSuppressed();

var isEmail = totalValueCount == emailCheckQ.ResultRows
var isEmail = counts.TotalCount == emailCheckQ.ResultRows
.Where(r => r.TrimmedText == "@" || r.IsNull)
.Sum(r => r.Count);

Expand Down
78 changes: 71 additions & 7 deletions src/explorer.api/Explorers/TextColumnExplorer.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
namespace Explorer
{
using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading;
using System.Threading.Tasks;
Expand All @@ -10,6 +11,8 @@ namespace Explorer

internal class TextColumnExplorer : ExplorerBase
{
private const double SuppressedRatioThreshold = 0.1;

public TextColumnExplorer(IQueryResolver queryResolver, string tableName, string columnName)
: base(queryResolver)
{
Expand All @@ -27,24 +30,22 @@ public override async Task Explore(CancellationToken cancellationToken)
new DistinctColumnValues(TableName, ColumnName),
cancellationToken);

var (totalValueCount, suppressedValueCount) = distinctValuesQ.ResultRows.CountTotalAndSuppressed();
var counts = distinctValuesQ.ResultRows.CountTotalAndSuppressed();

PublishMetric(new UntypedMetric(name: "distinct.suppressed_count", metric: suppressedValueCount));
PublishMetric(new UntypedMetric(name: "distinct.suppressed_count", metric: counts.SuppressedCount));

// This shouldn't happen, but check anyway.
if (totalValueCount == 0)
if (counts.TotalCount == 0)
{
throw new Exception(
$"Total value count for {TableName}, {ColumnName} is zero.");
}

PublishMetric(new UntypedMetric(name: "distinct.total_count", metric: totalValueCount));

var suppressedValueRatio = (double)suppressedValueCount / totalValueCount;
PublishMetric(new UntypedMetric(name: "distinct.total_count", metric: counts.TotalCount));

var distinctValueCounts =
from row in distinctValuesQ.ResultRows
where !row.DistinctData.IsSuppressed && !row.DistinctData.IsNull
where row.DistinctData.HasValue
orderby row.Count descending
select new
{
Expand All @@ -53,6 +54,69 @@ orderby row.Count descending
};

PublishMetric(new UntypedMetric(name: "distinct.top_values", metric: distinctValueCounts.Take(10)));

if (counts.SuppressedCountRatio >= SuppressedRatioThreshold)
{
// we compute the common prefixes only if the row is not categorical
await ExplorePrefixes(cancellationToken);
}
}

/// <summary>
/// Queries the column for prefixes of increasing length (1, 2, 3, ...) and collects, for each
/// length, the prefixes whose count is above the average count of the non-suppressed rows.
/// The collected prefixes are published as the "text.prefixes" metric.
/// </summary>
/// <param name="cancellationToken">Token forwarded to every query that is resolved.</param>
/// <returns>
/// The collected prefixes, ordered by prefix length (ascending) and then by frequency (descending).
/// </returns>
private async Task<IEnumerable<Prefix>> ExplorePrefixes(CancellationToken cancellationToken)
{
    var allPrefixes = new List<Prefix>();

    for (var length = 1; ; length++)
    {
        var prefixesQ = await ResolveQuery<TextColumnPrefix.Result>(
            new TextColumnPrefix(TableName, ColumnName, length),
            cancellationToken);

        var counts = prefixesQ.ResultRows.CountTotalAndSuppressed();

        // Floating-point division: when there are no non-suppressed rows this is NaN, every
        // "row.Count > avgCount" comparison below is false, and the loop ends.
        var avgCount = (double)counts.NonSuppressedCount / counts.NonSuppressedRows;

        // Materialise once: the result is inspected up to three times below, and a deferred
        // query would re-run the filter and the sort on every one of those passes.
        var prefixes = (
            from row in prefixesQ.ResultRows
            where row.HasValue && row.Count > avgCount
            let frequency = (double)row.Count / counts.NonSuppressedCount
            orderby frequency descending
            select new Prefix(row.Prefix, frequency)).ToList();

        // Stop as soon as no prefix of this length stands out from the average.
        if (prefixes.Count == 0)
        {
            break;
        }

        // Stop when even the longest returned prefix is shorter than the requested length.
        if (length > prefixes.Max(p => p.Value.Length))
        {
            break;
        }

        allPrefixes.AddRange(prefixes);
    }

    // Sort once so that the published metric and the returned sequence share one snapshot.
    var ret = (
        from row in allPrefixes
        orderby row.Value.Length ascending, row.Frequency descending
        select row).ToList();

    PublishMetric(new UntypedMetric(name: "text.prefixes", metric: ret));

    return ret;
}

/// <summary>
/// Immutable pair of a prefix string and the frequency computed for it.
/// </summary>
private readonly struct Prefix
{
    /// <summary>
    /// Initializes a new instance of the <see cref="Prefix"/> struct.
    /// </summary>
    /// <param name="value">The prefix text.</param>
    /// <param name="frequency">The frequency associated with the prefix.</param>
    public Prefix(string value, double frequency)
    {
        Value = value;
        Frequency = frequency;
    }

    /// <summary>Gets the prefix text.</summary>
    public string Value { get; }

    /// <summary>Gets the frequency that was passed to the constructor.</summary>
    public double Frequency { get; }
}
}
}
47 changes: 41 additions & 6 deletions src/explorer.api/Extensions/DiffixExtensions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,47 @@ namespace Explorer.Diffix.Extensions

internal static class DiffixExtensions
{
public static (long Total, long Suppressed) CountTotalAndSuppressed<T>(this IEnumerable<T> valueCounts)
where T : ICountAggregate, INullable, ISuppressible
public static CountResultType CountTotalAndSuppressed<T>(this IEnumerable<T> valueCounts)
where T : ICountAggregate, ISuppressible
=> valueCounts.Aggregate(
(0L, 0L),
(acc, next) => (
acc.Item1 + next.Count,
acc.Item2 + (next.IsSuppressed ? next.Count : 0L)));
default(CountResultType),
(acc, row) => new CountResultType(acc, row.Count, row.IsSuppressed));
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A constructor that exists merely to increment the count isn't very intuitive. And it seems wasteful to create a new value at every iteration of the loop? (maybe it doesn't matter since it's a value type?)

Would it be better / more idiomatic here to have a member function that updates the counter in-place? Something like:

public CountResultType Accumulate<T>(T row)
where T : ICountAggregate, ISuppressible
=> new CountResultType
{
    TotalCount = TotalCount + row.Count,
    TotalRows = TotalRows + 1,
    SuppressedCount = SuppressedCount + (row.IsSuppressed ? row.Count : 0),
    SuppressedRows = SuppressedRows + (row.IsSuppressed ? 1 : 0), 
};

Then we could write

(acc, row) => acc.Accumulate(row)

Or otherwise make the member function static:

(acc, row) => CountResultType.Accumulate(acc, row)

What do you think?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, I will try to fix it in a separate PR.

}

#pragma warning disable CA1815 // Struct type should override Equals
#pragma warning disable SA1201 // A struct should not follow a class
/// <summary>
/// Immutable running totals over a sequence of count rows: the summed counts and the number of
/// rows, each tracked both overall and for the suppressed rows only.
/// The default value represents "no rows seen yet" (all counters zero).
/// </summary>
public readonly struct CountResultType
{
    /// <summary>
    /// Initializes a new instance of the <see cref="CountResultType"/> struct holding the totals
    /// of <paramref name="cr"/> extended by one further row.
    /// </summary>
    /// <param name="cr">The totals accumulated so far.</param>
    /// <param name="count">The count carried by the row being added.</param>
    /// <param name="isSuppressed">Whether the row being added is suppressed.</param>
    public CountResultType(CountResultType cr, long count, bool isSuppressed)
    {
        TotalCount = cr.TotalCount + count;
        TotalRows = cr.TotalRows + 1;

        // A suppressed row contributes to the suppressed counters as well as to the totals.
        SuppressedCount = cr.SuppressedCount + (isSuppressed ? count : 0L);
        SuppressedRows = cr.SuppressedRows + (isSuppressed ? 1L : 0L);
    }

    /// <summary>Gets the sum of the counts of all rows.</summary>
    public long TotalCount { get; }

    /// <summary>Gets the sum of the counts of the suppressed rows.</summary>
    public long SuppressedCount { get; }

    /// <summary>Gets the number of rows.</summary>
    public long TotalRows { get; }

    /// <summary>Gets the number of suppressed rows.</summary>
    public long SuppressedRows { get; }

    /// <summary>Gets the number of rows that are not suppressed.</summary>
    public long NonSuppressedRows => TotalRows - SuppressedRows;

    /// <summary>Gets the sum of the counts of the rows that are not suppressed.</summary>
    public long NonSuppressedCount => TotalCount - SuppressedCount;

    /// <summary>
    /// Gets the fraction of the total count that belongs to suppressed rows.
    /// Returns 0 when the total count is zero, instead of the NaN a plain division would give.
    /// </summary>
    public double SuppressedCountRatio =>
        TotalCount == 0 ? 0.0 : (double)SuppressedCount / TotalCount;
}
#pragma warning restore CA1815 // Struct type should override Equals
#pragma warning restore SA1201 // A struct should not follow a class
}
64 changes: 64 additions & 0 deletions src/explorer.api/Queries/TextColumnPrefix.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
namespace Explorer.Queries
{
using System.Linq;
using System.Text.Json;

using Aircloak.JsonApi;
using Aircloak.JsonApi.JsonConversion;
using Aircloak.JsonApi.ResponseTypes;

using Explorer.Diffix.Interfaces;

/// <summary>
/// Query specification that groups a text column by its leading <c>length</c> characters and
/// returns, per prefix, the row count and the count noise. Values shorter than the requested
/// length are excluded by the <c>having</c> clause of the statement.
/// </summary>
internal class TextColumnPrefix :
    IQuerySpec<TextColumnPrefix.Result>
{
    /// <summary>
    /// Initializes a new instance of the <see cref="TextColumnPrefix"/> class.
    /// </summary>
    /// <param name="tableName">Name of the table to query.</param>
    /// <param name="columnName">Name of the text column whose prefixes are extracted.</param>
    /// <param name="length">Prefix length in characters; must be at least 1.</param>
    /// <exception cref="System.ArgumentOutOfRangeException">
    /// Thrown when <paramref name="length"/> is less than 1.
    /// </exception>
    public TextColumnPrefix(string tableName, string columnName, int length)
    {
        // A non-positive length cannot describe a prefix, so fail early rather than
        // sending a meaningless left(...) query.
        if (length < 1)
        {
            throw new System.ArgumentOutOfRangeException(
                nameof(length), length, "Prefix length must be at least 1.");
        }

        TableName = tableName;
        ColumnName = columnName;
        Length = length;
    }

    /// <summary>
    /// Gets the query text. The table and column names are interpolated verbatim.
    /// NOTE(review): presumably these names never come from untrusted input - confirm at the call sites.
    /// </summary>
    public string QueryStatement => $@"
select
left({ColumnName}, {Length}),
count(*),
count_noise(*)
from {TableName}
group by 1
having length(left({ColumnName}, {Length})) = {Length}";

    private string TableName { get; }

    private string ColumnName { get; }

    private int Length { get; }

    /// <summary>Builds one <see cref="Result"/> from the given JSON reader.</summary>
    /// <param name="reader">Reader that is handed on to the <see cref="Result"/> constructor.</param>
    /// <returns>The parsed result row.</returns>
    public Result FromJsonArray(ref Utf8JsonReader reader) => new Result(ref reader);

    /// <summary>
    /// One result row: the prefix value, its count and the count noise.
    /// </summary>
    public class Result : ICountAggregate, INullable, ISuppressible
    {
        private readonly AircloakValue<string> prefixColumn;

        /// <summary>
        /// Initializes a new instance of the <see cref="Result"/> class by reading, in this
        /// order, the prefix value, the count and the count noise from the reader.
        /// </summary>
        /// <param name="reader">Reader supplying the three values.</param>
        public Result(ref Utf8JsonReader reader)
        {
            prefixColumn = reader.ParseAircloakResultValue<string>();
            Count = reader.ParseCount();
            CountNoise = reader.ParseCountNoise();
        }

        /// <summary>Gets the prefix text, or an empty string when the column holds no value.</summary>
        public string Prefix => prefixColumn.HasValue ? prefixColumn.Value : string.Empty;

        /// <summary>Gets or sets the count parsed for this row.</summary>
        public long Count { get; set; }

        /// <summary>Gets or sets the count noise parsed for this row, if any.</summary>
        public double? CountNoise { get; set; }

        /// <summary>Gets a value indicating whether the prefix column reports itself as null.</summary>
        public bool IsNull => prefixColumn.IsNull;

        /// <summary>Gets a value indicating whether the prefix column reports itself as suppressed.</summary>
        public bool IsSuppressed => prefixColumn.IsSuppressed;

        /// <summary>Gets a value indicating whether the prefix column holds a value.</summary>
        public bool HasValue => prefixColumn.HasValue;
    }
}
}
8 changes: 4 additions & 4 deletions tests/explorer.api.tests/TextColumnTrimTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,9 @@ public async void TestEmailPositive()
TestDataSource,
nameof(TextColumnTrimTests));

var (totalValueCount, suppressedValueCount) = query.ResultRows.CountTotalAndSuppressed();
var counts = query.ResultRows.CountTotalAndSuppressed();

var isEmail = totalValueCount == query.ResultRows
var isEmail = counts.TotalCount == query.ResultRows
.Where(r => r.TrimmedText == "@")
.Sum(r => r.Count);

Expand All @@ -42,9 +42,9 @@ public async void TestEmailNegative()
TestDataSource,
nameof(TextColumnTrimTests));

var (totalValueCount, suppressedValueCount) = query.ResultRows.CountTotalAndSuppressed();
var counts = query.ResultRows.CountTotalAndSuppressed();

var isEmail = totalValueCount == query.ResultRows
var isEmail = counts.TotalCount == query.ResultRows
.Where(r => r.TrimmedText == "@")
.Sum(r => r.Count);

Expand Down