Merged
2 changes: 1 addition & 1 deletion pg_diffix/aggregation/count.h
@@ -19,7 +19,7 @@ typedef struct CountResult
uint32 noisy_top_count;
double noise_sd;
double noise;
bool not_enough_aidvs;
bool not_enough_aid_values;
} CountResult;

extern CountResult aggregate_count_contributions(
2 changes: 0 additions & 2 deletions pg_diffix/aggregation/noise.h
@@ -3,8 +3,6 @@

#include "pg_diffix/utils.h"

typedef hash_t seed_t;

/*
* Returns a uniform integer in the positive interval [min, max] for the given seed and step name.
*/
8 changes: 7 additions & 1 deletion pg_diffix/utils.h
@@ -21,6 +21,7 @@
*/

typedef uint64 hash_t;
typedef hash_t seed_t;

static inline hash_t hash_bytes(const void *bytes, size_t size)
{
@@ -66,7 +67,12 @@ static inline List *hash_set_add(List *hash_set, hash_t hash)
return list_append_unique_ptr(hash_set, (void *)hash);
}

extern hash_t hash_set_combine(List *hash_set);
extern seed_t hash_set_to_seed(const List *hash_set);

static inline List *hash_set_union(List *dst_set, const List *src_set)
{
return list_concat_unique_ptr(dst_set, src_set);
}

/*-------------------------------------------------------------------------
* Compatibility shims
6 changes: 3 additions & 3 deletions src/aggregation/count.c
@@ -97,7 +97,7 @@ CountResult aggregate_count_contributions(

if (distinct_contributors < g_config.outlier_count_min + g_config.top_count_min)
{
result.not_enough_aidvs = true;
result.not_enough_aid_values = true;
return result;
}

@@ -145,7 +145,7 @@ static CountResult count_calculate_result(seed_t bucket_seed, const Contribution

void accumulate_count_result(CountResultAccumulator *accumulator, const CountResult *result)
{
Assert(result->not_enough_aidvs == false);
Assert(result->not_enough_aid_values == false);
Assert(result->flattening >= 0);

if (result->flattening > accumulator->max_flattening)
@@ -227,7 +227,7 @@ static Datum count_finalize(AnonAggState *base_state, Bucket *bucket, BucketDesc
ContributionTrackerState *contribution_tracker = (ContributionTrackerState *)lfirst(cell);
CountResult result = count_calculate_result(bucket_seed, contribution_tracker);

if (result.not_enough_aidvs)
if (result.not_enough_aid_values)
return Int64GetDatum(min_count);

accumulate_count_result(&result_accumulator, &result);
79 changes: 37 additions & 42 deletions src/aggregation/count_distinct.c
@@ -15,9 +15,9 @@
*/
typedef struct DistinctTrackerHashEntry
{
Datum value; /* Unique value */
List *aidvs; /* List of AID sets (represented as Lists), one for each AID instance */
char status; /* Required for hash table */
Datum value; /* Unique value */
List *aid_values_sets; /* List of AID sets, one for each AID instance */
Contributor:
re the removed comment; to reiterate :)

I think it's hard to learn the type of the values held in this list, given the nesting and overall complexity of this module; I wish I had such a comment when first confronted with this code. It's actually even more pressing now, as the name would indicate it's some kind of a set internally.

Collaborator Author:

I don't understand which part was removed and you think was valuable.
Would it be better reformulated as: "List of hashed AID values sets, one for each AID instance"?

Contributor:

The "represented as Lists" part; or, if the option to typedef ... <something> prevails (other thread), "represented as <something>", unless after the typedef change it becomes much more self-documenting what type is concretely held inside aid_values_sets.

This is to fill in for the type-lessness of List, which, in the case of a nested List of Lists, becomes an unnecessary hurdle when trying to understand the code (unless you're already familiar with it). Because of this, I'd even say that doing the typedef ... HashSet is worthwhile.
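To illustrate the suggestion, here is a minimal sketch (not part of this PR; the HashSet name and the placement are assumptions) of how such a typedef plus a field comment could document the nesting:

#include "postgres.h"      /* Datum */
#include "nodes/pg_list.h" /* List */

/* Hypothetical typedef: a hash set is a PG List whose elements are hash_t values
 * stored in the pointer slots, as built by hash_set_add in pg_diffix/utils.h. */
typedef List HashSet;

typedef struct DistinctTrackerHashEntry
{
  Datum value;           /* Unique value */
  List *aid_values_sets; /* List of HashSet * (hashed AID values), one per AID instance */
  char status;           /* Required for hash table */
} DistinctTrackerHashEntry;

With a typedef along these lines, the field comment can name the element type directly instead of spelling out "represented as Lists".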

char status; /* Required for hash table */
} DistinctTrackerHashEntry;

/* Metadata needed for hashing and equality checks on the unique values. */
@@ -53,11 +53,11 @@ get_distinct_tracker_entry(DistinctTracker_hash *tracker, Datum value, int aids_
DistinctTrackerHashEntry *entry = DistinctTracker_insert(tracker, value, &found);
if (!found)
{
entry->aidvs = NIL;
entry->aid_values_sets = NIL;
entry->value = datumCopy(value, DATA(tracker)->typbyval, DATA(tracker)->typlen);
for (int i = 0; i < aids_count; i++)
{
entry->aidvs = lappend(entry->aidvs, NIL);
entry->aid_values_sets = lappend(entry->aid_values_sets, NIL);
}
}
return entry;
@@ -82,38 +82,26 @@ static void set_value_sorting_globals(Oid element_type)
g_compare_values_func = &g_compare_values_typentry->cmp_proc_finfo;
}

static seed_t seed_from_aidv(const List *aidvs)
static bool aid_set_is_high_count(seed_t bucket_seed, const List *aid_values_set)
{
seed_t seed = 0;
ListCell *cell;
foreach (cell, aidvs)
{
aid_t aid = (aid_t)lfirst(cell);
seed ^= aid;
}
return seed;
}

static bool aid_set_is_high_count(seed_t bucket_seed, const List *aidvs)
{
if (list_length(aidvs) < g_config.low_count_min_threshold)
if (list_length(aid_values_set) < g_config.low_count_min_threshold)
return false; /* Fewer AID values than minimum threshold, value is low-count. */

seed_t aid_seed = seed_from_aidv(aidvs);
seed_t aid_seed = hash_set_to_seed(aid_values_set);

seed_t seeds[] = {bucket_seed, aid_seed};
int threshold = generate_lcf_threshold(seeds, ARRAY_LENGTH(seeds));

return list_length(aidvs) >= threshold;
return list_length(aid_values_set) >= threshold;
}

static bool aid_sets_are_high_count(seed_t bucket_seed, const List *aidvs)
static bool aid_sets_are_high_count(seed_t bucket_seed, const List *aid_values_sets)
{
ListCell *cell;
foreach (cell, aidvs)
foreach (cell, aid_values_sets)
{
const List *aidv = (const List *)lfirst(cell);
if (!aid_set_is_high_count(bucket_seed, aidv))
const List *aid_values_set = (const List *)lfirst(cell);
if (!aid_set_is_high_count(bucket_seed, aid_values_set))
return false;
}
return true;
Expand All @@ -129,7 +117,7 @@ static List *filter_lc_entries(seed_t bucket_seed, DistinctTracker_hash *tracker
DistinctTrackerHashEntry *entry = NULL;
while ((entry = DistinctTracker_iterate(tracker, &it)) != NULL)
{
if (!aid_sets_are_high_count(bucket_seed, entry->aidvs))
if (!aid_sets_are_high_count(bucket_seed, entry->aid_values_sets))
lc_entries = lappend(lc_entries, entry);
}

@@ -205,13 +193,13 @@ static List *transpose_lc_values_per_aid(List *lc_entries, int aid_index, uint32
foreach (lc_entry_cell, lc_entries)
{
const DistinctTrackerHashEntry *entry = (const DistinctTrackerHashEntry *)lfirst(lc_entry_cell);
const List *aidvs = (const List *)list_nth(entry->aidvs, aid_index);
const List *aid_values_set = (const List *)list_nth(entry->aid_values_sets, aid_index);

if (aidvs != NIL) /* Count unique value only if it has at least one associated AID value. */
if (aid_values_set != NIL) /* Count unique value only if it has at least one associated AID value. */
(*lc_values_true_count)++;

ListCell *aidv_cell;
foreach (aidv_cell, aidvs)
foreach (aidv_cell, aid_values_set)
{
aid_t aid = (aid_t)lfirst(aidv_cell);
per_aid_values = associate_value_with_aid(per_aid_values, aid, entry->value);
@@ -362,7 +350,7 @@ static CountDistinctResult count_distinct_calculate_final(CountDistinctState *st
list_free_deep(per_aid_values);
pfree(top_contributors);

if (inner_count_result.not_enough_aidvs)
if (inner_count_result.not_enough_aid_values)
{
insufficient_data = true;
break;
@@ -452,11 +440,11 @@ static void count_distinct_merge(AnonAggState *dst_base_state, const AnonAggStat

ListCell *dst_cell = NULL;
const ListCell *src_cell = NULL;
forboth(dst_cell, dst_entry->aidvs, src_cell, src_entry->aidvs)
forboth(dst_cell, dst_entry->aid_values_sets, src_cell, src_entry->aid_values_sets)
{
List **dst_aidv = (List **)&lfirst(dst_cell);
const List **src_aidv = (const List **)&lfirst(src_cell);
*dst_aidv = list_concat_unique_ptr(*dst_aidv, *src_aidv);
List **dst_aid_values_set = (List **)&lfirst(dst_cell);
const List **src_aid_values_set = (const List **)&lfirst(src_cell);
*dst_aid_values_set = hash_set_union(*dst_aid_values_set, *src_aid_values_set);
}
}

@@ -468,6 +456,16 @@ static const char *count_distinct_explain(const AnonAggState *base_state)
return "diffix.anon_count_distinct";
}

static List *add_aid_value_to_set(List *aid_values_set, NullableDatum aid_arg, Oid aid_type)
{
if (!aid_arg.isnull)
{
aid_t aid_value = get_aid_descriptor(aid_type).make_aid(aid_arg.value);
aid_values_set = hash_set_add(aid_values_set, aid_value);
}
return aid_values_set;
}

static void count_distinct_transition(AnonAggState *base_state, int num_args, NullableDatum *args)
{
CountDistinctState *state = (CountDistinctState *)base_state;
@@ -482,18 +480,15 @@ static void count_distinct_transition(AnonAggState *base_state, int num_args, Nu
DistinctTrackerHashEntry *entry = get_distinct_tracker_entry(state->tracker, value, aids_count);

ListCell *cell;
foreach (cell, entry->aidvs)
foreach (cell, entry->aid_values_sets)
{
int aid_index = foreach_current_index(cell) + AIDS_OFFSET;
if (!args[aid_index].isnull)
{
Oid aid_type = state->args_desc->args[aid_index].type_oid;
aid_t aid = get_aid_descriptor(aid_type).make_aid(args[aid_index].value);
List **aidv = (List **)&lfirst(cell);
*aidv = list_append_unique_ptr(*aidv, (void *)aid);
}
Oid aid_type = state->args_desc->args[aid_index].type_oid;
List **aid_values_set = (List **)&lfirst(cell);
*aid_values_set = add_aid_value_to_set(*aid_values_set, args[aid_index], aid_type);
}
}

MemoryContextSwitchTo(old_context);
}

4 changes: 2 additions & 2 deletions src/query/anonymization.c
@@ -490,7 +490,7 @@ static void prepare_bucket_seeds(Query *query)
seed_material_hash_set = hash_set_add(seed_material_hash_set, seed_material_hash);
}

g_sql_seed = hash_set_combine(seed_material_hash_set);
g_sql_seed = hash_set_to_seed(seed_material_hash_set);

list_free(seed_material_hash_set);
}
@@ -576,7 +576,7 @@ seed_t compute_bucket_seed(const Bucket *bucket, const BucketDescriptor *bucket_
label_hash_set = hash_set_add(label_hash_set, label_hash);
}

seed_t bucket_seed = g_sql_seed ^ hash_set_combine(label_hash_set);
seed_t bucket_seed = g_sql_seed ^ hash_set_to_seed(label_hash_set);

list_free(label_hash_set);

4 changes: 2 additions & 2 deletions src/utils.c
@@ -10,10 +10,10 @@

#include "pg_diffix/utils.h"

hash_t hash_set_combine(List *hash_set)
seed_t hash_set_to_seed(const List *hash_set)
{
ListCell *cell = NULL;
hash_t accumulator = 0;
seed_t accumulator = 0;
foreach (cell, hash_set)
{
hash_t hash = (hash_t)lfirst(cell);