Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Code for the Location Heatmaps paper. #47

Open
wants to merge 13 commits into
base: master
Choose a base branch
from
104 changes: 64 additions & 40 deletions analytics/location_heatmaps/geo_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import dataclasses
import random
from typing import List, Any
from scipy.stats import norm

import numpy as np
import pygtrie
Expand All @@ -32,7 +33,16 @@
DEFAULT_CHILDREN = ['00', '01', '10', '11']


def get_default_children(aux_data, split=None):
def get_default_children(aux_data=False, split=None):
"""Returns a quad tree first 4 nodes. If aux_data (boolean) provided expands
to 2 more bits or a specific pos/neg nodes.
Args:
aux_data: a boolean to use additional bit for data, e.g. pos/neg.
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IMO aux_data sounds like an actual data object rather than a boolean parameter; I would prefer a name like "has_aux_data" or even "has_aux_bit" since a single bit is all that's supported here. This also goes for other usages of "aux_data" as a boolean in other functions, below.

Really it would be ideal to just generalize this to support an arbitrary number of extra bits with an automatic encoding from the value specified in "split", rather than a single extra bit with a predefined 'pos'-->1 and 'neg'-->0 encoding, but I understand that is probably out of scope at present.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree, let me change it to has_aux_bit for now, and maybe can expand it later

split: specific subset of aux_data (pos/neg).

Returns:
A list of nodes to initialize the tree.
"""
if aux_data:
if split == 'pos':
return ['001', '011', '101', '111']
Expand Down Expand Up @@ -85,16 +95,16 @@ def coordinates_to_binary_path(xy_tuple, depth=10):
Returns:
binary version of the coordinate.
"""
aux_data = ''
if len(xy_tuple) == 2:

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it'd be cleaner to just add another arg for 'pos', and maybe even split the x_coord and y_coord into separate args, but at the very least we should document in the docstring that xy_tuple can actually be an x, y, pos triplet.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this comment applies to aux_data now that that is being pulled from xy_tuple.

x_coord, y_coord = xy_tuple
aux_data = False
pos = ''
else:
x_coord, y_coord, pos = xy_tuple
x_coord, y_coord, aux_data = xy_tuple
path = ''
for j in reversed(range(depth)):
path += f'{(x_coord >> j) & 1}{(y_coord >> j) & 1}{pos}/'
path += f'{(x_coord >> j) & 1}{(y_coord >> j) & 1}{aux_data}/'
path = path[:-1]

return path


Expand Down Expand Up @@ -189,7 +199,7 @@ def transform_region_to_coordinates(x_coord,


def rebuild_from_vector(vector, tree, image_size, contour=False, split_threshold=0,
aux_data=False, count_min=False):
aux_data=False, count_min=None):
"""Using coordinate vector and the tree produce a resulting image.

For each value in the vector it finds the corresponding prefix and plots the
Expand Down Expand Up @@ -260,9 +270,9 @@ def rebuild_from_vector(vector, tree, image_size, contour=False, split_threshold
return current_image, pos_image, neg_image


def update_tree(prefix, tree, tree_prefix_list):
def append_to_tree(prefix, tree, tree_prefix_list):
"""
Update tree with new prefix
Append new node to the tree.
Args:
prefix: new path, e.g. '10/01/10'
tree: current tree
Expand All @@ -284,9 +294,9 @@ def split_regions(tree_prefix_list,
split_threshold,
image_bit_level,
collapse_threshold=None,
expand_all=False,
last_result: AlgResult = None,
count_min=None):
count_min=None,
print_output=False):
"""Modify the tree by splitting and collapsing the nodes.

This implementation collapses and splits nodes of the tree according to
Expand All @@ -299,23 +309,23 @@ def split_regions(tree_prefix_list,
split_threshold: threshold value used to split the nodes.
image_bit_level: stopping criteria once the final resolution is reached.
collapse_threshold: threshold value used to collapse the nodes.
expand_all: expand all regions,
last_result: use previous level results to compute conf intervals,
count_min: use count-min sketch
count_min: use count-min sketch.
print_output: print results of splitting.
Returns:
new_tree, new_tree_prefix_list, fresh_expand
new_tree, new_tree_prefix_list, num_newly_expanded_nodes
"""
collapsed = 0
created = 0
fresh_expand = 0
num_newly_expanded_nodes = 0
unchanged = 0

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

collapsed, created, and unchanged do not appear to be used for anything anymore. let's delete them xor do something with them.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

printing the results in the end of the function now

new_tree_prefix_list = list()
new_tree = pygtrie.StringTrie()
for i in range(len(tree_prefix_list)):
if count_min:
count = count_min.query(tree_prefix_list[i])
else:
count = vector_counts[i] if not expand_all else np.inf
count = vector_counts[i] if vector_counts else np.inf
prefix = tree_prefix_list[i]

# check whether the tree has reached the bottom
Expand All @@ -326,30 +336,32 @@ def split_regions(tree_prefix_list,
split_threshold)
else:
cond = count > split_threshold
if expand_all or cond:
if cond:
for child in DEFAULT_CHILDREN:
new_prefix = f'{prefix}/{child}'
fresh_expand += update_tree(new_prefix, new_tree, new_tree_prefix_list)
num_newly_expanded_nodes += append_to_tree(new_prefix, new_tree, new_tree_prefix_list)
else:
if collapse_threshold is not None and \
count <= collapse_threshold and len(prefix) > 2:
old_prefix = prefix[:-3]
collapsed += 1
created += update_tree(old_prefix, new_tree, new_tree_prefix_list)
created += append_to_tree(old_prefix, new_tree, new_tree_prefix_list)
else:
unchanged += update_tree(prefix, new_tree, new_tree_prefix_list)

return new_tree, new_tree_prefix_list, fresh_expand
unchanged += append_to_tree(prefix, new_tree, new_tree_prefix_list)
if print_output:
print(f'New: {num_newly_expanded_nodes}. Collapsed: {collapsed}. ' + \
f'Created from collapsed: {created}. Unchanged: {unchanged}.')
return new_tree, new_tree_prefix_list, num_newly_expanded_nodes


def split_regions_aux(tree_prefix_list,

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suspect that even more of these two functions could be shared (in particular, the basic structure of looping over prefixes and adding nodes to the tree as appropriate for the splitting & collapsing criteria), but acknowledge that it may not actually improve readability much more to do further surgery. Please consider sharing that prefix-looping structure, but if you can't see a clean and easy way to do so, that's fine.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah, I agree, it's just that I need to look at both bits in data that is hard to unify. Maybe once we go to multiple dimensions we can just unify everything.

vector_counts,
split_threshold,
image_bit_level,
collapse_threshold=None,
expand_all=False,
last_result: AlgResult = None,
count_min=None):
count_min=None,
print_output=False):
"""Use expansion with aux data.

We check both counts for positive and negative attributes for each location.
Expand All @@ -360,24 +372,24 @@ def split_regions_aux(tree_prefix_list,
split_threshold: threshold value used to split the nodes.
image_bit_level: stopping criteria once the final resolution is reached.
collapse_threshold: threshold value used to collapse the nodes.
expand_all: expand all regions,
last_result: use previous level results to compute conf intervals,
count_min: use count-min sketch
print_output: print results of splitting.
Returns:
new_tree, new_tree_prefix_list, fresh_expand
new_tree, new_tree_prefix_list, num_newly_expanded_nodes
"""
new_tree_prefix_list = list()
new_tree = pygtrie.StringTrie()
collapsed = 0
created = 0
fresh_expand = 0
num_newly_expanded_nodes = 0
unchanged = 0

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

likewise re. collapsed, created, and unchanged being unused

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done


for i in range(0, len(tree_prefix_list), 2):
if count_min:
raise ValueError('CountMin is not implemented for Aux data.')
neg_count = vector_counts[i] if not expand_all else np.inf
pos_count = vector_counts[i + 1] if not expand_all else np.inf
neg_count = vector_counts[i] if vector_counts else np.inf
pos_count = vector_counts[i + 1] if vector_counts else np.inf
neg_prefix = tree_prefix_list[i]
pos_prefix = tree_prefix_list[i + 1]

Expand All @@ -392,28 +404,30 @@ def split_regions_aux(tree_prefix_list,
cond = p_cond and n_cond
else:
cond = (pos_count > split_threshold and neg_count > split_threshold)
if expand_all or cond:
if cond:
neg_child = get_default_children(aux_data=True, split='neg')
pos_child = get_default_children(aux_data=True, split='pos')
for j in range(len(pos_child)):
new_prefix = f'{neg_prefix}/{neg_child[j]}'
fresh_expand += update_tree(new_prefix, new_tree, new_tree_prefix_list)
num_newly_expanded_nodes += append_to_tree(new_prefix, new_tree, new_tree_prefix_list)
new_prefix = f'{pos_prefix}/{pos_child[j]}'
update_tree(new_prefix, new_tree, new_tree_prefix_list)
append_to_tree(new_prefix, new_tree, new_tree_prefix_list)
else:
if collapse_threshold is not None and \
(pos_count < collapse_threshold or neg_count < collapse_threshold) \
and len(pos_prefix) > 3 and len(neg_prefix) > 3:
old_prefix = neg_prefix[:-4]
collapsed += 1
created += update_tree(old_prefix, new_tree, new_tree_prefix_list)
created += append_to_tree(old_prefix, new_tree, new_tree_prefix_list)
old_prefix = pos_prefix[:-4]
update_tree(old_prefix, new_tree, new_tree_prefix_list)
append_to_tree(old_prefix, new_tree, new_tree_prefix_list)
else:
unchanged += update_tree(neg_prefix, new_tree, new_tree_prefix_list)
update_tree(pos_prefix, new_tree, new_tree_prefix_list)

return new_tree, new_tree_prefix_list, fresh_expand
unchanged += append_to_tree(neg_prefix, new_tree, new_tree_prefix_list)
append_to_tree(pos_prefix, new_tree, new_tree_prefix_list)
if print_output:
print(f'New: {num_newly_expanded_nodes}. Collapsed: {collapsed}. ' + \
f'Created from collapsed: {created}. Unchanged: {unchanged}.')
return new_tree, new_tree_prefix_list, num_newly_expanded_nodes


def build_from_sample(samples, total_size):
Expand Down Expand Up @@ -500,8 +514,6 @@ def convert_to_dataset(image, total_size, value=None):


def compute_conf_intervals(sum_vector: np.ndarray, level=95):
from scipy.stats import norm

conf_intervals = dict()
conf_interval_weighted = dict()
z = norm.ppf(1-(1-level/100)/2)
Expand All @@ -521,7 +533,19 @@ def compute_conf_intervals(sum_vector: np.ndarray, level=95):
return conf_intervals, conf_interval_weighted


def create_confidence_interval_condition(last_result, prefix, count, split_threshold):
def evaluate_confidence_interval_condition(last_result, prefix, count, split_threshold):
"""Evaluate whether the confidence interval is smaller than the the threshold.
We compute confidence interval by comparing a current value in a sub-region
with its parent region value from the previous level
Args:
last_result: a previous level tree results and vector counts
prefix: current node prefix.
count: current node count.
split_threshold: threshold to cutoff confidence interval.

Returns:
whether the node satisfies confidence interval threshold.
"""

(last_prefix, last_prefix_pos) = last_result.tree.longest_prefix(prefix)
if last_prefix is None:
Expand Down
47 changes: 23 additions & 24 deletions analytics/location_heatmaps/run_experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,15 @@
import mechanisms
import metrics
import plotting
from sketches import get_count_min_sketch
from sketches import CountMinSketch
from config import Config

TOPK = 1000
TOTAL_SIZE = 1024


def get_data(path, crop_tuple=(512, 100, 1536, 1124),
total_size=1024, save=True):
total_size=1024, save=True, dataset_name='dataset.npy'):
"""Download the map image.

Downloads the image from a given path, crops it and transforms into a list
Expand All @@ -61,20 +61,16 @@ def get_data(path, crop_tuple=(512, 100, 1536, 1124),
image = Image.open(f).convert('L')
image = image.crop(crop_tuple)
true_image = np.asarray(image)
if os.path.isfile('dataset.npy'):
dataset = np.load('dataset.npy')
if os.path.isfile(dataset_name):
dataset = np.load(dataset_name)
else:
dataset = geo_utils.convert_to_dataset(true_image, total_size)
if save:
np.save('dataset', dataset)
np.save(dataset_name, dataset)

return true_image, dataset


def get_split_data(path):
dataset = np.load(path)


def print_output(text, flag):
"""Simple flag to suppress output."""

Expand Down Expand Up @@ -106,8 +102,8 @@ def run_experiment(true_image,
start_with_level=0,
ignore_start_eps=False,
last_result_ci=None,
count_min=False) -> List[geo_utils.AlgResult]:
"""The main method to run an experiment using TrieHH.
count_min=None) -> List[geo_utils.AlgResult]:
""" The main method to run the experiments.

Args:
true_image: original image for comparison
Expand Down Expand Up @@ -136,7 +132,7 @@ def run_experiment(true_image,
start_with_level: skip first levels and always expand them.
ignore_start_eps: ignore spending epsilon when using start_with_level.
last_result_ci: for two label save previous results.
count_min: use count-min sketch.
count_min: to use count-min sketch use dict: {'depth': 20, 'width': 4000}

Returns:
A list of per level geo_utls.AlgResult objects.
Expand Down Expand Up @@ -167,7 +163,7 @@ def run_experiment(true_image,
tree, tree_prefix_list = geo_utils.init_tree(config.aux_data)
per_level_results = list()
per_level_grid = list()
fresh_expand = None
num_newly_expanded_nodes = None
sum_vector = None
print_output(f'aux_data: {config.aux_data}', config.output_flag)
process_split = geo_utils.split_regions_aux if aux_data else geo_utils.split_regions
Expand All @@ -181,13 +177,15 @@ def run_experiment(true_image,
# define DP round size
dp_round_size = config.min_dp_size if config.min_dp_size else config.secagg_round_size
if config.split_threshold and config.split_threshold_func:
raise ValueError('Specify either `threshold` or `threshold_func`.')
raise ValueError('Specify either `threshold` xor `threshold_func`.')
if collapse_threshold and collapse_func:
raise ValueError(
'Specify either `collapse_threshold` or `collapse_func`.')
'Specify either `collapse_threshold` xor `collapse_func`.')

# sample devices that will participate in the algorithm (same across levels):
samples = np.random.choice(dataset, config.level_sample_size, replace=False)
if count_min:
count_min_sketch = get_count_min_sketch(depth=20, width=2000)
if count_min is not None:
count_min_sketch = CountMinSketch(depth=count_min['depth'], width=count_min['width'])
sensitivity = 20
else:
count_min_sketch = None
Expand Down Expand Up @@ -217,7 +215,7 @@ def run_experiment(true_image,
# prevent spilling over the budget
if remaining_budget:
# last round, no progress in tree, or cannot run at least two rounds.
if i == max_levels - 1 or fresh_expand == 0 \
if i == max_levels - 1 or num_newly_expanded_nodes == 0 \
or remaining_budget < 2 * eps * samples_len:
print_output(
'Last round. Spending remaining epsilon budget: ' + \
Expand All @@ -227,6 +225,7 @@ def run_experiment(true_image,
noiser = noise_class(dp_round_size, sensitivity, eps)
if ignore_start_eps and start_with_level <= i:

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you add a comment here or in the documentation for 'ignore_start_eps' explaining the motivation? If the idea is to only start accounting at level 'start_with_level', shouldn't the second condition be reversed such that spent budget is ignored for the levels before 'start_with_level'?

Apologies if I'm just totally misreading this.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

updated and added into docstring

print_output('Ignoring eps spent', flag=output_flag)
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: this is a frightening message; it would be nice to have a bit of extra context here (e.g., "Ignoring epsilon spent expanding first {start_with_level} levels, including current level {i}.").

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

spent_budget = 0
else:
spent_budget += eps * samples_len

Expand All @@ -242,12 +241,12 @@ def run_experiment(true_image,

# to prevent OOM errors we use vectors of size partial.
if start_with_level > i:
tree, tree_prefix_list, fresh_expand = process_split(
tree, tree_prefix_list, num_newly_expanded_nodes = process_split(
tree_prefix_list=tree_prefix_list,
vector_counts=None,
split_threshold=split_threshold, image_bit_level=10,
split_threshold=-np.inf, image_bit_level=10,
collapse_threshold=collapse_threshold,
expand_all=True, count_min=count_min)
count_min=count_min, print_output=output_flag)
print_output(f"Expanding all at the level: {i}.", output_flag)
continue

Expand Down Expand Up @@ -287,12 +286,12 @@ def run_experiment(true_image,
else:
last_result = per_level_results[i - 1]

tree, tree_prefix_list, fresh_expand = process_split(
tree, tree_prefix_list, num_newly_expanded_nodes = process_split(
tree_prefix_list=result.tree_prefix_list, vector_counts=result.sum_vector,
split_threshold=split_threshold, image_bit_level=10,
collapse_threshold=collapse_threshold,
last_result=last_result)
if fresh_expand==0:
last_result=last_result, print_output=output_flag)
if num_newly_expanded_nodes==0:
break
if output_flag:
print(f'Total epsilon-users: {spent_budget:.2f} with ' + \
Expand Down
Loading