In [1]:
from cookiemonster.epsilon_calculator import (
    get_epsilon_from_accuracy_for_counts,
    get_epsilon_for_high_probability_relative_error_wrt_prior,
    get_epsilon_for_relative_rmse_wrt_prior
)

In [2]:
import numpy as np
import plotly.express as px

In [6]:
# Scenario: the prior (estimated_average_count * batch_size) equals the true
# query result, so epsilon is calibrated against the right magnitude.
batch_size = 20_000
estimated_average_count = 0.005
sensitivity = 5
true_query_result = 100

expected_result = estimated_average_count * batch_size

print(f"Expected result (prior): {expected_result}, true query result: {true_query_result}")

# Epsilon is derived from the prior only (relative_error=0.05,
# failure_probability=0.01); the true query result is never passed in.
epsilon = get_epsilon_for_high_probability_relative_error_wrt_prior(
    sensitivity=sensitivity,
    expected_result=expected_result,
    relative_error=0.05,
    failure_probability=0.01,
)

# Fixed seed so that re-running this cell reproduces the same noisy draws.
rng = np.random.default_rng(0)

n_repetitions = 100
# Scalar + array broadcasts, so no explicit np.ones(...) vector is needed.
repeated_noisy_results = true_query_result + rng.laplace(
    scale=sensitivity / epsilon, size=n_repetitions
)
px.scatter(
    repeated_noisy_results,
    range_y=[0, 200],
    title=f"Prior = {expected_result}, true result = {true_query_result}, epsilon = {epsilon:.3g}",
    labels={"index": "repetition", "value": "noisy query result"},
)

Expected result (prior): 100.0, true query result: 100


In [4]:
# Scenario: the prior (1000) is ten times larger than the true query result (100).
batch_size = 20_000
estimated_average_count = 0.05
sensitivity = 5
true_query_result = 100

expected_result = estimated_average_count * batch_size

# If we overestimate the true result, we get noisy results!

print(f"Expected result (prior): {expected_result}, true query result: {true_query_result}")

# Epsilon is derived from the prior only (relative_error=0.05,
# failure_probability=0.01); the true query result is never passed in.
epsilon = get_epsilon_for_high_probability_relative_error_wrt_prior(
    sensitivity=sensitivity,
    expected_result=expected_result,
    relative_error=0.05,
    failure_probability=0.01,
)

# Fixed seed so that re-running this cell reproduces the same noisy draws.
rng = np.random.default_rng(0)

n_repetitions = 100
# Scalar + array broadcasts, so no explicit np.ones(...) vector is needed.
repeated_noisy_results = true_query_result + rng.laplace(
    scale=sensitivity / epsilon, size=n_repetitions
)
px.scatter(
    repeated_noisy_results,
    range_y=[0, 200],
    title=f"Prior = {expected_result}, true result = {true_query_result}, epsilon = {epsilon:.3g}",
    labels={"index": "repetition", "value": "noisy query result"},
)

Expected result (prior): 1000.0, true query result: 100


In [5]:
# Scenario: the prior assumes an average count of 1 per record, i.e. the
# expected result equals batch_size (20000) while the true result is 100.
batch_size = 20_000
estimated_average_count = 1
sensitivity = 5
true_query_result = 100

expected_result = estimated_average_count * batch_size

# The Turbo formula is implicitly assuming the average will be 1.
# That can be a huge overestimation, if the true result happens to be low.
print(f"Expected result (prior): {expected_result}, true query result: {true_query_result}")

# Epsilon is derived from the prior only (relative_error=0.05,
# failure_probability=0.01); the true query result is never passed in.
epsilon = get_epsilon_for_high_probability_relative_error_wrt_prior(
    sensitivity=sensitivity,
    expected_result=expected_result,
    relative_error=0.05,
    failure_probability=0.01,
)

# Fixed seed so that re-running this cell reproduces the same noisy draws.
rng = np.random.default_rng(0)

n_repetitions = 100
# Scalar + array broadcasts, so no explicit np.ones(...) vector is needed.
repeated_noisy_results = true_query_result + rng.laplace(
    scale=sensitivity / epsilon, size=n_repetitions
)
px.scatter(
    repeated_noisy_results,
    range_y=[0, 200],
    title=f"Prior = {expected_result}, true result = {true_query_result}, epsilon = {epsilon:.3g}",
    labels={"index": "repetition", "value": "noisy query result"},
)

Expected result (prior): 20000, true query result: 100


In [6]:
# Scenario: same numbers as the "average count of 1" prior, but epsilon comes
# from get_epsilon_from_accuracy_for_counts instead of the prior-based helper.
batch_size = 20_000
estimated_average_count = 1
sensitivity = 5
true_query_result = 100

# NOTE: expected_result is only printed here; the calculator below receives
# batch_size (as n) and never sees this prior.
expected_result = estimated_average_count * batch_size

# The Turbo formula is implicitly assuming the average will be 1.
# That can be a huge overestimation, if the true result happens to be low.
print(f"Expected result (prior): {expected_result}, true query result: {true_query_result}")

epsilon = get_epsilon_from_accuracy_for_counts(
    cap_value=sensitivity,
    n=batch_size,
    a=0.05,
    b=0.01,
)

# Fixed seed so that re-running this cell reproduces the same noisy draws.
rng = np.random.default_rng(0)

n_repetitions = 100
# Scalar + array broadcasts, so no explicit np.ones(...) vector is needed.
repeated_noisy_results = true_query_result + rng.laplace(
    scale=sensitivity / epsilon, size=n_repetitions
)
px.scatter(
    repeated_noisy_results,
    range_y=[0, 200],
    title=f"Turbo formula: n = {batch_size}, true result = {true_query_result}, epsilon = {epsilon:.3g}",
    labels={"index": "repetition", "value": "noisy query result"},
)

Expected result (prior): 20000, true query result: 100


In [7]:
# Scenario: the prior matches the true result (100), with epsilon calibrated
# from a relative RMSE target instead of a high-probability bound.
batch_size = 20_000
estimated_average_count = 0.005
sensitivity = 5
true_query_result = 100

expected_result = estimated_average_count * batch_size

# RMSE works the same as the high-probability bound, but needs only one parameter
# (it implicitly fixes beta = exp(-sqrt(2)) ~= 0.24, which is pretty big, so a
# smaller error target would be needed to compensate).
# Maybe simpler to use RMSE to combine bias and variance.

print(f"Expected result (prior): {expected_result}, true query result: {true_query_result}")

epsilon = get_epsilon_for_relative_rmse_wrt_prior(
    sensitivity=sensitivity,
    expected_result=expected_result,
    relative_error=0.05,
)

# Fixed seed so that re-running this cell reproduces the same noisy draws.
rng = np.random.default_rng(0)

n_repetitions = 100
# Scalar + array broadcasts, so no explicit np.ones(...) vector is needed.
repeated_noisy_results = true_query_result + rng.laplace(
    scale=sensitivity / epsilon, size=n_repetitions
)
px.scatter(
    repeated_noisy_results,
    range_y=[0, 200],
    title=f"Relative RMSE: prior = {expected_result}, true result = {true_query_result}, epsilon = {epsilon:.3g}",
    labels={"index": "repetition", "value": "noisy query result"},
)

Expected result (prior): 100.0, true query result: 100


In [21]:
# Sweep the batch size and report the epsilon required for a fixed accuracy
# target (relative_error=0.05, failure_probability=0.01) when the prior
# assumes an average count of 2 per record.
estimated_average_count = 2
sensitivity = 5

for batch_size in range(300, 580, 10):
    # The prior scales linearly with the batch size.
    expected_result = batch_size * estimated_average_count

    epsilon = get_epsilon_for_high_probability_relative_error_wrt_prior(
        sensitivity=sensitivity,
        expected_result=expected_result,
        relative_error=0.05,
        failure_probability=0.01,
    )
    print(f"batch_size: {batch_size}, epsilon: {epsilon}")

batch_size: 300, epsilon: 0.7675283643313487
batch_size: 310, epsilon: 0.742769384836789
batch_size: 320, epsilon: 0.7195578415606394
batch_size: 330, epsilon: 0.6977530584830443
batch_size: 340, epsilon: 0.6772309097041312
batch_size: 350, epsilon: 0.657881455141156
batch_size: 360, epsilon: 0.6396069702761239
batch_size: 370, epsilon: 0.6223202954037962
batch_size: 380, epsilon: 0.605943445524749
batch_size: 390, epsilon: 0.5904064341010374
batch_size: 400, epsilon: 0.5756462732485115
batch_size: 410, epsilon: 0.5616061202424503
batch_size: 420, epsilon: 0.5482345459509633
batch_size: 430, epsilon: 0.5354849053474525
batch_size: 440, epsilon: 0.5233147938622832
batch_size: 450, epsilon: 0.5116855762208992
batch_size: 460, epsilon: 0.5005619767378361
batch_size: 470, epsilon: 0.4899117219136268
batch_size: 480, epsilon: 0.47970522770709295
batch_size: 490, epsilon: 0.46991532510082573
batch_size: 500, epsilon: 0.4605170185988092
batch_size: 510, epsilon: 0.45148727313608744
batch_size