/
select_partition.go
435 lines (402 loc) · 16.9 KB
/
select_partition.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
//
// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
package dpagg
import (
"fmt"
"math"
"github.com/google/differential-privacy/go/v3/checks"
"github.com/google/differential-privacy/go/v3/noise"
"github.com/google/differential-privacy/go/v3/rand"
)
// PreAggSelectPartition is used to compute an (ε,δ)-differentially private decision
// of whether to materialize a partition.
//
// Many differential privacy mechanisms work by performing an aggregation and
// adding noise. They achieve (ε_m,δ_m)-differential privacy under the
// assumption that partitions are chosen in advance. In other words, they assume that
// even if no data is associated with a partition, noise is added to the empty
// aggregation, and the noisy result is materialized. However, when only
// partitions containing data are materialized, such mechanisms fail to protect
// privacy for partitions containing data from a single privacy ID (e.g.,
// user). To fix this, partitions with small numbers of privacy IDs must
// sometimes be dropped in order to maintain privacy. This process of partition
// selection incurs an additional (ε,δ) differential privacy budget resulting
// in a total differential privacy budget of (ε+ε_m,δ+δ_m) being used for the
// aggregation with partition selection.
//
// Depending on the l0sensitivity, the PreAggSelectPartition uses one of two
// differentially private partition selection algorithms.
//
// When l0sensitivity ≤ 3, the partition selection process is made (ε,δ)
// differentially private by applying the definition of differential privacy to
// the count of privacy IDs. Supposing l0Sensitivity bounds the number of partitions
// a privacy ID may contribute to, we define:
//
// pε := ε/l0Sensitivity
// pδ := δ/l0Sensitivity
//
// to be the per-partition differential privacy losses incurred by the partition
// selection process. Letting n denote the number of privacy IDs in a partition,
// the probability of selecting a partition is given by the following recurrence
// relation:
//
// keepPartitionProbability(n) = min(
// keepPartitionProbability(n-1) * exp(pε) + pδ, (1)
// 1 - exp(-pε) * (1-keepPartitionProbability(n-1)-pδ), (2)
// 1 (3)
// )
//
// with base case keepPartitionProbability(0) = 0. This formula is optimal in terms of
// maximizing the likelihood of selecting a partition under (ε,δ)-differential
// privacy, with the caveat that the input values for pε and pδ are lower bound
// approximations. For efficiency, we use a closed-form solution to this
// recurrence relation. See [Differentially private partition selection paper]
// https://arxiv.org/pdf/2006.03684.pdf for details on the underlying mathematics.
//
// When l0sensitivity > 3, the partition selection process is made (ε,δ)
// differentially private by using the ThresholdedResult() of the Count primitive
// with Gaussian noise. Count computes a (ε,δ/2) differentially private count of
// the privacy IDs in a partition by adding Gaussian noise. Then, it computes
// a threshold T for which the probability that a (ε,δ/2) differentially private
// count of a single privacy ID can exceed T is δ/2. It keeps the partition iff
// differentially private count exceeds the threshold.
//
// The reason two different algorithms for deciding whether to keep a partition
// is used is because the first algorithm ("magic partition selection") is optimal
// when l0sensitivity ≤ 3 but is outperformed by Gaussian-based thresholding when
// l0sensitivity > 3.
//
// PreAggSelectPartition is a utility for maintaining the count of IDs in a single
// partition and then determining whether the partition should be
// materialized. Use Increment() to increment the count of IDs and ShouldKeepPartition() to decide
// if the partition should be materialized.
//
// PreAggSelectPartition also supports doing additional pre-thresholding on top of the
// differentially private partition selection via the PreThreshold in PreAggSelectPartitionOptions.
//
// See https://github.com/google/differential-privacy/blob/main/common_docs/pre_thresholding.md
// for more information.
type PreAggSelectPartition struct {
// parameters
epsilon float64
delta float64
l0Sensitivity int64
preThreshold int64
// State variables
// idCount is the count of unique privacy IDs in the partition.
idCount int64
state aggregationState
}
func preAggSelectPartitionEquallyInitialized(s1, s2 *PreAggSelectPartition) bool {
return s1.epsilon == s2.epsilon &&
s1.delta == s2.delta &&
s1.l0Sensitivity == s2.l0Sensitivity &&
s1.state == s2.state &&
s1.preThreshold == s2.preThreshold
}
// PreAggSelectPartitionOptions is used to set the privacy parameters when
// constructing a PreAggSelectPartition.
type PreAggSelectPartitionOptions struct {
// Epsilon and Delta specify the (ε,δ)-differential privacy budget used for
// partition selection. Required.
Epsilon float64
Delta float64
// An additional thresholding that is performed in combination with private
// partition selection to ensure that each partition has at least a
// PreThreshold number of unique contributions.
//
// See https://github.com/google/differential-privacy/blob/main/common_docs/pre_thresholding.md
// for more information.
// Optional.
PreThreshold int64
// MaxPartitionsContributed is the number of distinct partitions a single
// privacy unit can contribute to. Required.
MaxPartitionsContributed int64
}
// NewPreAggSelectPartition constructs a new PreAggSelectPartition from opt.
func NewPreAggSelectPartition(opt *PreAggSelectPartitionOptions) (*PreAggSelectPartition, error) {
if opt == nil {
opt = &PreAggSelectPartitionOptions{} // Prevents panicking due to a nil pointer dereference.
}
if err := checks.CheckPreThreshold(opt.PreThreshold); err != nil {
return nil, fmt.Errorf("NewPreAggSelectPartition: %w", err)
}
// Set PreThreshold to default 1 if not specified.
if opt.PreThreshold < 1 {
opt.PreThreshold = 1
}
s := PreAggSelectPartition{
epsilon: opt.Epsilon,
delta: opt.Delta,
preThreshold: opt.PreThreshold,
l0Sensitivity: opt.MaxPartitionsContributed,
}
if err := checks.CheckDeltaStrict(s.delta); err != nil {
return nil, fmt.Errorf("NewPreAggSelectPartition: %v", err)
}
// ε=0 is theoretically acceptable, but in practice it's probably an error,
// so we do not accept it as argument.
if err := checks.CheckEpsilonStrict(s.epsilon); err != nil {
return nil, fmt.Errorf("NewPreAggSelectPartition: %v", err)
}
if err := checks.CheckL0Sensitivity(s.l0Sensitivity); err != nil {
return nil, fmt.Errorf("NewPreAggSelectPartition: %v", err)
}
return &s, nil
}
// Increment increments the ids count by one.
// The caller must ensure this method is called at most once per privacy ID.
func (s *PreAggSelectPartition) Increment() error {
return s.IncrementBy(1)
}
// IncrementBy increments the ids count by the given value.
// Note that this shouldn't be used to count multiple contributions to a
// single partition from the same privacy unit.
//
// It could, for example, be used to increment the ids count by k privacy
// units at once.
//
// Note that decrementing counts by inputting a negative value is allowed,
// for example if you want to remove some users you have previously added.
func (s *PreAggSelectPartition) IncrementBy(count int64) error {
if s.state != defaultState {
return fmt.Errorf("PreAggSelectPartition cannot be amended: %v", s.state.errorMessage())
}
s.idCount += count
return nil
}
// Merge merges s2 into s (i.e., add the idCount of s2 to s). This implicitly
// assumes that s and s2 act on distinct privacy IDs. s2 is consumed by this
// operation: s2 may not be used after it is merged into s.
//
// Preconditions: s and s2 must have the same privacy parameters. In addition,
// ShouldKeepPartition() may not be called yet for either s or s2.
func (s *PreAggSelectPartition) Merge(s2 *PreAggSelectPartition) error {
if err := checkMergePreAggSelectPartition(s, s2); err != nil {
return err
}
s.idCount += s2.idCount
s2.state = merged
return nil
}
func checkMergePreAggSelectPartition(s1, s2 *PreAggSelectPartition) error {
if s1.state != defaultState {
return fmt.Errorf("checkMergePreAggSelectPartition: s1 cannot be merged with another PreAggSelectPartition instance: %v", s1.state.errorMessage())
}
if s2.state != defaultState {
return fmt.Errorf("checkMergePreAggSelectPartition: s2 cannot be merged with another PreAggSelectPartition instance: %v", s2.state.errorMessage())
}
if !preAggSelectPartitionEquallyInitialized(s1, s2) {
return fmt.Errorf("checkMergePreAggSelectPartition: s1 and s2 are not compatible")
}
return nil
}
// ShouldKeepPartition returns whether the partition should be materialized.
func (s *PreAggSelectPartition) ShouldKeepPartition() (bool, error) {
if s.state != defaultState {
return false, fmt.Errorf("PreAggSelectPartition's ShouldKeepPartition cannot be computed: %v", s.state.errorMessage())
}
s.state = resultReturned
// Pre-thresholding guarantees that at least this number of unique contributions are in the
// partition.
if s.idCount < s.preThreshold {
return false, nil
}
// PreThreshold is set to 1 as the default, subtract it here so it has no effect.
// This subtraction also ensures that idCount will always be > 0 if preThreshold = idsCount.
s.idCount = s.idCount - (s.preThreshold - 1)
if s.l0Sensitivity > 3 { // Gaussian thresholding outperforms in this case.
c, err := NewCount(&CountOptions{
Epsilon: s.epsilon,
Delta: s.delta / 2,
MaxPartitionsContributed: s.l0Sensitivity,
Noise: noise.Gaussian()})
if err != nil {
return false, fmt.Errorf("couldn't initialize count for PreAggSelectPartition: %v", err)
}
err = c.IncrementBy(s.idCount)
if err != nil {
return false, fmt.Errorf("couldn't increment count for PreAggSelectPartition: %v", err)
}
result, err := c.ThresholdedResult(s.delta / 2)
if err != nil {
return false, fmt.Errorf("couldn't compute thresholded result for PreAggSelectPartition: %v", err)
}
return result != nil, nil
}
prob, err := keepPartitionProbability(s.idCount, s.l0Sensitivity, s.epsilon, s.delta)
if err != nil {
return false, fmt.Errorf("couldn't compute keepPartitionProbability for PreAggSelectPartition: %v", err)
}
return rand.Uniform() < prob, nil
}
// sumExpPowers returns the evaluation of
//
// exp(minPower * ε) + exp((minPower+1) * ε) + ... + exp((numPowers+minPower-1) * ε)
//
// sumExpPowers requires ε >= 0. sumExpPowers may return +∞, but does not return
// NaN.
func sumExpPowers(epsilon float64, minPower, numPowers int64) (float64, error) {
if err := checks.CheckEpsilon(epsilon); err != nil {
return 0, fmt.Errorf("sumExpPowers: %v", err)
}
if numPowers <= 0 {
return 0, fmt.Errorf("sumExpPowers: numPowers (%d) must be > 0", numPowers)
}
// expEpsilonPow returns exp(a * epsilon).
expEpsilonPow := func(a int64) float64 {
return math.Exp(float64(a) * epsilon)
}
if math.IsInf(expEpsilonPow(minPower), 1) {
return math.Inf(1), nil
}
// In the case ε=0, sumExpPowers is simply numPowers. We use exp(-ε) = 1 to
// identify this case because our closed form solutions would otherwise
// result in division by 0 under finite precision arithmetic.
if math.Exp(-epsilon) == 1 {
return float64(numPowers), nil
}
// For the general case, we use a closed form solution to a geometric
// series. See https://en.wikipedia.org/wiki/Geometric_series#Sum.
//
// The typical closed form formula is:
// exp(minPower*ε) * (exp(numPowers*ε) - 1) / (exp(ε) - 1)
//
// In our setting, it is OK to return +∞ but not OK to return NaN. We use the
// following mathematically equivalent formulas to avoid returning NaN when
// using our finite precision arithmetic:
// exp((minPower-1)*ε) * (exp(numPowers*ε) - 1) / (1 - exp(-ε)) (1)
// (exp((numPowers+minPower-1)*ε) - exp((minPower-1)*ε)) / (1 - exp(-ε)) (2)
//
// We use (1) when minPower >= 1. In that case, (exp(numPowers*ε) - 1) is
// the only potentially infinite term. The other two terms satisfy
// exp((minPower-1) * ε) >= 1 and 0 < (1 - exp(-ε)) <= 1. The
// multiplication and division operations involving these other two
// terms increase the result. Thus, the result is never NaN, and +∞ is
// returned only when necessary.
//
// We use (2) when minPower < 1. In that case, 0 <= exp((minPower-1)*ε) < 1,
// so the numerator
// (exp((numPowers+minPower-1)*ε) - exp((minPower-1)*ε))
// is never NaN, and is only +∞ when necessary. The denominator in (2)
// satisfies 0 < (1-exp(-ε)) <= 1. The result of (2) is never NaN and
// only achieves +∞ when necessary overall.
if minPower >= 1 {
return expEpsilonPow(minPower-1) * (expEpsilonPow(numPowers) - 1) / (1 - expEpsilonPow(-1)), nil
}
return (expEpsilonPow(numPowers+minPower-1) - expEpsilonPow(minPower-1)) / (1 - expEpsilonPow(-1)), nil
}
// keepPartitionProbability calculates the value of keepPartitionProbability
// from PreAggSelectPartition's godoc comment.
func keepPartitionProbability(idCount, l0Sensitivity int64, epsilon, delta float64) (float64, error) {
if idCount <= 0 {
return 0, nil
}
// Per-partition (ε,δ) privacy loss.
pEpsilon := epsilon / float64(l0Sensitivity)
pDelta := delta / float64(l0Sensitivity)
// In keepPartitionProbability's recurrence formula (see Theorem 1 in the
// [Differentially private partition selection paper]), argument selection in
// the min operation has 3 distinct regions: min selects (1) on the lowest
// region of the domain, (2) on the second region of the domain, and (3)
// (i.e., the value 1) on the highest region of the domain. We denote by nCr
// the crossover point in the domain from (1) to (2).
nCr := int64(1 + math.Floor((1/pEpsilon)*math.Log(
(1+math.Exp(-pEpsilon)*(2*pDelta-1))/(pDelta*(1+math.Exp(-pEpsilon))))))
if idCount <= nCr {
// Closed form solution of keepPartitionProbability(n) on [0, nCr].
sum, err := sumExpPowers(pEpsilon, 0, idCount)
if err != nil {
return 0, err
}
return pDelta * sum, nil
}
sum, err := sumExpPowers(pEpsilon, 0, nCr)
if err != nil {
return 0, err
}
selectPartitionPrNCr := pDelta * sum
// Compute form solution of keepPartitionProbability(n) on the domain (nCr, ∞).
m := idCount - nCr
sum, err = sumExpPowers(pEpsilon, -m, m)
if err != nil {
return 0, err
}
return math.Min(
1+math.Exp(-float64(m)*pEpsilon)*(selectPartitionPrNCr-1)+sum*pDelta,
1), nil
}
// GetHardThreshold returns a threshold k, where if there are at least
// k privacy units in a partition, we are guaranteed to keep that partition.
//
// This is the conceptual equivalent of the post-aggregation threshold of the
// noise.Noise interface with the difference that here there is 0 probability
// of not keeping the partition if it has at least k privacy units, whereas
// with the post-aggregation threshold there is a non-zero probability
// (however small).
func (s *PreAggSelectPartition) GetHardThreshold() (int, error) {
for i := int64(1); ; i++ {
prob, err := keepPartitionProbability(i, s.l0Sensitivity, s.epsilon, s.delta)
if err != nil {
return 0, err
}
if prob >= 1 { // keepPartitionProbability converges to 1.
return int(i), nil
}
}
}
// encodablePreAggSelectPartition can be encoded by the gob package.
type encodablePreAggSelectPartition struct {
Epsilon float64
Delta float64
L0Sensitivity int64
IDCount int64
State aggregationState
PreThreshold int64
}
// GobEncode encodes PreAggSelectPartition.
func (s *PreAggSelectPartition) GobEncode() ([]byte, error) {
if s.state != defaultState && s.state != serialized {
return nil, fmt.Errorf("PreAggSelectPartition object cannot be serialized: " + s.state.errorMessage())
}
enc := encodablePreAggSelectPartition{
Epsilon: s.epsilon,
Delta: s.delta,
L0Sensitivity: s.l0Sensitivity,
IDCount: s.idCount,
State: s.state,
PreThreshold: s.preThreshold,
}
s.state = serialized
return encode(enc)
}
// GobDecode decodes PreAggSelectPartition.
func (s *PreAggSelectPartition) GobDecode(data []byte) error {
var enc encodablePreAggSelectPartition
err := decode(&enc, data)
if err != nil {
return fmt.Errorf("couldn't decode PreAggSelectPartition from bytes")
}
*s = PreAggSelectPartition{
epsilon: enc.Epsilon,
delta: enc.Delta,
l0Sensitivity: enc.L0Sensitivity,
idCount: enc.IDCount,
state: enc.State,
preThreshold: enc.PreThreshold,
}
return nil
}