-
Notifications
You must be signed in to change notification settings - Fork 6
/
optimize.go
201 lines (173 loc) · 5.49 KB
/
optimize.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
// Copyright 2020 the Blobloom authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package blobloom
import "math"
// A Config holds parameters for Optimize, NewOptimized or NewSyncOptimized.
type Config struct {
	// Capacity is the expected number of distinct keys to be added.
	// More keys can always be added, but the false positive rate can be
	// expected to rise above FPRate if their number exceeds the Capacity.
	// Optimize treats a Capacity of zero as one.
	Capacity uint64

	// FPRate is the desired false positive rate when the Bloom filter
	// has been filled to its capacity. FPRate must be between zero
	// (exclusive) and one (inclusive); Optimize panics otherwise.
	FPRate float64

	// MaxBits is the maximum size of the Bloom filter in bits. Zero means
	// the global MaxBits constant. A value less than BlockBits means
	// BlockBits. Optimize ignores values above the global MaxBits constant.
	MaxBits uint64

	// Trigger the "contains filtered or unexported fields" message for
	// forward compatibility and force the caller to use named fields.
	_ struct{}
}
// NewOptimized returns the Filter that New builds from the number of bits
// and hash functions computed by Optimize for config. It is shorthand for
// New(Optimize(config)).
func NewOptimized(config Config) *Filter {
	nbits, nhashes := Optimize(config)
	return New(nbits, nhashes)
}
// NewSyncOptimized returns the SyncFilter that NewSync builds from the
// number of bits and hash functions computed by Optimize for config. It is
// shorthand for NewSync(Optimize(config)).
func NewSyncOptimized(config Config) *SyncFilter {
	nbits, nhashes := Optimize(config)
	return NewSync(nbits, nhashes)
}
// Optimize returns the number of bits and the number of hash functions that
// achieve the desired false positive rate described by config.
//
// The number of bits is rounded up to a multiple of BlockBits and then capped
// at the size limit: config.MaxBits raised to at least BlockBits, or the
// global MaxBits when config.MaxBits is zero or larger than it. The limit is
// itself rounded down to a multiple of BlockBits. The number of hash
// functions is at least one.
//
// Optimize panics when config.FPRate is invalid, i.e., NaN or outside the
// interval (0, 1].
//
// The estimated number of bits is imprecise for false positive rates below
// ca. 1e-15.
func Optimize(config Config) (nbits uint64, nhashes int) {
	n := float64(config.Capacity)
	p := config.FPRate

	// NaN fails every ordered comparison, so it has to be rejected explicitly.
	if math.IsNaN(p) || p <= 0 || p > 1 {
		panic("false positive rate for a Bloom filter must be > 0, <= 1")
	}
	if n == 0 {
		// Assume the client wants to add at least one key. This also keeps
		// the bits-per-key ratio computed below finite.
		n = 1
	}

	// The optimal nbits/n is c = -log2(p) / ln(2) for a vanilla Bloom filter.
	c := math.Ceil(-math.Log2(p) / math.Ln2)
	if c < float64(len(correctC)) {
		// Translate to the bits per key that a blocked filter needs.
		c = float64(correctC[int(c)])
	} else {
		// We can't achieve the desired FPR. Just triple the number of bits.
		c *= 3
	}

	// Determine the size limit before converting the estimate to an integer.
	var maxbits uint64 = MaxBits
	if config.MaxBits != 0 && config.MaxBits < maxbits {
		maxbits = config.MaxBits
		if maxbits < BlockBits {
			maxbits = BlockBits
		}
	}
	// Largest multiple of BlockBits that does not exceed the limit.
	limit := maxbits - maxbits%BlockBits

	// Compare in floating point first: converting a float64 that does not
	// fit in a uint64 yields an implementation-dependent value (possibly
	// zero), which would slip past an integer comparison against the limit.
	if want := c * n; want >= float64(limit) {
		nbits = limit
	} else {
		nbits = uint64(want)
		// Round up to a multiple of BlockBits.
		if rem := nbits % BlockBits; rem != 0 {
			nbits += BlockBits - rem
		}
		if nbits > limit {
			nbits = limit
		}
	}

	// The corresponding optimal number of hash functions is k = c * ln(2),
	// where c is now the number of bits per key actually allotted.
	c = float64(nbits) / n
	k := c * math.Ln2
	if k < 1 {
		return nbits, 1
	}

	floorK, ceilK := math.Floor(k), math.Ceil(k)
	if floorK == ceilK {
		return nbits, int(floorK)
	}

	// Try rounding down and up, and keep whichever gives the lower false
	// positive rate. On a tie, prefer fewer hash functions.
	fprFloor, _ := fpRate(c, floorK)
	fprCeil, _ := fpRate(c, ceilK)
	if fprCeil < fprFloor {
		return nbits, int(ceilK)
	}
	return nbits, int(floorK)
}
// correctC translates the bits per key c = m/n of a vanilla Bloom filter
// into the c' needed by a blocked Bloom filter. The slice is indexed by c.
//
// This is Putze et al.'s Table I, extended down to zero.
// For c > 34, the values become huge and are hard to compute.
var correctC = []byte{
	1, 1, 2, 4, 5, // c = 0..4
	6, 7, 8, 9, 10, // c = 5..9
	11, 12, 13, 14, 16, // c = 10..14
	17, 18, 20, 21, 23, // c = 15..19
	25, 26, 28, 30, 32, // c = 20..24
	35, 38, 40, 44, 48, // c = 25..29
	51, 58, 64, 74, 90, // c = 30..34
}
// FPRate estimates the false positive rate of a Bloom filter with nbits bits
// and nhashes hash functions after nkeys distinct keys have been added.
//
// The estimate is zero when nkeys is zero. Otherwise FPRate panics when
// nbits or nhashes is zero.
func FPRate(nkeys, nbits uint64, nhashes int) float64 {
	if nkeys == 0 {
		// With no keys added the estimate is defined as zero.
		return 0
	}

	bitsPerKey := float64(nbits) / float64(nkeys)
	rate, _ := fpRate(bitsPerKey, float64(nhashes))
	return rate
}
// fpRate estimates the false positive rate p of a blocked Bloom filter with
// c bits per key and k hash functions, following Putze et al.'s Equation (3).
// iter reports how many terms were added to the initial one.
//
// fpRate panics when c is zero or when k is zero or negative.
func fpRate(c, k float64) (p float64, iter int) {
	switch {
	case c == 0:
		panic("0 bits per key is too few")
	case k == 0:
		panic("0 hashes is too few")
	case k < 0:
		// A negative k makes every term NaN, so the stopping test of the
		// open-ended loop below would never succeed.
		panic("negative number of hashes")
	}

	// Putze et al.'s Equation (3).
	//
	// The Poisson distribution has a single spike around its mean
	// BlockBits/c that gets slimmer and further away from zero as c tends
	// to zero (the Bloom filter gets more filled). We start at the mean,
	// then add terms left and right of it until their relative contribution
	// drops below ε.
	const ε = 1e-9
	mean := BlockBits / c

	// term is the summand for a block that holds j keys: the Poisson
	// probability of j keys times the FPR of a block with BlockBits/j bits
	// per key.
	term := func(j float64) float64 {
		return math.Exp(logPoisson(mean, j) + logFprBlock(BlockBits/j, k))
	}

	// Ceil to make sure we start at one, not zero.
	i := math.Ceil(mean)
	p = term(i)

	// Walk left of the starting point, stopping before j reaches zero.
	for j := i - 1; j > 0; j-- {
		add := term(j)
		p += add
		iter++
		if add/p < ε {
			break
		}
	}

	// Walk right of the starting point until the terms become negligible.
	for j := i + 1; ; j++ {
		add := term(j)
		p += add
		iter++
		if add/p < ε {
			break
		}
	}

	return p, iter
}
// FPRate estimates the false positive rate of f after nkeys distinct keys
// have been added, based on f's number of bits and number of hash functions.
func (f *Filter) FPRate(nkeys uint64) float64 {
	nbits, nhashes := f.NumBits(), f.k
	return FPRate(nkeys, nbits, nhashes)
}
// logFprBlock returns the natural logarithm of the false positive rate of a
// single block with c bits per key and k hash functions,
// FPR = (1 - exp(-k/c))^k, computed as k * log(1 - exp(-k/c)).
func logFprBlock(c, k float64) float64 {
	inner := math.Exp(-k / c)
	// Log1p(-inner) evaluates log(1 - inner) without losing precision when
	// inner is close to zero.
	return k * math.Log1p(-inner)
}
// logPoisson returns the natural logarithm of the Poisson distribution's
// probability mass function with mean λ at k, i.e.,
// log(λ^k * exp(-λ) / k!) = k*log(λ) - λ - log(k!).
func logPoisson(λ, k float64) float64 {
	// Lgamma(k+1) is log(k!) for non-negative integers k. The sign it also
	// returns is not used.
	logFactorial, _ := math.Lgamma(k + 1)
	logPower := k * math.Log(λ)
	return logPower - λ - logFactorial
}