Skip to content

Commit 0c357d6

Browse files
committed
ewma: add a per-byte EWMA estimator
This will be used to estimate compression ratio of blocks based on the compression ratios of recently sampled blocks.
1 parent 34c1017 commit 0c357d6

File tree

2 files changed

+163
-0
lines changed

2 files changed

+163
-0
lines changed

internal/ewma/ewma_bytes.go

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
// Copyright 2025 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2+
// of this source code is governed by a BSD-style license that can be found in
3+
// the LICENSE file.
4+
5+
package ewma
6+
7+
import (
8+
"math"
9+
10+
"github.com/cockroachdb/pebble/internal/invariants"
11+
)
12+
13+
// Bytes estimates an arbitrary per-block value (for example a compression
// ratio) over a byte stream in which only some blocks are sampled.
//
// The stream is split into blocks of varying size. Every byte of a sampled
// block carries that block's value, and the estimate is a per-byte
// exponentially weighted moving average (EWMA) of those values. With pos_i and
// val_i the position and value of each byte for which we have data, the
// estimate at position p is:
//
//	Sum_i val_i*(1-alpha)^(p-pos_i)
//	-------------------------------
//	  Sum_i (1-alpha)^(p-pos_i)
//
// The zero value produces NaN estimates until Init has been called and at
// least one block has been sampled.
type Bytes struct {
	// alpha is the per-byte decay parameter; a byte that is n bytes old has
	// relative weight (1-alpha)^n. It is derived from the half-life in Init.
	alpha float64
	// sum is the numerator of the estimate: the weighted sum of the sampled
	// values, as of the end of the last sampled block.
	sum float64
	// totalWeight is the denominator of the estimate: the sum of the weights,
	// as of the end of the last sampled block.
	totalWeight float64
	// gap is the number of unsampled bytes reported through NoSample since the
	// last sampled block. The corresponding decay is applied lazily by the
	// next SampledBlock call.
	gap int64
}
33+
34+
// Init the estimator such that a block sampled <half-life> bytes ago has half
35+
// the weight compared to a block sampled now.
36+
//
37+
// Intuitively, half of the estimate comes from the values within the half-life
38+
// window; and 75% of the estimate comes from values within 2x half-life.
39+
func (b *Bytes) Init(halfLife int64) {
40+
*b = Bytes{}
41+
// Exact value is 1 - 2^(-1/H). The straightforward calculation suffers from
42+
// precision loss as H grows (we are subtracting two nearly equal numbers). We
43+
// use a numerically stable alternative:
44+
// 1 - 2^(-1/H) = 1 - e^(-ln(2)/H) = -expm1(-ln(2)/H)
45+
b.alpha = -math.Expm1(-math.Ln2 / float64(halfLife))
46+
}
47+
48+
// Estimate returns the current estimate of the value, based on the recent
49+
// SampledBlock() calls. Returns NaN if no blocks have been sampled yet.
50+
func (b *Bytes) Estimate() float64 {
51+
return b.sum / b.totalWeight
52+
}
53+
54+
// NoSample informs the estimator that a block of the given length was not
55+
// sampled.
56+
func (b *Bytes) NoSample(numBytes int64) {
57+
if numBytes < 0 {
58+
if invariants.Enabled {
59+
panic("invalid numBytes")
60+
}
61+
return
62+
}
63+
// It would be equivalent (but less efficient) to multiply both sum and
64+
// totalWeight by (1-alpha)^numBytes instead of keeping track of the gap.
65+
b.gap += numBytes
66+
}
67+
68+
// SampledBlock informs the estimator that a block of the given length was
69+
// sampled.
70+
func (b *Bytes) SampledBlock(numBytes int64, value float64) {
71+
if numBytes < 1 {
72+
if invariants.Enabled {
73+
panic("invalid numBytes")
74+
}
75+
return
76+
}
77+
decay := b.decay(b.gap + numBytes)
78+
b.sum *= decay
79+
b.totalWeight *= decay
80+
b.gap = 0
81+
82+
// The sum of weights for the new bytes is:
83+
//
84+
// 1 - (1 - alpha)^numBytes
85+
// Sum (1 - alpha)^i = ------------------------
86+
// 0≤i<numBytes alpha
87+
//
88+
// We can drop the 1/alpha factor from all weights (it cancels out).
89+
w := 1 - b.decay(numBytes)
90+
b.sum += value * w
91+
b.totalWeight += w
92+
}
93+
94+
// decay returns (1 - alpha)^n for the given n.
95+
func (b *Bytes) decay(n int64) float64 {
96+
// (1 - alpha)^n = e^(n * log(1 - alpha)). Using Exp and Log1p is stable for
97+
// very small alpha (unlike math.Pow).
98+
return math.Exp(float64(n) * math.Log1p(-b.alpha))
99+
}

internal/ewma/ewma_bytes_test.go

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
// Copyright 2025 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2+
// of this source code is governed by a BSD-style license that can be found in
3+
// the LICENSE file.
4+
5+
package ewma
6+
7+
import (
8+
"fmt"
9+
"math"
10+
"testing"
11+
12+
"github.com/stretchr/testify/require"
13+
)
14+
15+
func TestBytes(t *testing.T) {
16+
const eps = 0.1
17+
var b Bytes
18+
19+
b.Init(1000)
20+
21+
require.True(t, math.IsNaN(b.Estimate()))
22+
23+
b.SampledBlock(100, 1.0)
24+
require.InEpsilon(t, 1.0, b.Estimate(), eps)
25+
26+
b.NoSample(10000)
27+
require.InEpsilon(t, 1.0, b.Estimate(), eps)
28+
b.SampledBlock(100, 10.0)
29+
require.InEpsilon(t, 10.0, b.Estimate(), eps)
30+
31+
b.NoSample(10000)
32+
b.SampledBlock(10, 0.0)
33+
b.SampledBlock(10, 10.0)
34+
b.SampledBlock(10, 0.0)
35+
b.SampledBlock(10, 10.0)
36+
require.InEpsilon(t, 5.0, b.Estimate(), eps)
37+
b.NoSample(1000)
38+
b.SampledBlock(10, 0.0)
39+
require.InEpsilon(t, 3.3, b.Estimate(), eps)
40+
41+
b.Init(1000)
42+
b.SampledBlock(1, 0.0)
43+
b.NoSample(1000)
44+
b.SampledBlock(1, 1.0)
45+
// The byte 1 half-life ago matters 1/2 as much, so the estimate is 2/3.
46+
require.InEpsilon(t, 0.66, b.Estimate(), eps)
47+
}
48+
49+
// TestBytesHalfLife verifies that the alpha and decay calculations are accurate.
50+
func TestBytesHalfLife(t *testing.T) {
51+
for _, n := range []int64{1, 2, 3, 10, 100, 1000, 10_000, 1 << 20, 128 << 20, 1 << 30} {
52+
t.Run(fmt.Sprint(n), func(t *testing.T) {
53+
var b Bytes
54+
b.Init(n)
55+
const eps = 1e-8
56+
require.InEpsilon(t, 1.0/2, b.decay(n), eps)
57+
require.InDelta(t, 1.0/4, b.decay(2*n), eps)
58+
require.InDelta(t, 1.0/8, b.decay(3*n), eps)
59+
require.InDelta(t, 1.0/16, b.decay(4*n), eps)
60+
require.InDelta(t, 1.0/32, b.decay(5*n), eps)
61+
require.InDelta(t, 1.0/64, b.decay(6*n), eps)
62+
})
63+
}
64+
}

0 commit comments

Comments
 (0)