Skip to content

Commit ae56f6d

Browse files
committed
db: add ability to collect a profile of value retrievals
With the introduction of value separation, retrievals of separated values are more expensive than retrievals of inlined values. Pebble exports an aggregate metric recording the total count of value retrievals that must load a value from an external blob file. This can be used to help attribute performance issues to excessive separated value retrievals, but it doesn't help with understanding what codepaths are performing the retrievals. This commit adds support for collection of a 'separated value retrieval' profile, aggregating the count and byte-total of value retrievals per unique stack trace while profile collection is active. This additional instrumentation adds some overhead and is too expensive to perform by default. Users may call DB.RecordSeparatedValueRetrievals to initiate collection of a profile. All subsequent opened iterators and compactions will record retrievals of separated values into the profile until the caller stops the collection with the invocation of a callback. For now this profile can only be dumped to a text format through a String method. We could export it to a pprof file but it would require a fair bit of work. Close #5535. Informs cockroachdb/cockroach#144715.
1 parent f11535e commit ae56f6d

File tree

10 files changed

+232
-29
lines changed

10 files changed

+232
-29
lines changed

compaction.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3390,6 +3390,7 @@ func (d *DB) compactAndWrite(
33903390
uint64(uintptr(unsafe.Pointer(c))),
33913391
categoryCompaction,
33923392
),
3393+
ValueRetrievalProfile: d.valueRetrievalProfile.Load(),
33933394
}
33943395
if c.version != nil {
33953396
c.iterationState.valueFetcher.Init(&c.version.BlobFiles, d.fileCache, blockReadEnv, suggestedCacheReaders)

db.go

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
"github.com/cockroachdb/errors"
2121
"github.com/cockroachdb/pebble/internal/arenaskl"
2222
"github.com/cockroachdb/pebble/internal/base"
23+
"github.com/cockroachdb/pebble/internal/bytesprofile"
2324
"github.com/cockroachdb/pebble/internal/cache"
2425
"github.com/cockroachdb/pebble/internal/deletepacer"
2526
"github.com/cockroachdb/pebble/internal/inflight"
@@ -554,7 +555,8 @@ type DB struct {
554555

555556
compressionCounters block.CompressionCounters
556557

557-
iterTracker *inflight.Tracker
558+
iterTracker *inflight.Tracker
559+
valueRetrievalProfile atomic.Pointer[bytesprofile.Profile]
558560
}
559561

560562
var _ Reader = (*DB)(nil)
@@ -565,6 +567,27 @@ func (d *DB) TestOnlyWaitForCleaning() {
565567
d.deletePacer.WaitForTesting()
566568
}
567569

570+
// RecordSeparatedValueRetrievals begins collection of stack traces that are
571+
// retrieving separated values. If a profile has already begun, an error is
572+
// returned. If successful, the stop function must be called to stop the
573+
// collection and return the resulting profile.
574+
func (d *DB) RecordSeparatedValueRetrievals() (
575+
stop func() *metrics.ValueRetrievalProfile,
576+
err error,
577+
) {
578+
p := bytesprofile.NewProfile()
579+
swapped := d.valueRetrievalProfile.CompareAndSwap(nil, p)
580+
if !swapped {
581+
return nil, errors.New("separated value retrieval profile is already in progress")
582+
}
583+
return func() *metrics.ValueRetrievalProfile {
584+
if !d.valueRetrievalProfile.CompareAndSwap(p, nil) {
585+
panic(errors.AssertionFailedf("profile already stopped"))
586+
}
587+
return p
588+
}, nil
589+
}
590+
568591
// Set sets the value for the given key. It overwrites any previous value
569592
// for that key; a DB is not a multi-map.
570593
//
@@ -1106,6 +1129,7 @@ func (d *DB) newIter(
11061129
dbi.fc = d.fileCache
11071130
dbi.newIters = newIters
11081131
dbi.newIterRangeKey = newIterRangeKey
1132+
dbi.valueRetrievalProfile = d.valueRetrievalProfile.Load()
11091133
dbi.seqNum = seqNum
11101134
dbi.batchOnlyIter = newIterOpts.batch.batchOnly
11111135
if o != nil {
@@ -1255,6 +1279,7 @@ func (i *Iterator) constructPointIter(
12551279
uint64(uintptr(unsafe.Pointer(i))),
12561280
i.opts.Category,
12571281
),
1282+
ValueRetrievalProfile: i.valueRetrievalProfile,
12581283
}
12591284
if i.readState != nil {
12601285
i.blobValueFetcher.Init(&i.readState.current.BlobFiles, i.fc, readEnv,
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
// Copyright 2025 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2+
// of this source code is governed by a BSD-style license that can be found in
3+
// the LICENSE file.
4+
5+
package bytesprofile
6+
7+
import (
8+
"cmp"
9+
"fmt"
10+
"maps"
11+
"runtime"
12+
"slices"
13+
"strings"
14+
"sync"
15+
16+
"github.com/cockroachdb/pebble/internal/humanize"
17+
)
18+
19+
// Profile is a profile mapping stack traces to a cumulative count and byte sum.
20+
type Profile struct {
21+
mu sync.Mutex
22+
samples map[stack]aggSamples
23+
}
24+
25+
// NewProfile creates a new profile.
26+
func NewProfile() *Profile {
27+
return &Profile{samples: make(map[stack]aggSamples)}
28+
}
29+
30+
type stack [20]uintptr
31+
32+
// trimmed returns the non-zero stack frames of stack.
33+
func (s stack) trimmed() []uintptr {
34+
for i := range s {
35+
if s[i] == 0 {
36+
return s[:i]
37+
}
38+
}
39+
return s[:]
40+
}
41+
42+
type aggSamples struct {
43+
bytes int64
44+
count int64
45+
}
46+
47+
// Record records a sample of the given number of bytes with the calling stack trace.
48+
func (p *Profile) Record(bytes int64) {
49+
var stack stack
50+
runtime.Callers(2, stack[:])
51+
p.mu.Lock()
52+
defer p.mu.Unlock()
53+
curr := p.samples[stack]
54+
curr.bytes += bytes
55+
curr.count++
56+
p.samples[stack] = curr
57+
}
58+
59+
// TODO(jackson): We could add the ability to export the profile to a pprof file
60+
// (which internally is just a protocol buffer). Ideally the Go standard library
61+
// would provide facilities for this (e.g., golang/go#18454). The runtime/pprof
62+
// library comes close with its definition of custom profiles, but they only
63+
// support profiles tracking in-use resources.
64+
65+
// String returns a string representation of the stacks captured by the profile.
66+
func (p *Profile) String() string {
67+
p.mu.Lock()
68+
defer p.mu.Unlock()
69+
// Sort the stacks by bytes in descending order.
70+
uniqueStacks := slices.SortedFunc(maps.Keys(p.samples), func(a, b stack) int {
71+
return -cmp.Compare(p.samples[a].bytes, p.samples[b].bytes)
72+
})
73+
var sb strings.Builder
74+
for i, stack := range uniqueStacks {
75+
if i > 0 {
76+
sb.WriteString("\n")
77+
}
78+
fmt.Fprintf(&sb, "%d: Count: %d (%s), Bytes: %d (%s)\n", i,
79+
p.samples[stack].count, humanize.Count.Int64(p.samples[stack].count),
80+
p.samples[stack].bytes, humanize.Bytes.Int64(p.samples[stack].bytes))
81+
frames := runtime.CallersFrames(stack.trimmed())
82+
for {
83+
frame, more := frames.Next()
84+
fmt.Fprintf(&sb, " %s\n %s:%d\n", frame.Function, frame.File, frame.Line)
85+
if !more {
86+
break
87+
}
88+
}
89+
}
90+
return sb.String()
91+
}
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
// Copyright 2025 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2+
// of this source code is governed by a BSD-style license that can be found in
3+
// the LICENSE file.
4+
5+
package bytesprofile
6+
7+
import (
8+
"testing"
9+
10+
"github.com/stretchr/testify/require"
11+
)
12+
13+
func TestBytesProfile(t *testing.T) {
14+
p := NewProfile()
15+
for i := 0; i < 100; i++ {
16+
p.Record(int64(i))
17+
}
18+
for i := 0; i < 10; i++ {
19+
p.Record(11 * int64(i))
20+
}
21+
s := p.String()
22+
require.Contains(t, s, "0: Count: 100 (100), Bytes: 4950 (4.8KB)")
23+
require.Contains(t, s, "1: Count: 10 (10), Bytes: 495 (495B)")
24+
t.Log(s)
25+
}

iterator.go

Lines changed: 23 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ import (
1515
"github.com/cockroachdb/errors"
1616
"github.com/cockroachdb/pebble/internal/base"
1717
"github.com/cockroachdb/pebble/internal/bytealloc"
18+
"github.com/cockroachdb/pebble/internal/bytesprofile"
1819
"github.com/cockroachdb/pebble/internal/humanize"
1920
"github.com/cockroachdb/pebble/internal/inflight"
2021
"github.com/cockroachdb/pebble/internal/invariants"
@@ -260,11 +261,12 @@ type Iterator struct {
260261
externalIter *externalIterState
261262
// Following fields used when constructing an iterator stack, eg, in Clone
262263
// and SetOptions or when re-fragmenting a batch's range keys/range dels.
263-
fc *fileCacheHandle
264-
newIters tableNewIters
265-
newIterRangeKey keyspanimpl.TableNewSpanIter
266-
lazyCombinedIter lazyCombinedIter
267-
seqNum base.SeqNum
264+
fc *fileCacheHandle
265+
newIters tableNewIters
266+
newIterRangeKey keyspanimpl.TableNewSpanIter
267+
valueRetrievalProfile *bytesprofile.Profile
268+
lazyCombinedIter lazyCombinedIter
269+
seqNum base.SeqNum
268270
// batch is non-nil if this Iterator includes an indexed batch. Batch
269271
// contains all the state pertaining to iterating over the indexed batch.
270272
// The iteratorBatchState struct is bundled within the iterAlloc struct to
@@ -2889,21 +2891,22 @@ func (i *Iterator) CloneWithContext(ctx context.Context, opts CloneOptions) (*It
28892891
buf := newIterAlloc()
28902892
dbi := &buf.dbi
28912893
*dbi = Iterator{
2892-
ctx: ctx,
2893-
opts: *opts.IterOptions,
2894-
alloc: buf,
2895-
merge: i.merge,
2896-
comparer: i.comparer,
2897-
readState: readState,
2898-
version: vers,
2899-
keyBuf: buf.keyBuf,
2900-
prefixOrFullSeekKey: buf.prefixOrFullSeekKey,
2901-
boundsBuf: buf.boundsBuf,
2902-
fc: i.fc,
2903-
newIters: i.newIters,
2904-
newIterRangeKey: i.newIterRangeKey,
2905-
seqNum: i.seqNum,
2906-
tracker: i.tracker,
2894+
ctx: ctx,
2895+
opts: *opts.IterOptions,
2896+
alloc: buf,
2897+
merge: i.merge,
2898+
comparer: i.comparer,
2899+
readState: readState,
2900+
version: vers,
2901+
keyBuf: buf.keyBuf,
2902+
prefixOrFullSeekKey: buf.prefixOrFullSeekKey,
2903+
boundsBuf: buf.boundsBuf,
2904+
fc: i.fc,
2905+
newIters: i.newIters,
2906+
newIterRangeKey: i.newIterRangeKey,
2907+
valueRetrievalProfile: i.valueRetrievalProfile,
2908+
seqNum: i.seqNum,
2909+
tracker: i.tracker,
29072910
}
29082911
if i.tracker != nil && !dbi.opts.ExemptFromTracking {
29092912
dbi.trackerHandle = i.tracker.Start()

iterator_test.go

Lines changed: 41 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1415,6 +1415,39 @@ func TestIteratorStatsMerge(t *testing.T) {
14151415
require.Equal(t, expected, s)
14161416
}
14171417

1418+
func TestIteratorValueRetrievalProfile(t *testing.T) {
1419+
opts := &Options{}
1420+
opts.FormatMajorVersion = internalFormatNewest
1421+
opts.Experimental.ValueSeparationPolicy = func() ValueSeparationPolicy {
1422+
return ValueSeparationPolicy{
1423+
Enabled: true,
1424+
MinimumSize: 1,
1425+
MaxBlobReferenceDepth: 5,
1426+
}
1427+
}
1428+
d := newTestkeysDatabase(t, opts, testkeys.Alpha(2), rand.New(rand.NewPCG(1, 1)))
1429+
defer func() { require.NoError(t, d.Close()) }()
1430+
require.NoError(t, d.Flush())
1431+
stop, err := d.RecordSeparatedValueRetrievals()
1432+
require.NoError(t, err)
1433+
{
1434+
_, err := d.RecordSeparatedValueRetrievals()
1435+
require.Error(t, err, "should not be able to start a second profile")
1436+
}
1437+
it, err := d.NewIter(nil)
1438+
require.NoError(t, err)
1439+
defer it.Close()
1440+
require.True(t, it.SeekGE([]byte("a")))
1441+
val, err := it.ValueAndErr()
1442+
require.NoError(t, err)
1443+
require.NotNil(t, val)
1444+
p := stop()
1445+
require.NotNil(t, p)
1446+
s := p.String()
1447+
require.NotEmpty(t, s)
1448+
t.Log(s)
1449+
}
1450+
14181451
// TestSetOptionsEquivalence tests equivalence between SetOptions to mutate an
14191452
// iterator and constructing a new iterator with NewIter. The long-lived
14201453
// iterator and the new iterator should surface identical iterator states.
@@ -1428,7 +1461,7 @@ func TestSetOptionsEquivalence(t *testing.T) {
14281461
func testSetOptionsEquivalence(t *testing.T, seed uint64) {
14291462
rng := rand.New(rand.NewPCG(seed, seed))
14301463
ks := testkeys.Alpha(2)
1431-
d := newTestkeysDatabase(t, ks, rng)
1464+
d := newTestkeysDatabase(t, nil /* dbOpts */, ks, rng)
14321465
defer func() { require.NoError(t, d.Close()) }()
14331466

14341467
var o IterOptions
@@ -1571,12 +1604,13 @@ func iterOptionsString(o *IterOptions) string {
15711604
return buf.String()
15721605
}
15731606

1574-
func newTestkeysDatabase(t *testing.T, ks testkeys.Keyspace, rng *rand.Rand) *DB {
1575-
dbOpts := &Options{
1576-
Comparer: testkeys.Comparer,
1577-
FS: vfs.NewMem(),
1578-
Logger: panicLogger{},
1607+
func newTestkeysDatabase(t *testing.T, dbOpts *Options, ks testkeys.Keyspace, rng *rand.Rand) *DB {
1608+
if dbOpts == nil {
1609+
dbOpts = &Options{}
15791610
}
1611+
dbOpts.Comparer = testkeys.Comparer
1612+
dbOpts.FS = vfs.NewMem()
1613+
dbOpts.Logger = panicLogger{}
15801614
dbOpts.randomizeForTesting(t)
15811615
d, err := Open("", dbOpts)
15821616
require.NoError(t, err)
@@ -2272,7 +2306,7 @@ func TestIteratorSeekPrefixGERandomized(t *testing.T) {
22722306
t.Logf("seed: %d", seed)
22732307
rng := rand.New(rand.NewPCG(seed, seed))
22742308
ks := testkeys.Alpha(2)
2275-
d := newTestkeysDatabase(t, ks, rng)
2309+
d := newTestkeysDatabase(t, nil /* dbOpts */, ks, rng)
22762310
defer func() { require.NoError(t, d.Close()) }()
22772311

22782312
// Scan through the keys and construct a map from unique prefix to the

metrics/value_retrieval_profile.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
// Copyright 2025 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2+
// of this source code is governed by a BSD-style license that can be found in
3+
// the LICENSE file.
4+
5+
package metrics
6+
7+
import "github.com/cockroachdb/pebble/internal/bytesprofile"
8+
9+
// ValueRetrievalProfile is a profile of separated value retrievals.
10+
type ValueRetrievalProfile = bytesprofile.Profile

scan_internal.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -201,7 +201,8 @@ func (d *DB) newInternalIter(
201201
if d.iterTracker != nil {
202202
dbi.trackerHandle = d.iterTracker.Start()
203203
}
204-
dbi.blobValueFetcher.Init(&vers.BlobFiles, d.fileCache, block.ReadEnv{},
204+
dbi.blobValueFetcher.Init(&vers.BlobFiles, d.fileCache,
205+
block.ReadEnv{ValueRetrievalProfile: d.valueRetrievalProfile.Load()},
205206
blob.SuggestedCachedReaders(vers.MaxReadAmp()))
206207

207208
dbi.opts = *o

sstable/blob/fetcher.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -340,6 +340,14 @@ func (cr *cachedReader) GetUnsafeValue(
340340
cr.currentValueBlock.loaded = true
341341
}
342342

343+
// If the ReadEnv is configured with a value-retrieval profile, record the
344+
// value retrieval to it. This is used to allow runtime profiling of value
345+
// retrievals, providing observability into which codepaths are responsible
346+
// for the comparatively expensive value retrievals.
347+
if env.ValueRetrievalProfile != nil {
348+
env.ValueRetrievalProfile.Record(int64(vh.ValueLen))
349+
}
350+
343351
// Convert the ValueID to an index into the block's values. When a blob file
344352
// is first constructed, the ValueID == the index. However when a blob file
345353
// is rewritten, multiple blocks from the original blob file may be combined

sstable/block/block.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ import (
1919
"github.com/cockroachdb/errors"
2020
"github.com/cockroachdb/pebble/internal/base"
2121
"github.com/cockroachdb/pebble/internal/bitflip"
22+
"github.com/cockroachdb/pebble/internal/bytesprofile"
2223
"github.com/cockroachdb/pebble/internal/cache"
2324
"github.com/cockroachdb/pebble/internal/crc"
2425
"github.com/cockroachdb/pebble/internal/invariants"
@@ -284,6 +285,10 @@ type ReadEnv struct {
284285
// more details.
285286
ReportCorruptionFn func(base.ObjectInfo, error) error
286287
ReportCorruptionArg base.ObjectInfo
288+
289+
// ValueRetrievalProfile, if non-nil, is used to record stack traces of
290+
// retrievals of separated values. It's only used by the blob.ValueFetcher.
291+
ValueRetrievalProfile *bytesprofile.Profile
287292
}
288293

289294
// BlockServedFromCache updates the stats when a block was found in the cache.

0 commit comments

Comments
 (0)