Skip to content

Commit ab9ffc9

Browse files
committed
block,cache: trace and stats for wait duration (for concurrent reads)
cache.Handle.GetWithReadHandle returns a non-zero wait duration on a cache miss, which resulted in the request waiting for a turn to do the read, or to wait for the block to be read by another reader. Tracing now happens on high wait duration. Wait duration is included in iterator stats, since this is a block miss. And wait duration is included in the read duration when the block was read by the caller after waiting.
1 parent 1f091ae commit ab9ffc9

File tree

9 files changed

+140
-93
lines changed

9 files changed

+140
-93
lines changed

db_test.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@ import (
3030
"github.com/cockroachdb/pebble/internal/testutils"
3131
"github.com/cockroachdb/pebble/objstorage/objstorageprovider"
3232
"github.com/cockroachdb/pebble/sstable"
33-
"github.com/cockroachdb/pebble/sstable/block"
3433
"github.com/cockroachdb/pebble/vfs"
3534
"github.com/cockroachdb/pebble/vfs/errorfs"
3635
"github.com/cockroachdb/pebble/wal"
@@ -1493,7 +1492,7 @@ func (t *testTracer) IsTracingEnabled(ctx context.Context) bool {
14931492
}
14941493

14951494
func TestTracing(t *testing.T) {
1496-
defer block.DeterministicReadBlockDurationForTesting()()
1495+
defer base.DeterministicReadDurationForTesting()()
14971496

14981497
var tracer testTracer
14991498
buf := &tracer.buf

internal/base/stopwatch.go

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
// Copyright 2025 The LevelDB-Go and Pebble Authors. All rights reserved. Use
2+
// of this source code is governed by a BSD-style license that can be found in
3+
// the LICENSE file.
4+
5+
package base
6+
7+
import (
8+
"time"
9+
10+
"github.com/cockroachdb/crlib/crtime"
11+
)
12+
13+
// DeterministicReadDurationForTesting is for tests that want a deterministic
14+
// value of the time to read (that is not in the cache). The return value is a
15+
// function that must be called before the test exits.
16+
func DeterministicReadDurationForTesting() func() {
17+
drbdForTesting := deterministicReadDurationForTesting
18+
deterministicReadDurationForTesting = true
19+
return func() {
20+
deterministicReadDurationForTesting = drbdForTesting
21+
}
22+
}
23+
24+
var deterministicReadDurationForTesting = false
25+
26+
type deterministicStopwatchForTesting struct {
27+
startTime crtime.Mono
28+
}
29+
30+
func MakeStopwatch() deterministicStopwatchForTesting {
31+
return deterministicStopwatchForTesting{startTime: crtime.NowMono()}
32+
}
33+
34+
func (w deterministicStopwatchForTesting) Stop() time.Duration {
35+
dur := w.startTime.Elapsed()
36+
if deterministicReadDurationForTesting {
37+
dur = SlowReadTracingThreshold
38+
}
39+
return dur
40+
}
41+
42+
// TODO(sumeer): should the threshold be configurable.
43+
const SlowReadTracingThreshold = 5 * time.Millisecond

internal/cache/cache.go

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -280,7 +280,8 @@ func (c *Handle) Get(fileNum base.DiskFileNum, offset uint64) *Value {
280280
// This method can block before returning since multiple concurrent gets for
281281
// the same cache value will take turns getting a ReadHandle, which represents
282282
// permission to do the read. This blocking respects context cancellation, in
283-
// which case an error is returned (and not a valid ReadHandle).
283+
// which case an error is returned (and not a valid ReadHandle). The
284+
// waitDuration returns the duration of the blocking.
284285
//
285286
// When blocking, the errorDuration return value can be non-zero and is
286287
// populated with the total duration that other readers that observed an error
@@ -294,18 +295,27 @@ func (c *Handle) Get(fileNum base.DiskFileNum, offset uint64) *Value {
294295
// in a valid Handle being returned. This is a case where cacheHit=false.
295296
func (c *Handle) GetWithReadHandle(
296297
ctx context.Context, fileNum base.DiskFileNum, offset uint64,
297-
) (cv *Value, rh ReadHandle, errorDuration time.Duration, cacheHit bool, err error) {
298+
) (
299+
cv *Value,
300+
rh ReadHandle,
301+
errorDuration time.Duration,
302+
waitDuration time.Duration,
303+
cacheHit bool,
304+
err error,
305+
) {
298306
k := makeKey(c.id, fileNum, offset)
299307
cv, re := c.cache.getShard(k).getWithMaybeReadEntry(k, true /* desireReadEntry */)
300308
if cv != nil {
301-
return cv, ReadHandle{}, 0, true, nil
309+
return cv, ReadHandle{}, 0, 0, true, nil
302310
}
311+
waitStopwatch := base.MakeStopwatch()
303312
cv, errorDuration, err = re.waitForReadPermissionOrHandle(ctx)
313+
waitDuration = waitStopwatch.Stop()
304314
if err != nil || cv != nil {
305315
re.unrefAndTryRemoveFromMap()
306-
return cv, ReadHandle{}, errorDuration, false, err
316+
return cv, ReadHandle{}, errorDuration, waitDuration, false, err
307317
}
308-
return nil, ReadHandle{entry: re}, errorDuration, false, nil
318+
return nil, ReadHandle{entry: re}, errorDuration, waitDuration, false, nil
309319
}
310320

311321
// Set sets the cache value for the specified file and offset, overwriting an

internal/cache/read_shard_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -254,7 +254,7 @@ func TestReadShardConcurrent(t *testing.T) {
254254
for _, r := range differentReaders {
255255
for j := 0; j < r.numReaders; j++ {
256256
go func(r *testSyncReaders, index int) {
257-
v, rh, _, _, err := r.handle.GetWithReadHandle(context.Background(), r.fileNum, r.offset)
257+
v, rh, _, _, _, err := r.handle.GetWithReadHandle(context.Background(), r.fileNum, r.offset)
258258
require.NoError(t, err)
259259
if v != nil {
260260
require.Equal(t, r.val, v.RawBuffer())

metrics_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -195,7 +195,7 @@ func TestMetrics(t *testing.T) {
195195
if runtime.GOARCH == "386" {
196196
t.Skip("skipped on 32-bit due to slightly varied output")
197197
}
198-
defer block.DeterministicReadBlockDurationForTesting()()
198+
defer base.DeterministicReadDurationForTesting()()
199199

200200
var d *DB
201201
var iters map[string]*Iterator

sstable/block/block.go

Lines changed: 34 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@ import (
1515
"unsafe"
1616

1717
"github.com/cespare/xxhash/v2"
18-
"github.com/cockroachdb/crlib/crtime"
1918
"github.com/cockroachdb/crlib/fifo"
2019
"github.com/cockroachdb/errors"
2120
"github.com/cockroachdb/pebble/internal/base"
@@ -28,6 +27,7 @@ import (
2827
"github.com/cockroachdb/pebble/objstorage/objstorageprovider"
2928
"github.com/cockroachdb/pebble/objstorage/objstorageprovider/objiotracing"
3029
"github.com/cockroachdb/pebble/sstable/block/blockkind"
30+
"github.com/cockroachdb/redact"
3131
)
3232

3333
// Kind is a convenience alias.
@@ -371,18 +371,23 @@ func (r *Reader) Read(
371371
return CacheBufferHandle(cv), nil
372372
}
373373
}
374-
value, err := r.doRead(ctx, env, readHandle, bh, kind, initBlockMetadataFn)
374+
value, err := r.doRead(ctx, env, readHandle, bh, kind, 0, initBlockMetadataFn)
375375
if err != nil {
376376
return BufferHandle{}, env.maybeReportCorruption(err)
377377
}
378378
return value.MakeHandle(), nil
379379
}
380380

381-
cv, crh, errorDuration, hit, err := r.opts.CacheOpts.CacheHandle.GetWithReadHandle(
381+
cv, crh, errorDuration, waitDuration, hit, err := r.opts.CacheOpts.CacheHandle.GetWithReadHandle(
382382
ctx, r.opts.CacheOpts.FileNum, bh.Offset)
383-
if errorDuration > 5*time.Millisecond && r.opts.LoggerAndTracer.IsTracingEnabled(ctx) {
383+
const slowDur = 5 * time.Millisecond
384+
if waitDuration > slowDur && r.opts.LoggerAndTracer.IsTracingEnabled(ctx) {
384385
r.opts.LoggerAndTracer.Eventf(
385-
ctx, "waited for turn when %s time wasted by failed reads", errorDuration.String())
386+
ctx, "waited for reading turn for %v", waitDuration)
387+
}
388+
if errorDuration > slowDur && r.opts.LoggerAndTracer.IsTracingEnabled(ctx) {
389+
r.opts.LoggerAndTracer.Eventf(
390+
ctx, "failed reads by others wasted %v", errorDuration)
386391
}
387392
// TODO(sumeer): consider tracing when waited longer than some duration
388393
// for turn to do the read.
@@ -400,11 +405,15 @@ func (r *Reader) Read(
400405
}
401406
if hit {
402407
recordCacheHit(ctx, env, readHandle, bh, kind)
408+
} else {
409+
// The block was not in the cache, and someone else read it, but this
410+
// caller had to wait for that read. So account for it in the stats.
411+
env.BlockRead(kind, bh.Length, waitDuration)
403412
}
404413
return CacheBufferHandle(cv), nil
405414
}
406415

407-
value, err := r.doRead(ctx, env, readHandle, bh, kind, initBlockMetadataFn)
416+
value, err := r.doRead(ctx, env, readHandle, bh, kind, waitDuration, initBlockMetadataFn)
408417
if err != nil {
409418
crh.SetReadError(err)
410419
return BufferHandle{}, env.maybeReportCorruption(err)
@@ -423,9 +432,6 @@ func recordCacheHit(
423432
env.BlockServedFromCache(kind, bh.Length)
424433
}
425434

426-
// TODO(sumeer): should the threshold be configurable.
427-
const slowReadTracingThreshold = 5 * time.Millisecond
428-
429435
// doRead is a helper for Read that does the read, checksum check,
430436
// decompression, and returns either a Value or an error.
431437
func (r *Reader) doRead(
@@ -434,6 +440,7 @@ func (r *Reader) doRead(
434440
readHandle objstorage.ReadHandle,
435441
bh Handle,
436442
kind Kind,
443+
waitBeforeReadDuration time.Duration,
437444
initBlockMetadataFn func(*Metadata, []byte) error,
438445
) (Value, error) {
439446
ctx = objiotracing.WithBlockKind(ctx, kind)
@@ -447,21 +454,28 @@ func (r *Reader) doRead(
447454
}
448455

449456
compressed := Alloc(int(bh.Length+TrailerLen), env.BufferPool)
450-
readStopwatch := makeStopwatch()
457+
readStopwatch := base.MakeStopwatch()
451458
var err error
452459
if readHandle != nil {
453460
err = readHandle.ReadAt(ctx, compressed.BlockData(), int64(bh.Offset))
454461
} else {
455462
err = r.readable.ReadAt(ctx, compressed.BlockData(), int64(bh.Offset))
456463
}
457-
readDuration := readStopwatch.stop()
464+
readDuration := readStopwatch.Stop()
458465
// Call IsTracingEnabled to avoid the allocations of boxing integers into an
459466
// interface{}, unless necessary.
460-
if readDuration >= slowReadTracingThreshold && r.opts.LoggerAndTracer.IsTracingEnabled(ctx) {
467+
if (readDuration+waitBeforeReadDuration) >= base.SlowReadTracingThreshold &&
468+
r.opts.LoggerAndTracer.IsTracingEnabled(ctx) {
461469
_, file1, line1, _ := runtime.Caller(1)
462470
_, file2, line2, _ := runtime.Caller(2)
463-
r.opts.LoggerAndTracer.Eventf(ctx, "reading block of %d bytes took %s (fileNum=%s; %s/%s:%d -> %s/%s:%d)",
464-
int(bh.Length+TrailerLen), readDuration.String(),
471+
var waitDurStr string
472+
if waitBeforeReadDuration > 0 {
473+
waitDurStr = fmt.Sprintf("+ %v wait ", waitBeforeReadDuration)
474+
}
475+
r.opts.LoggerAndTracer.Eventf(
476+
ctx, "reading block kind %s of %d bytes took %v %s(fileNum=%s; %s/%s:%d -> %s/%s:%d)",
477+
redact.SafeString(kind.String()),
478+
int(bh.Length+TrailerLen), readDuration, redact.SafeString(waitDurStr),
465479
r.opts.CacheOpts.FileNum,
466480
filepath.Base(filepath.Dir(file2)), filepath.Base(file2), line2,
467481
filepath.Base(filepath.Dir(file1)), filepath.Base(file1), line1)
@@ -470,7 +484,7 @@ func (r *Reader) doRead(
470484
compressed.Release()
471485
return Value{}, err
472486
}
473-
env.BlockRead(kind, bh.Length, readDuration)
487+
env.BlockRead(kind, bh.Length, readDuration+waitBeforeReadDuration)
474488
if err = ValidateChecksum(r.checksumType, compressed.BlockData(), bh); err != nil {
475489
compressed.Release()
476490
err = errors.Wrapf(err, "pebble: file %s", r.opts.CacheOpts.FileNum)
@@ -557,51 +571,22 @@ func ReadRaw(
557571
return nil, base.CorruptionErrorf("pebble: invalid file %s (file size is too small)", errors.Safe(fileNum))
558572
}
559573

560-
readStopwatch := makeStopwatch()
574+
readStopwatch := base.MakeStopwatch()
561575
var err error
562576
if readHandle != nil {
563577
err = readHandle.ReadAt(ctx, buf, off)
564578
} else {
565579
err = f.ReadAt(ctx, buf, off)
566580
}
567-
readDuration := readStopwatch.stop()
581+
readDuration := readStopwatch.Stop()
568582
// Call IsTracingEnabled to avoid the allocations of boxing integers into an
569583
// interface{}, unless necessary.
570-
if readDuration >= slowReadTracingThreshold && logger.IsTracingEnabled(ctx) {
571-
logger.Eventf(ctx, "reading footer of %d bytes took %s",
572-
len(buf), readDuration.String())
584+
if readDuration >= base.SlowReadTracingThreshold && logger.IsTracingEnabled(ctx) {
585+
logger.Eventf(ctx, "reading footer of %d bytes took %v",
586+
len(buf), readDuration)
573587
}
574588
if err != nil {
575589
return nil, errors.Wrap(err, "pebble: invalid file (could not read footer)")
576590
}
577591
return buf, nil
578592
}
579-
580-
// DeterministicReadBlockDurationForTesting is for tests that want a
581-
// deterministic value of the time to read a block (that is not in the cache).
582-
// The return value is a function that must be called before the test exits.
583-
func DeterministicReadBlockDurationForTesting() func() {
584-
drbdForTesting := deterministicReadBlockDurationForTesting
585-
deterministicReadBlockDurationForTesting = true
586-
return func() {
587-
deterministicReadBlockDurationForTesting = drbdForTesting
588-
}
589-
}
590-
591-
var deterministicReadBlockDurationForTesting = false
592-
593-
type deterministicStopwatchForTesting struct {
594-
startTime crtime.Mono
595-
}
596-
597-
func makeStopwatch() deterministicStopwatchForTesting {
598-
return deterministicStopwatchForTesting{startTime: crtime.NowMono()}
599-
}
600-
601-
func (w deterministicStopwatchForTesting) stop() time.Duration {
602-
dur := w.startTime.Elapsed()
603-
if deterministicReadBlockDurationForTesting {
604-
dur = slowReadTracingThreshold
605-
}
606-
return dur
607-
}

sstable/block/category_stats.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,10 @@ type CategoryStats struct {
161161
// cache.
162162
BlockBytesInCache uint64
163163
// BlockReadDuration is the total duration to read the bytes not in the
164-
// cache, i.e., BlockBytes-BlockBytesInCache.
164+
// cache, i.e., BlockBytes-BlockBytesInCache. When multiple concurrent
165+
// readers wait for each other, and only one does the read, this will
166+
// account for the total time spent waiting plus potentially reading for all
167+
// those readers, so it can over count. Such over counting should be rare.
165168
BlockReadDuration time.Duration
166169
}
167170

0 commit comments

Comments
 (0)