
Commit 0c63ea8

metrics: more granular block cache metrics
We improve the cache metrics in two ways:
 - we break down the misses by the type of block (for the most important block types);
 - we also show miss rates across the last 10 minutes and the last hour.

In follow-up work, I will attempt to plumb the LSM level as well.
1 parent 96bb758 · commit 0c63ea8

21 files changed, +581 -204 lines changed
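For context, here is a hedged sketch (not part of the commit) of how a caller might consume the new metrics. It assumes the Metrics, HitsAndMisses, and Category types added in internal/cache/metrics.go below, plus an fmt import; the reportCacheMetrics helper itself is hypothetical.

	// reportCacheMetrics is a hypothetical helper showing how the granular
	// metrics returned by (*Cache).Metrics() could be read.
	func reportCacheMetrics(c *Cache) {
		m := c.Metrics()

		// Lifetime totals, aggregated across all block categories.
		hits, misses := m.HitsAndMisses.Aggregate()
		if total := hits + misses; total > 0 {
			fmt.Printf("lifetime hit rate: %.1f%%\n", 100*float64(hits)/float64(total))
		}

		// Per-category breakdown, e.g. filter blocks vs. sstable data blocks.
		fmt.Printf("filter misses: %d, sstdata misses: %d\n",
			m.HitsAndMisses[CategoryFilter].Misses,
			m.HitsAndMisses[CategorySSTableData].Misses)

		// Recent[0] covers roughly the last 10 minutes, Recent[1] roughly the
		// last hour; Since records when each underlying snapshot was taken.
		for i := range m.Recent {
			rHits, rMisses := m.Recent[i].Aggregate()
			fmt.Printf("recent[%d]: %d hits, %d misses\n", i, rHits, rMisses)
		}
	}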

db_test.go

Lines changed: 1 addition & 1 deletion
@@ -858,7 +858,7 @@ func TestMemTableReservation(t *testing.T) {
 		t.Fatalf("expected 2 refs, but found %d", refs)
 	}
 	// Verify the memtable reservation has caused our test block to be evicted.
-	if cv := tmpHandle.Peek(base.DiskFileNum(0), 0); cv != nil {
+	if cv := tmpHandle.Peek(base.DiskFileNum(0), 0, cache.CategoryBackground); cv != nil {
 		t.Fatalf("expected failure, but found success: %#v", cv)
 	}

internal/cache/cache.go

Lines changed: 23 additions & 36 deletions
@@ -17,20 +17,9 @@ import (
 
 	"github.com/cockroachdb/pebble/internal/base"
 	"github.com/cockroachdb/pebble/internal/invariants"
+	"github.com/cockroachdb/pebble/internal/metricsutil"
 )
 
-// Metrics holds metrics for the cache.
-type Metrics struct {
-	// The number of bytes inuse by the cache.
-	Size int64
-	// The count of objects (blocks or tables) in the cache.
-	Count int64
-	// The number of cache hits.
-	Hits int64
-	// The number of cache misses.
-	Misses int64
-}
-
 // Cache implements Pebble's sharded block cache. The Clock-PRO algorithm is
 // used for page replacement
 // (http://static.usenix.org/event/usenix05/tech/general/full_papers/jiang/jiang_html/html.html). In
@@ -78,6 +67,8 @@ type Cache struct {
 	idAlloc atomic.Uint64
 	shards  []shard
 
+	metricsWindow *metricsutil.Window[HitsAndMisses]
+
 	// Traces recorded by Cache.trace. Used for debugging.
 	tr struct {
 		sync.Mutex
@@ -136,6 +127,8 @@ func NewWithShards(size int64, shards int) *Cache {
 	for i := range c.shards {
 		c.shards[i].init(size / int64(len(c.shards)))
 	}
+	c.metricsWindow = metricsutil.NewWindow[HitsAndMisses](c.hitsAndMisses)
+	c.metricsWindow.Start()
 
 	// Note: this is a no-op if invariants are disabled or race is enabled.
 	invariants.SetFinalizer(c, func(c *Cache) {
@@ -171,10 +164,19 @@ func (c *Cache) Unref() {
 	case v < 0:
 		panic(fmt.Sprintf("pebble: inconsistent reference count: %d", v))
 	case v == 0:
-		for i := range c.shards {
-			c.shards[i].Free()
-		}
+		c.destroy()
+	}
+}
+
+func (c *Cache) destroy() {
+	if c.metricsWindow != nil {
+		c.metricsWindow.Stop()
+		c.metricsWindow = nil
+	}
+	for i := range c.shards {
+		c.shards[i].Free()
 	}
+	c.shards = nil
 }
 
 func (c *Cache) NewHandle() *Handle {
@@ -207,21 +209,6 @@ func (c *Cache) Reserve(n int) func() {
 	}
 }
 
-// Metrics returns the metrics for the cache.
-func (c *Cache) Metrics() Metrics {
-	var m Metrics
-	for i := range c.shards {
-		s := &c.shards[i]
-		s.mu.RLock()
-		m.Count += int64(s.blocks.Len())
-		m.Size += s.sizeHot + s.sizeCold
-		s.mu.RUnlock()
-		m.Hits += s.hits.Load()
-		m.Misses += s.misses.Load()
-	}
-	return m
-}
-
 // MaxSize returns the max size of the cache.
 func (c *Cache) MaxSize() int64 {
 	return c.maxSize
@@ -262,16 +249,16 @@ func (c *Handle) Cache() *Cache {
 // Peek retrieves the cache value for the specified file and offset, returning
 // nil if no value is present. Peek does not affect the state of the cache (it
 // does not "count" as an access as far as the cache replacement is concerned).
-func (c *Handle) Peek(fileNum base.DiskFileNum, offset uint64) *Value {
+func (c *Handle) Peek(fileNum base.DiskFileNum, offset uint64, category Category) *Value {
 	k := makeKey(c.id, fileNum, offset)
-	return c.cache.getShard(k).get(k, true /* peekOnly */)
+	return c.cache.getShard(k).get(k, category, true /* peekOnly */)
 }
 
 // Get retrieves the cache value for the specified file and offset, returning
 // nil if no value is present.
-func (c *Handle) Get(fileNum base.DiskFileNum, offset uint64) *Value {
+func (c *Handle) Get(fileNum base.DiskFileNum, offset uint64, category Category) *Value {
 	k := makeKey(c.id, fileNum, offset)
-	return c.cache.getShard(k).get(k, false /* peekOnly */)
+	return c.cache.getShard(k).get(k, category, false /* peekOnly */)
 }
 
 // GetWithReadHandle retrieves the cache value for the specified handleID, fileNum
@@ -298,7 +285,7 @@ func (c *Handle) Get(fileNum base.DiskFileNum, offset uint64) *Value {
 // While waiting, someone else may successfully read the value, which results
 // in a valid Handle being returned. This is a case where cacheHit=false.
 func (c *Handle) GetWithReadHandle(
-	ctx context.Context, fileNum base.DiskFileNum, offset uint64,
+	ctx context.Context, fileNum base.DiskFileNum, offset uint64, category Category,
 ) (
 	cv *Value,
 	rh ReadHandle,
@@ -308,7 +295,7 @@ func (c *Handle) GetWithReadHandle(
 	err error,
 ) {
 	k := makeKey(c.id, fileNum, offset)
-	cv, re := c.cache.getShard(k).getWithReadEntry(k)
+	cv, re := c.cache.getShard(k).getWithReadEntry(k, category)
 	if cv != nil {
 		return cv, ReadHandle{}, 0, 0, true, nil
 	}
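The internal/metricsutil package used above is among the 21 changed files but is not shown in this excerpt. Going only by the calls visible here (NewWindow, Start, Stop, and the TenMinutesAgo/OneHourAgo accessors used in metrics.go below), a minimal sliding-window sampler could be sketched as follows; this is an assumption for illustration, not the actual implementation, and it presumes crtime.NowMono() for timestamps.

	// Package metricsutil (hypothetical sketch): periodically snapshot a
	// cumulative metric so callers can ask what it looked like a while ago.
	package metricsutil

	import (
		"sync"
		"time"

		"github.com/cockroachdb/crlib/crtime"
	)

	type snapshot[T any] struct {
		value T
		taken crtime.Mono
	}

	// Window samples the provided function once a minute and retains about an
	// hour of history in a ring buffer.
	type Window[T any] struct {
		sample func() T
		stop   chan struct{}

		mu    sync.Mutex
		snaps [60]snapshot[T]
		count int // number of valid snapshots
		next  int // ring position of the next write
	}

	func NewWindow[T any](sample func() T) *Window[T] {
		return &Window[T]{sample: sample, stop: make(chan struct{})}
	}

	// Start takes an initial snapshot and launches the sampling goroutine.
	func (w *Window[T]) Start() {
		w.record()
		go func() {
			ticker := time.NewTicker(time.Minute)
			defer ticker.Stop()
			for {
				select {
				case <-ticker.C:
					w.record()
				case <-w.stop:
					return
				}
			}
		}()
	}

	// Stop terminates the sampling goroutine.
	func (w *Window[T]) Stop() { close(w.stop) }

	func (w *Window[T]) record() {
		s := snapshot[T]{value: w.sample(), taken: crtime.NowMono()}
		w.mu.Lock()
		defer w.mu.Unlock()
		w.snaps[w.next] = s
		w.next = (w.next + 1) % len(w.snaps)
		if w.count < len(w.snaps) {
			w.count++
		}
	}

	// TenMinutesAgo returns the snapshot taken closest to 10 minutes ago.
	func (w *Window[T]) TenMinutesAgo() (T, crtime.Mono) { return w.ago(10) }

	// OneHourAgo returns the oldest retained snapshot (up to ~1 hour old).
	func (w *Window[T]) OneHourAgo() (T, crtime.Mono) { return w.ago(60) }

	func (w *Window[T]) ago(minutes int) (T, crtime.Mono) {
		w.mu.Lock()
		defer w.mu.Unlock()
		back := min(minutes, w.count)
		idx := ((w.next-back)%len(w.snaps) + len(w.snaps)) % len(w.snaps)
		return w.snaps[idx].value, w.snaps[idx].taken
	}

The real package may well differ in sampling resolution and retention; what matters for the diff above is only that the window can return a snapshot of HitsAndMisses from roughly 10 minutes and roughly an hour ago.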

internal/cache/cache_test.go

Lines changed: 12 additions & 10 deletions
@@ -41,7 +41,7 @@ func TestCache(t *testing.T) {
 		wantHit := fields[1][0] == 'h'
 
 		var hit bool
-		cv := h.Get(base.DiskFileNum(key), 0)
+		cv := h.Get(base.DiskFileNum(key), 0, CategorySSTableData)
 		if cv == nil {
 			cv = Alloc(1)
 			cv.RawBuffer()[0] = fields[0][0]
@@ -81,14 +81,14 @@ func TestCachePeek(t *testing.T) {
 		setTestValue(h, 0, uint64(i), "a", 1)
 	}
 	for i := range size / 2 {
-		v := h.Get(base.DiskFileNum(0), uint64(i))
+		v := h.Get(base.DiskFileNum(0), uint64(i), CategoryBackground)
 		if v == nil {
 			t.Fatalf("expected to find block %d", i)
 		}
 		v.Release()
 	}
 	for i := size / 2; i < size; i++ {
-		v := h.Peek(base.DiskFileNum(0), uint64(i))
+		v := h.Peek(base.DiskFileNum(0), uint64(i), CategoryBackground)
 		if v == nil {
 			t.Fatalf("expected to find block %d", i)
 		}
@@ -100,7 +100,7 @@ func TestCachePeek(t *testing.T) {
 	}
 	// Verify that the Gets still find their values, despite the Peeks.
 	for i := range size / 2 {
-		v := h.Get(base.DiskFileNum(0), uint64(i))
+		v := h.Get(base.DiskFileNum(0), uint64(i), CategoryBackground)
 		if v == nil {
 			t.Fatalf("expected to find block %d", i)
 		}
@@ -124,12 +124,12 @@ func TestCacheDelete(t *testing.T) {
 	if expected, size := int64(10), cache.Size(); expected != size {
 		t.Fatalf("expected cache size %d, but found %d", expected, size)
 	}
-	if v := h.Get(base.DiskFileNum(0), 0); v == nil {
+	if v := h.Get(base.DiskFileNum(0), 0, CategorySSTableData); v == nil {
 		t.Fatalf("expected to find block 0/0")
 	} else {
 		v.Release()
 	}
-	if v := h.Get(base.DiskFileNum(1), 0); v != nil {
+	if v := h.Get(base.DiskFileNum(1), 0, CategorySSTableData); v != nil {
 		t.Fatalf("expected to not find block 1/0")
 	}
 	// Deleting a non-existing block does nothing.
@@ -196,11 +196,11 @@ func TestMultipleDBs(t *testing.T) {
 	if expected, size := int64(5), cache.Size(); expected != size {
 		t.Fatalf("expected cache size %d, but found %d", expected, size)
 	}
-	v := h1.Get(base.DiskFileNum(0), 0)
+	v := h1.Get(base.DiskFileNum(0), 0, CategorySSTableData)
 	if v != nil {
 		t.Fatalf("expected not present, but found %#v", v)
 	}
-	v = h2.Get(base.DiskFileNum(0), 0)
+	v = h2.Get(base.DiskFileNum(0), 0, CategorySSTableData)
 	if v := v.RawBuffer(); string(v) != "bbbbb" {
 		t.Fatalf("expected bbbbb, but found %s", v)
 	}
@@ -306,8 +306,10 @@ func BenchmarkCacheGet(b *testing.B) {
 	b.RunParallel(func(pb *testing.PB) {
 		pcg := rand.NewPCG(rand.Uint64(), rand.Uint64())
 		for pb.Next() {
-			offset := pcg.Uint64() % size
-			v := h.Get(base.DiskFileNum(0), offset)
+			randVal := pcg.Uint64()
+			offset := randVal % size
+			category := Category((randVal >> 32) % uint64(NumCategories))
+			v := h.Get(base.DiskFileNum(0), offset, category)
 			if v == nil {
 				b.Fatal("failed to look up value")
 			}

internal/cache/clockpro.go

Lines changed: 11 additions & 7 deletions
@@ -76,9 +76,13 @@ func (k key) String() string {
 	return fmt.Sprintf("%d/%d/%d", k.id, k.fileNum, k.offset)
 }
 
-type shard struct {
+type counters [NumCategories]struct {
 	hits   atomic.Int64
 	misses atomic.Int64
+}
+
+type shard struct {
+	counters counters
 
 	mu sync.RWMutex
 
@@ -134,7 +138,7 @@ func (c *shard) init(maxSize int64) {
 //
 // If peekOnly is true, the state of the cache is not modified to reflect the
 // access.
-func (c *shard) get(k key, peekOnly bool) *Value {
+func (c *shard) get(k key, category Category, peekOnly bool) *Value {
 	c.mu.RLock()
 	if e, _ := c.blocks.Get(k); e != nil {
 		if value := e.acquireValue(); value != nil {
@@ -143,12 +147,12 @@ func (c *shard) get(k key, peekOnly bool) *Value {
 				e.referenced.Store(true)
 			}
 			c.mu.RUnlock()
-			c.hits.Add(1)
+			c.counters[category].hits.Add(1)
 			return value
 		}
 	}
 	c.mu.RUnlock()
-	c.misses.Add(1)
+	c.counters[category].misses.Add(1)
 	return nil
 }
 
@@ -157,7 +161,7 @@ func (c *shard) get(k key, peekOnly bool) *Value {
 // is not in the cache (nil Value), a non-nil readEntry is returned (in which
 // case the caller is responsible to dereference the entry, via one of
 // unrefAndTryRemoveFromMap(), setReadValue(), setReadError()).
-func (c *shard) getWithReadEntry(k key) (*Value, *readEntry) {
+func (c *shard) getWithReadEntry(k key, category Category) (*Value, *readEntry) {
 	c.mu.RLock()
 	if e, _ := c.blocks.Get(k); e != nil {
 		if value := e.acquireValue(); value != nil {
@@ -166,13 +170,13 @@ func (c *shard) getWithReadEntry(k key) (*Value, *readEntry) {
 				e.referenced.Store(true)
 			}
 			c.mu.RUnlock()
-			c.hits.Add(1)
+			c.counters[category].hits.Add(1)
 			return value, nil
 		}
 	}
 	re := c.readShard.acquireReadEntry(k)
 	c.mu.RUnlock()
-	c.misses.Add(1)
+	c.counters[category].misses.Add(1)
 	return nil, re
 }

internal/cache/metrics.go

Lines changed: 124 additions & 0 deletions
@@ -0,0 +1,124 @@
+// Copyright 2025 The LevelDB-Go and Pebble Authors. All rights reserved. Use
+// of this source code is governed by a BSD-style license that can be found in
+// the LICENSE file.
+
+package cache
+
+import (
+	"fmt"
+
+	"github.com/cockroachdb/crlib/crtime"
+)
+
+// Category is used to maintain granular cache hit/miss statistics.
+type Category uint8
+
+const (
+	// CategoryBackground is used for cache accesses made by compactions or
+	// downloads.
+	CategoryBackground Category = iota
+	CategorySSTableData
+	CategorySSTableValue
+	CategoryBlobValue
+	CategoryFilter
+	// CategoryIndex includes index blocks and other metadata blocks (for both
+	// sstables and blob files).
+	CategoryIndex
+
+	// Categories can be used with the range keyword.
+	Categories
+)
+
+const NumCategories = int(Categories)
+
+func (c Category) String() string {
+	switch c {
+	case CategoryBackground:
+		return "background"
+	case CategorySSTableData:
+		return "sstdata"
+	case CategorySSTableValue:
+		return "sstval"
+	case CategoryBlobValue:
+		return "blobval"
+	case CategoryFilter:
+		return "filter"
+	case CategoryIndex:
+		return "index"
+	default:
+		return fmt.Sprintf("invalid(%d)", c)
+	}
+}
+
+// Metrics holds metrics for the cache.
+type Metrics struct {
+	// Hits and misses since the cache was created.
+	HitsAndMisses HitsAndMisses
+	// The current number of bytes in use by the cache.
+	Size int64
+	// The current count of objects (blocks or tables) in the cache.
+	Count int64
+	// Recent contains cache hit metrics covering two recent periods (last ~10
+	// minutes and last ~1 hour).
+	Recent [2]struct {
+		HitsAndMisses
+		Since crtime.Mono
+	}
+}
+
+// HitsAndMisses contains the number of cache hits and misses across a period of
+// time.
+type HitsAndMisses [NumCategories]struct {
+	Hits   int64
+	Misses int64
+}
+
+// Aggregate returns the total hits and misses across all categories.
+func (hm *HitsAndMisses) Aggregate() (hits, misses int64) {
+	for i := range *hm {
+		hits += hm[i].Hits
+		misses += hm[i].Misses
+	}
+	return hits, misses
+}
+
+// ToRecent changes the receiver to reflect recent hits and misses, given the
+// current metrics.
+// At a high level, hm.ToRecent(current) means hm = current - hm.
+func (hm *HitsAndMisses) ToRecent(current *HitsAndMisses) {
+	for i := range *hm {
+		hm[i].Hits = current[i].Hits - hm[i].Hits
+		hm[i].Misses = current[i].Misses - hm[i].Misses
+	}
+}
+
+// Metrics returns the current metrics for the cache.
+func (c *Cache) Metrics() Metrics {
+	var m Metrics
+	m.HitsAndMisses = c.hitsAndMisses()
+	for i := range c.shards {
+		s := &c.shards[i]
+		s.mu.RLock()
+		m.Count += int64(s.blocks.Len())
+		m.Size += s.sizeHot + s.sizeCold
+		s.mu.RUnlock()
+	}
+	m.Recent[0].HitsAndMisses, m.Recent[0].Since = c.metricsWindow.TenMinutesAgo()
+	m.Recent[1].HitsAndMisses, m.Recent[1].Since = c.metricsWindow.OneHourAgo()
+	for i := range m.Recent {
+		m.Recent[i].ToRecent(&m.HitsAndMisses)
+	}
+	return m
+}
+
+func (c *Cache) hitsAndMisses() HitsAndMisses {
+	var hm HitsAndMisses
+	for i := range c.shards {
+		shardCounters := &c.shards[i].counters
+		for j := range hm {
+			hm[j].Hits += shardCounters[j].hits.Load()
+			hm[j].Misses += shardCounters[j].misses.Load()
+		}
+	}
+	return hm
+}
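To make ToRecent concrete: Metrics() first copies the ~10-minute and ~1-hour snapshots into Recent and then subtracts them from the lifetime totals, so each Recent entry ends up holding the activity accumulated since its snapshot was taken. A small illustration follows; the example function and its numbers are hypothetical, and it assumes package cache plus an fmt import.

	// ExampleToRecent demonstrates the arithmetic performed by
	// HitsAndMisses.ToRecent: receiver = current - receiver.
	func ExampleToRecent() {
		var snapshot, current HitsAndMisses

		// Counters as recorded by the metrics window ~10 minutes ago.
		snapshot[CategorySSTableData].Hits = 1000
		snapshot[CategorySSTableData].Misses = 40
		// Counters right now.
		current[CategorySSTableData].Hits = 1150
		current[CategorySSTableData].Misses = 52

		// After ToRecent, snapshot holds the last ~10 minutes of activity.
		snapshot.ToRecent(&current)
		fmt.Println(snapshot[CategorySSTableData].Hits, snapshot[CategorySSTableData].Misses)
		// Output: 150 12
	}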

0 commit comments
