Skip to content

Commit

Permalink
sstable: avoid caching meta blocks
Browse files Browse the repository at this point in the history
When opening an sstable for the first time, we read two 'meta' blocks: one
describes the layout of the sstable, encoding block handles for the index block,
properties block, etc. The other contains table properties. These blocks are
decoded, the necessary state is copied onto the heap, and then the blocks are
released. In typical configurations with sufficiently large table caches, these
blocks are never read again or only much later after the table has been evicted
from the table cache.

This commit uses a BufferPool to hold these temporary blocks, and refrains from
adding these blocks to the block cache. This reduces contention on the block
cache mutexes (#1997).
  • Loading branch information
jbowens committed Jul 16, 2023
1 parent 03c97cd commit 809057a
Show file tree
Hide file tree
Showing 5 changed files with 47 additions and 17 deletions.
9 changes: 9 additions & 0 deletions sstable/buffer_pool.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,15 @@ func (p *BufferPool) Init(initialSize int) {
}
}

// initPreallocated resets p to a fresh BufferPool whose pool slice is backed
// by the caller-supplied, already-allocated slice. It plays the same role as
// Init, but is intended for use inside the sstable package when a
// []allocedBuffer already exists, which avoids a separate allocation for
// BufferPool.pool.
//
// The supplied slice is truncated to length zero. Every other field of p is
// reset to its zero value.
func (p *BufferPool) initPreallocated(pool []allocedBuffer) {
	// Keep the caller's backing array but start with no live buffers.
	empty := pool[:0]
	*p = BufferPool{pool: empty}
}

// Release releases all buffers held by the pool and resets the pool to an
// uninitialized state.
func (p *BufferPool) Release() {
Expand Down
23 changes: 22 additions & 1 deletion sstable/reader.go
Original file line number Diff line number Diff line change
Expand Up @@ -3071,6 +3071,14 @@ type Reader struct {
rawTombstones bool
mergerOK bool
checksumType ChecksumType
// metaBufferPool is a buffer pool used exclusively when opening a table and
// loading its meta blocks. metaBufferPoolAlloc is used to batch-allocate
// the BufferPool.pool slice as a part of the Reader allocation. It has
// capacity 3 to accommodate the meta block (1), and both the compressed
// properties block (1) and decompressed properties block (1)
// simultaneously.
metaBufferPool BufferPool
metaBufferPoolAlloc [3]allocedBuffer
}

// Close implements DB.Close, as documented in the pebble package.
Expand Down Expand Up @@ -3522,8 +3530,21 @@ func (r *Reader) transformRangeDelV1(b []byte) ([]byte, error) {
}

func (r *Reader) readMetaindex(metaindexBH BlockHandle) error {
// We use a BufferPool when reading metaindex blocks in order to avoid
// populating the block cache with these blocks. In heavy-write workloads,
// especially with high compaction concurrency, new tables may be created
// frequently. Populating the block cache with these metaindex blocks adds
// additional contention on the block cache mutexes (see #1997).
// Additionally, these blocks are exceedingly unlikely to be read again
// while they're still in the block cache except in misconfigurations with
// excessive sstables counts or a table cache that's far too small.
r.metaBufferPool.initPreallocated(r.metaBufferPoolAlloc[:0])
// When we're finished, release the buffers we've allocated back to the
// memory allocator. We don't expect to use metaBufferPool again.
defer r.metaBufferPool.Release()

b, err := r.readBlock(
context.Background(), metaindexBH, nil /* transform */, nil /* readHandle */, nil /* stats */, nil /* buffer pool */)
context.Background(), metaindexBH, nil /* transform */, nil /* readHandle */, nil /* stats */, &r.metaBufferPool)
if err != nil {
return err
}
Expand Down
8 changes: 4 additions & 4 deletions testdata/event_listener
Original file line number Diff line number Diff line change
Expand Up @@ -284,8 +284,8 @@ compact 1 2.0 K 0 B 0 (size == esti
memtbl 1 256 K
zmemtbl 0 0 B
ztbl 0 0 B
bcache 8 1.2 K 11.1% (score == hit-rate)
tcache 1 680 B 40.0% (score == hit-rate)
bcache 6 1.1 K 11.1% (score == hit-rate)
tcache 1 800 B 40.0% (score == hit-rate)
snaps 0 - 0 (score == earliest seq num)
titers 0
filter - - 0.0% (score == utility)
Expand Down Expand Up @@ -378,8 +378,8 @@ compact 1 4.0 K 0 B 0 (size == esti
memtbl 1 512 K
zmemtbl 0 0 B
ztbl 0 0 B
bcache 16 2.5 K 14.3% (score == hit-rate)
tcache 1 680 B 50.0% (score == hit-rate)
bcache 12 2.3 K 14.3% (score == hit-rate)
tcache 1 800 B 50.0% (score == hit-rate)
snaps 0 - 0 (score == earliest seq num)
titers 0
filter - - 0.0% (score == utility)
Expand Down
4 changes: 2 additions & 2 deletions testdata/ingest
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,8 @@ compact 0 0 B 0 B 0 (size == esti
memtbl 1 256 K
zmemtbl 0 0 B
ztbl 0 0 B
bcache 8 1.2 K 42.9% (score == hit-rate)
tcache 1 680 B 50.0% (score == hit-rate)
bcache 6 1.2 K 35.7% (score == hit-rate)
tcache 1 800 B 50.0% (score == hit-rate)
snaps 0 - 0 (score == earliest seq num)
titers 0
filter - - 0.0% (score == utility)
Expand Down
20 changes: 10 additions & 10 deletions testdata/metrics
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ compact 0 0 B 0 B 0 (size == esti
memtbl 1 256 K
zmemtbl 1 256 K
ztbl 0 0 B
bcache 4 560 B 0.0% (score == hit-rate)
tcache 1 680 B 0.0% (score == hit-rate)
bcache 3 528 B 0.0% (score == hit-rate)
tcache 1 800 B 0.0% (score == hit-rate)
snaps 0 - 0 (score == earliest seq num)
titers 1
filter - - 0.0% (score == utility)
Expand Down Expand Up @@ -82,8 +82,8 @@ compact 1 0 B 0 B 0 (size == esti
memtbl 1 256 K
zmemtbl 2 512 K
ztbl 2 1.2 K
bcache 7 1.1 K 42.9% (score == hit-rate)
tcache 2 1.3 K 66.7% (score == hit-rate)
bcache 5 1.0 K 42.9% (score == hit-rate)
tcache 2 1.6 K 66.7% (score == hit-rate)
snaps 0 - 0 (score == earliest seq num)
titers 2
filter - - 0.0% (score == utility)
Expand Down Expand Up @@ -116,8 +116,8 @@ compact 1 0 B 0 B 0 (size == esti
memtbl 1 256 K
zmemtbl 1 256 K
ztbl 2 1.2 K
bcache 7 1.1 K 42.9% (score == hit-rate)
tcache 2 1.3 K 66.7% (score == hit-rate)
bcache 5 1.0 K 42.9% (score == hit-rate)
tcache 2 1.6 K 66.7% (score == hit-rate)
snaps 0 - 0 (score == earliest seq num)
titers 2
filter - - 0.0% (score == utility)
Expand Down Expand Up @@ -147,8 +147,8 @@ compact 1 0 B 0 B 0 (size == esti
memtbl 1 256 K
zmemtbl 1 256 K
ztbl 1 633 B
bcache 4 560 B 42.9% (score == hit-rate)
tcache 1 680 B 66.7% (score == hit-rate)
bcache 3 528 B 42.9% (score == hit-rate)
tcache 1 800 B 66.7% (score == hit-rate)
snaps 0 - 0 (score == earliest seq num)
titers 1
filter - - 0.0% (score == utility)
Expand Down Expand Up @@ -375,8 +375,8 @@ compact 2 4.8 K 0 B 0 (size == esti
memtbl 1 1.0 M
zmemtbl 0 0 B
ztbl 0 0 B
bcache 16 2.4 K 34.4% (score == hit-rate)
tcache 3 2.0 K 57.9% (score == hit-rate)
bcache 12 2.3 K 31.1% (score == hit-rate)
tcache 3 2.3 K 57.9% (score == hit-rate)
snaps 0 - 0 (score == earliest seq num)
titers 0
filter - - 0.0% (score == utility)
Expand Down

0 comments on commit 809057a

Please sign in to comment.