Skip to content

Commit 527eebf

Browse files
committed
sstable: lazy load the index block in single level iterator
This commit implements lazy loading of the index block in the single-level iterator as the default behavior. Loading of the index block is now deferred until first access. As a consequence, the order of file reads changes, which alters the observable I/O sequence. Here is a quick overview of the change. Before (eager loading) 1. Iterator Construction: 1.1 the constructor calls readTopLevelIndexBlock() 1.2 readTopLevelIndexBlock() calls readIndexBlock() 1.3 readIndexBlock() calls blockReader.Read() 1.4 blockReader.Read() calls objstorage ReadHandle ReadAt() 1.5 ReadAt() loads the index block 1.6 construction completes 2. First Positioning (e.g., First(), SeekGE(), SeekPrefixGE()): 2.1 the positioning function uses the already-loaded index via PI(&i.index) 2.2 file opened for data read 2.3 ReadAt() loads the data block After (lazy loading) 1. Iterator Construction: 1.1 the constructor sets indexLoaded = false to defer loading 1.2 construction completes (no I/O) 2. First Positioning (e.g., First(), SeekGE(), SeekPrefixGE()): 2.1 the positioning function checks the indexLoaded flag 2.2 if the flag is unset, it calls ensureIndexLoaded() 2.3 ensureIndexLoaded() calls readTopLevelIndexBlock(), which ends in ReadAt() 2.4 ReadAt() loads the index block 2.5 the positioning function uses the newly-loaded index 2.6 ReadAt() loads the data block As seen above, the index block I/O is now deferred. While the overall behavior is transparent to users, the internal semantics are different now, and the data-driven tests have to change accordingly. See the changed order of events in the test data files checkpoint, cleaner, and event_listener. What's interesting is the change observed in flushable_ingest, which demonstrates the I/O savings from this optimization. ``` -# When the key doesn't pass the bloom filter, we should see only two block -# reads. 
+# When the key doesn't pass the bloom filter, we should see only one block +# read due to lazy loading optimization - bloom filter is checked first, +# and if it rejects the key, we can avoid loading the index block entirely. get with-fs-logging small-00001-does-not-exist ---- -read-at(158, 41): 000004.sst read-at(199, 74): 000004.sst ``` implements #3248
1 parent c40280e commit 527eebf

File tree

7 files changed

+102
-31
lines changed

7 files changed

+102
-31
lines changed

sstable/reader_iter_single_lvl.go

Lines changed: 82 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,9 @@ type singleLevelIterator[I any, PI indexBlockIterator[I], D any, PD dataBlockIte
165165
useFilterBlock bool
166166
lastBloomFilterMatched bool
167167

168+
// Lazy loading flag
169+
indexLoaded bool
170+
168171
transforms IterTransforms
169172

170173
// All fields above this field are cleared when resetting the iterator for reuse.
@@ -214,14 +217,9 @@ func newColumnBlockSingleLevelIterator(
214217
i.vbRH = r.blockReader.UsePreallocatedReadHandle(objstorage.NoReadBefore, &i.vbRHPrealloc)
215218
}
216219
i.data.InitOnce(r.keySchema, r.Comparer, &i.internalValueConstructor)
217-
indexH, err := r.readTopLevelIndexBlock(ctx, i.readEnv.Block, i.indexFilterRH)
218-
if err == nil {
219-
err = i.index.InitHandle(r.Comparer, indexH, opts.Transforms)
220-
}
221-
if err != nil {
222-
_ = i.Close()
223-
return nil, err
224-
}
220+
221+
// Use lazy loading by default - index will be loaded on first access
222+
i.indexLoaded = false
225223
return i, nil
226224
}
227225

@@ -254,14 +252,8 @@ func newRowBlockSingleLevelIterator(
254252
i.data.SetHasValuePrefix(true)
255253
}
256254

257-
indexH, err := r.readTopLevelIndexBlock(ctx, i.readEnv.Block, i.indexFilterRH)
258-
if err == nil {
259-
err = i.index.InitHandle(r.Comparer, indexH, opts.Transforms)
260-
}
261-
if err != nil {
262-
_ = i.Close()
263-
return nil, err
264-
}
255+
// Use lazy loading by default - index will be loaded on first access
256+
i.indexLoaded = false
265257
return i, nil
266258
}
267259

@@ -449,7 +441,7 @@ func (i *singleLevelIterator[I, PI, P, PD]) SetContext(ctx context.Context) {
449441
// unpositioned. If unsuccessful, it sets i.err to any error encountered, which
450442
// may be nil if we have simply exhausted the entire table.
451443
func (i *singleLevelIterator[I, PI, P, PD]) loadDataBlock(dir int8) loadBlockResult {
452-
if !PI(&i.index).Valid() {
444+
if i.err != nil || !PI(&i.index).Valid() {
453445
// Ensure the data block iterator is invalidated even if loading of the block
454446
// fails.
455447
PD(&i.data).Invalidate()
@@ -521,6 +513,12 @@ func (i *singleLevelIterator[I, PI, D, PD]) ReadValueBlock(
521513
// appropriate bound, depending on the iteration direction, and returns either
522514
// `blockIntersects` or `blockExcluded`.
523515
func (i *singleLevelIterator[I, PI, D, PD]) resolveMaybeExcluded(dir int8) intersectsResult {
516+
if !i.indexLoaded {
517+
if err := i.ensureIndexLoaded(); err != nil {
518+
i.err = err
519+
return blockExcluded
520+
}
521+
}
524522
// TODO(jackson): We could first try comparing to top-level index block's
525523
// key, and if within bounds avoid per-data block key comparisons.
526524

@@ -682,6 +680,13 @@ func (i *singleLevelIterator[I, PI, D, PD]) SeekGE(
682680
func (i *singleLevelIterator[I, PI, D, PD]) seekGEHelper(
683681
key []byte, boundsCmp int, flags base.SeekGEFlags,
684682
) *base.InternalKV {
683+
if !i.indexLoaded {
684+
if err := i.ensureIndexLoaded(); err != nil {
685+
i.err = err
686+
return nil
687+
}
688+
}
689+
685690
// Invariant: trySeekUsingNext => !i.data.isDataInvalidated() && i.exhaustedBounds != +1
686691

687692
// SeekGE performs various step-instead-of-seeking optimizations: eg enabled
@@ -818,6 +823,7 @@ func (i *singleLevelIterator[I, PI, D, PD]) seekPrefixGE(
818823
flags = flags.DisableTrySeekUsingNext()
819824
}
820825
i.lastBloomFilterMatched = false
826+
821827
// Check prefix bloom filter.
822828
var mayContain bool
823829
mayContain, i.err = i.bloomFilterMayContain(prefix)
@@ -922,6 +928,13 @@ func (i *singleLevelIterator[I, PI, D, PD]) virtualLastSeekLE() *base.InternalKV
922928
i.boundsCmp = 0
923929
i.positionedUsingLatestBounds = true
924930

931+
if !i.indexLoaded {
932+
if err := i.ensureIndexLoaded(); err != nil {
933+
i.err = err
934+
return nil
935+
}
936+
}
937+
925938
indexOk := PI(&i.index).SeekGE(key)
926939
// We can have multiple internal keys with the same user key as the seek
927940
// key. In that case, we want the last (greatest) internal key.
@@ -993,6 +1006,12 @@ func (i *singleLevelIterator[I, PI, D, PD]) virtualLastSeekLE() *base.InternalKV
9931006
func (i *singleLevelIterator[I, PI, D, PD]) SeekLT(
9941007
key []byte, flags base.SeekLTFlags,
9951008
) *base.InternalKV {
1009+
if !i.indexLoaded {
1010+
if err := i.ensureIndexLoaded(); err != nil {
1011+
i.err = err
1012+
return nil
1013+
}
1014+
}
9961015
if i.readEnv.Virtual != nil {
9971016
// Might have to fix upper bound since virtual sstable bounds are not
9981017
// known to callers of SeekLT.
@@ -1116,6 +1135,12 @@ func (i *singleLevelIterator[I, PI, D, PD]) First() *base.InternalKV {
11161135
// index file. For the latter, one cannot make any claims about absolute
11171136
// positioning.
11181137
func (i *singleLevelIterator[I, PI, D, PD]) firstInternal() *base.InternalKV {
1138+
if !i.indexLoaded {
1139+
if err := i.ensureIndexLoaded(); err != nil {
1140+
i.err = err
1141+
return nil
1142+
}
1143+
}
11191144
i.exhaustedBounds = 0
11201145
i.err = nil // clear cached iteration error
11211146
// Seek optimization only applies until iterator is first positioned after SetBounds.
@@ -1180,6 +1205,12 @@ func (i *singleLevelIterator[I, PI, D, PD]) Last() *base.InternalKV {
11801205
// index file. For the latter, one cannot make any claims about absolute
11811206
// positioning.
11821207
func (i *singleLevelIterator[I, PI, D, PD]) lastInternal() *base.InternalKV {
1208+
if !i.indexLoaded {
1209+
if err := i.ensureIndexLoaded(); err != nil {
1210+
i.err = err
1211+
return nil
1212+
}
1213+
}
11831214
i.exhaustedBounds = 0
11841215
i.err = nil // clear cached iteration error
11851216
// Seek optimization only applies until iterator is first positioned after SetBounds.
@@ -1249,6 +1280,12 @@ func (i *singleLevelIterator[I, PI, D, PD]) Next() *base.InternalKV {
12491280

12501281
// NextPrefix implements (base.InternalIterator).NextPrefix.
12511282
func (i *singleLevelIterator[I, PI, D, PD]) NextPrefix(succKey []byte) *base.InternalKV {
1283+
if !i.indexLoaded {
1284+
if err := i.ensureIndexLoaded(); err != nil {
1285+
i.err = err
1286+
return nil
1287+
}
1288+
}
12521289
if i.exhaustedBounds == +1 {
12531290
panic("NextPrefix called even though exhausted upper bound")
12541291
}
@@ -1343,6 +1380,12 @@ func (i *singleLevelIterator[I, PI, D, PD]) Prev() *base.InternalKV {
13431380
}
13441381

13451382
func (i *singleLevelIterator[I, PI, D, PD]) skipForward() *base.InternalKV {
1383+
if !i.indexLoaded {
1384+
if err := i.ensureIndexLoaded(); err != nil {
1385+
i.err = err
1386+
return nil
1387+
}
1388+
}
13461389
for {
13471390
if !PI(&i.index).Next() {
13481391
PD(&i.data).Invalidate()
@@ -1421,6 +1464,12 @@ func (i *singleLevelIterator[I, PI, D, PD]) skipForward() *base.InternalKV {
14211464
}
14221465

14231466
func (i *singleLevelIterator[I, PI, D, PD]) skipBackward() *base.InternalKV {
1467+
if !i.indexLoaded {
1468+
if err := i.ensureIndexLoaded(); err != nil {
1469+
i.err = err
1470+
return nil
1471+
}
1472+
}
14241473
for {
14251474
if !PI(&i.index).Prev() {
14261475
PD(&i.data).Invalidate()
@@ -1514,6 +1563,8 @@ func (i *singleLevelIterator[I, PI, D, PD]) closeInternal() error {
15141563
}
15151564
var err error
15161565
err = firstError(err, PD(&i.data).Close())
1566+
// Always close index iterator unconditionally to avoid BufferPool panic
1567+
// Even if lazy loading wasn't used, the index might have been initialized
15171568
err = firstError(err, PI(&i.index).Close())
15181569
if i.indexFilterRH != nil {
15191570
err = firstError(err, i.indexFilterRH.Close())
@@ -1532,6 +1583,7 @@ func (i *singleLevelIterator[I, PI, D, PD]) closeInternal() error {
15321583
err = firstError(err, i.vbRH.Close())
15331584
i.vbRH = nil
15341585
}
1586+
i.indexLoaded = false
15351587
return err
15361588
}
15371589

@@ -1546,3 +1598,16 @@ func (i *singleLevelIterator[I, PI, D, PD]) String() string {
15461598
func (i *singleLevelIterator[I, PI, D, PD]) DebugTree(tp treeprinter.Node) {
15471599
tp.Childf("%T(%p) fileNum=%s", i, i, i.String())
15481600
}
1601+
1602+
func (i *singleLevelIterator[I, PI, D, PD]) ensureIndexLoaded() error {
1603+
indexH, err := i.reader.readTopLevelIndexBlock(i.ctx, i.readEnv.Block, i.indexFilterRH)
1604+
if err == nil {
1605+
err = PI(&i.index).InitHandle(i.reader.Comparer, indexH, i.transforms)
1606+
}
1607+
if err != nil {
1608+
return err
1609+
}
1610+
1611+
i.indexLoaded = true
1612+
return nil
1613+
}

sstable/reader_iter_test.go

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -53,15 +53,20 @@ func TestIteratorErrorOnInit(t *testing.T) {
5353
defer toggle.Off()
5454

5555
var stats base.InternalIteratorStats
56-
for k := 0; k < 20; k++ {
56+
for range 20 {
5757
if rand.IntN(2) == 0 {
58-
_, err := newRowBlockSingleLevelIterator(context.Background(), r, IterOptions{
58+
iter, err := newRowBlockSingleLevelIterator(context.Background(), r, IterOptions{
5959
Transforms: NoTransforms,
6060
FilterBlockSizeLimit: NeverUseFilterBlock,
6161
Env: ReadEnv{Block: block.ReadEnv{Stats: &stats, BufferPool: &pool}},
6262
ReaderProvider: MakeTrivialReaderProvider(r),
6363
})
64-
require.Error(t, err)
64+
// Single-level iterators use lazy loading - creation succeeds but first access fails
65+
require.NoError(t, err)
66+
// Error should surface when trying to use the iterator
67+
_ = iter.First()
68+
require.Error(t, iter.Error())
69+
iter.Close()
6570
} else {
6671
_, err := newRowBlockTwoLevelIterator(context.Background(), r, IterOptions{
6772
Transforms: NoTransforms,

sstable/reader_iter_two_lvl.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ func (i *twoLevelIterator[I, PI, D, PD]) loadSecondLevelIndexBlock(dir int8) loa
7878
i.secondLevel.err = err
7979
return loadBlockFailed
8080
}
81+
i.secondLevel.indexLoaded = true
8182
return loadBlockOK
8283
}
8384

testdata/checkpoint

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -261,16 +261,16 @@ open: db/000007.sst (options: *vfs.randomReadsOption)
261261
read-at(700, 61): db/000007.sst
262262
read-at(649, 51): db/000007.sst
263263
read-at(132, 517): db/000007.sst
264-
read-at(91, 41): db/000005.sst
265264
open: db/000005.sst (options: *vfs.sequentialReadsOption)
265+
read-at(91, 41): db/000005.sst
266266
read-at(0, 91): db/000005.sst
267-
read-at(91, 41): db/000007.sst
268267
open: db/000007.sst (options: *vfs.sequentialReadsOption)
268+
read-at(91, 41): db/000007.sst
269269
read-at(0, 91): db/000007.sst
270270
create: db/000010.sst
271271
close: db/000005.sst
272-
read-at(95, 41): db/000009.sst
273272
open: db/000009.sst (options: *vfs.sequentialReadsOption)
273+
read-at(95, 41): db/000009.sst
274274
read-at(0, 95): db/000009.sst
275275
close: db/000007.sst
276276
close: db/000009.sst

testdata/cleaner

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -82,13 +82,13 @@ open: db/000007.sst (options: *vfs.randomReadsOption)
8282
read-at(501, 61): db/000007.sst
8383
read-at(472, 37): db/000007.sst
8484
read-at(53, 419): db/000007.sst
85-
read-at(52, 27): db/000005.sst
8685
open: db/000005.sst (options: *vfs.sequentialReadsOption)
86+
read-at(52, 27): db/000005.sst
8787
read-at(0, 52): db/000005.sst
8888
create: db/000008.sst
8989
close: db/000005.sst
90-
read-at(26, 27): db/000007.sst
9190
open: db/000007.sst (options: *vfs.sequentialReadsOption)
91+
read-at(26, 27): db/000007.sst
9292
read-at(0, 26): db/000007.sst
9393
close: db/000007.sst
9494
sync-data: db/000008.sst

testdata/event_listener

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -145,11 +145,11 @@ open: db/000008.sst (options: *vfs.randomReadsOption)
145145
read-at(686, 61): db/000008.sst
146146
read-at(636, 50): db/000008.sst
147147
read-at(119, 517): db/000008.sst
148-
read-at(78, 41): db/000005.sst
149148
open: db/000005.sst (options: *vfs.sequentialReadsOption)
149+
read-at(78, 41): db/000005.sst
150150
read-at(0, 78): db/000005.sst
151-
read-at(78, 41): db/000008.sst
152151
open: db/000008.sst (options: *vfs.sequentialReadsOption)
152+
read-at(78, 41): db/000008.sst
153153
read-at(0, 78): db/000008.sst
154154
close: db/000008.sst
155155
close: db/000005.sst

testdata/flushable_ingest

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -701,17 +701,17 @@ allowFlush
701701
get with-fs-logging
702702
small-00001
703703
----
704-
read-at(158, 41): 000004.sst
705704
read-at(199, 74): 000004.sst
705+
read-at(158, 41): 000004.sst
706706
read-at(0, 158): 000004.sst
707707
small-00001:val-00001
708708

709-
# When the key doesn't pass the bloom filter, we should see only two block
710-
# reads.
709+
# When the key doesn't pass the bloom filter, we should see only one block
710+
# read due to lazy loading optimization - bloom filter is checked first,
711+
# and if it rejects the key, we can avoid loading the index block entirely.
711712
get with-fs-logging
712713
small-00001-does-not-exist
713714
----
714-
read-at(158, 41): 000004.sst
715715
read-at(199, 74): 000004.sst
716716
small-00001-does-not-exist: pebble: not found
717717

0 commit comments

Comments
 (0)