Skip to content

Commit 64f938f

Browse files
committed
manifest: pass added tables map directly to addL0Files
The `addL0Files` method now takes the added tables as a map. It internally builds the `files` slice it needs while verifying a precondition for the function - that the added tables sort last in the L0 level (which is usually the case, but not always because of ingestions).
1 parent 38418f3 commit 64f938f

File tree

5 files changed

+86
-93
lines changed

5 files changed

+86
-93
lines changed

compaction_picker.go

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -358,8 +358,8 @@ func (pc *pickedCompaction) clone() *pickedCompaction {
358358
// TODO(msbutler): properly clone picker metrics
359359
pickerMetrics: pc.pickerMetrics,
360360

361-
// Both copies see the same manifest, therefore, it's ok for them to se
362-
// share the same pc.version and pc.l0Organizer.
361+
// Both copies see the same manifest, therefore, it's ok for them to share
362+
// the same pc.version and pc.l0Organizer.
363363
version: pc.version,
364364
l0Organizer: pc.l0Organizer,
365365
}
@@ -672,8 +672,16 @@ func totalCompensatedSize(iter iter.Seq[*manifest.TableMetadata]) uint64 {
672672
// compaction picker is associated with a single version. A new compaction
673673
// picker is created and initialized every time a new version is installed.
674674
type compactionPickerByScore struct {
675-
opts *Options
676-
vers *version
675+
opts *Options
676+
vers *version
677+
// Unlike vers, which is immutable and the latest version when this picker is
678+
// created, l0Organizer represents the mutable L0 state of the latest version.
679+
// This means that at some point in the future a compactionPickerByScore
680+
// created in the past will have mutually inconsistent state in vers and
681+
// l0Organizer. This is not a problem since (a) a new picker is created in
682+
// logAndApply when a new version is installed, and (b) only the latest picker
683+
// is used for picking compactions. This is ensured by holding
684+
// versionSet.logLock for both (a) and (b).
677685
l0Organizer *manifest.L0Organizer
678686
virtualBackings *manifest.VirtualBackings
679687
// The level to target for L0 compactions. Levels L1 to baseLevel must be

internal/manifest/l0_sublevels.go

Lines changed: 68 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -410,69 +410,67 @@ func mergeIntervals(
410410
}
411411

412412
// addL0Files incrementally builds a new l0Sublevels for when the only change
413-
// since the receiver l0Sublevels was an addition of the specified files, with
413+
// since the receiver l0Sublevels was an addition of the specified tables, with
414414
// no L0 deletions. The common case of this is an ingestion or a flush. These
415415
// files can "sit on top" of existing sublevels, creating at most one new
416416
// sublevel for a flush (and possibly multiple for an ingestion), and at most
417417
// 2*len(files) additions to s.orderedIntervals. No files must have been deleted
418418
// from L0, and the added files must all be newer in sequence numbers than
419-
// existing files in l0Sublevels. The files parameter must be sorted in seqnum
420-
// order. The levelMetadata parameter corresponds to the new L0 post addition of
421-
// files. This method is meant to be significantly more performant than
422-
// newL0Sublevels.
419+
// existing files in l0Sublevels. The levelMetadata parameter corresponds to the
420+
// new L0 post addition of files. This method is meant to be significantly more
421+
// performant than newL0Sublevels.
423422
//
424423
// Note that this function can only be called once on a given receiver; it
425424
// appends to some slices in s which is only safe when done once. This is okay,
426425
// as the common case (generating a new l0Sublevels after a flush/ingestion) is
427426
// only going to necessitate one call of this method on a given receiver. The
428427
// returned value, if non-nil, can then have [*l0Sublevels.addL0Files] called on
429428
// it again, and so on. If [errInvalidL0SublevelsOpt] is returned as an error,
430-
// it likely means the optimization could not be applied (i.e. files added were
431-
// older than files already in the sublevels, which is possible around
432-
// ingestions and in tests). Eg. it can happen when an ingested file was
433-
// ingested without queueing a flush since it did not actually overlap with any
434-
// keys in the memtable. Later on the memtable was flushed, and the memtable had
435-
// keys spanning around the ingested file, producing a flushed file that
436-
// overlapped with the ingested file in file bounds but not in keys. It's
437-
// possible for that flushed file to have a lower LargestSeqNum than the
438-
// ingested file if all the additions after the ingestion were to another
439-
// flushed file that was split into a separate sstable during flush. Any other
440-
// non-nil error means [l0Sublevels] generation failed in the same way as
441-
// [newL0Sublevels] would likely fail.
429+
// it means the optimization could not be applied (i.e. files added were older
430+
// than files already in the sublevels, which is possible around ingestions and
431+
// in tests). Eg. it can happen when an ingested file was ingested without
432+
// queueing a flush since it did not actually overlap with any keys in the
433+
// memtable. Later on the memtable was flushed, and the memtable had keys
434+
// spanning around the ingested file, producing a flushed file that overlapped
435+
// with the ingested file in file bounds but not in keys. It's possible for that
436+
// flushed file to have a lower LargestSeqNum than the ingested file if all the
437+
// additions after the ingestion were to another flushed file that was split
438+
// into a separate sstable during flush. Any other non-nil error means
439+
// [l0Sublevels] generation failed in the same way as [newL0Sublevels] would
440+
// likely fail.
442441
func (s *l0Sublevels) addL0Files(
443-
files []*TableMetadata, flushSplitMaxBytes int64, levelMetadata *LevelMetadata,
442+
addedTables map[base.FileNum]*TableMetadata,
443+
flushSplitMaxBytes int64,
444+
levelMetadata *LevelMetadata,
444445
) (*l0Sublevels, error) {
445446
if s.addL0FilesCalled {
446447
if invariants.Enabled {
447448
panic("addL0Files called twice on the same receiver")
448449
}
449450
return nil, errInvalidL0SublevelsOpt
450451
}
451-
if s.levelMetadata.Len()+len(files) != levelMetadata.Len() {
452+
if s.levelMetadata.Len()+len(addedTables) != levelMetadata.Len() {
452453
if invariants.Enabled {
453454
panic("levelMetadata mismatch")
454455
}
455456
return nil, errInvalidL0SublevelsOpt
456457
}
458+
457459
// addL0Files only works when the files we are adding match exactly the last
458-
// files in the new levelMetadata (and the previous s.levelMetadata matches a
459-
// prefix of the new levelMetadata).
460-
{
461-
iterOld, iterNew := s.levelMetadata.Iter(), levelMetadata.Iter()
462-
tOld, tNew := iterOld.First(), iterNew.First()
463-
for i := 0; i < s.levelMetadata.Len(); i++ {
464-
if tOld != tNew {
465-
return nil, errInvalidL0SublevelsOpt
466-
}
467-
tOld, tNew = iterOld.Next(), iterNew.Next()
468-
}
469-
for i := range files {
470-
if files[i] != tNew {
471-
return nil, errInvalidL0SublevelsOpt
472-
}
473-
tNew = iterNew.Next()
460+
// files in the new levelMetadata (this is the case usually, but not always).
461+
files := make([]*TableMetadata, len(addedTables))
462+
iter := levelMetadata.Iter()
463+
t := iter.Last()
464+
for i := len(addedTables) - 1; i >= 0; i-- {
465+
if addedTables[t.FileNum] == nil {
466+
// t is an existing table that sorts after some of the new tables
467+
// (specifically the ones we haven't yet seen).
468+
return nil, errInvalidL0SublevelsOpt
474469
}
470+
files[i] = t
471+
t = iter.Prev()
475472
}
473+
// Note: files now contains addedTables in increasing seqnum order.
476474
s.addL0FilesCalled = true
477475

478476
// Start with a shallow copy of s.
@@ -2077,15 +2075,16 @@ func (s *l0Sublevels) extendCandidateToRectangle(
20772075
// L0Organizer keeps track of L0 state, including the subdivision into
20782076
// sublevels.
20792077
//
2080-
// It is designed to be used as a singleton (per store) which gets updated as
2078+
// It is designed to be used as a singleton (per DB) which gets updated as
20812079
// the version changes. It is used to initialize L0-related Version fields.
20822080
//
20832081
// The level 0 sstables are organized in a series of sublevels. Similar to the
2084-
// seqnum invariant in normal levels, there is no internal key in a higher level
2085-
// table that has both the same user key and a higher sequence number. Within a
2086-
// sublevel, tables are sorted by their internal key range and any two tables at
2087-
// the same sublevel do not overlap. Unlike the normal levels, sublevel n
2088-
// contains older tables (lower sequence numbers) than sublevel n+1.
2082+
// seqnum invariant in normal levels, there is no internal key in a lower
2083+
// sublevel table that has both the same user key and a higher sequence number.
2084+
// Within a sublevel, tables are sorted by their internal key range and any two
2085+
// tables at the same sublevel do not overlap. Unlike the normal levels,
2086+
// sublevel n contains older tables (lower sequence numbers) than sublevel n+1
2087+
// (this is because the number of sublevels is variable).
20892088
type L0Organizer struct {
20902089
cmp base.Compare
20912090
formatKey base.FormatKey
@@ -2125,7 +2124,9 @@ func NewL0Organizer(comparer *base.Comparer, flushSplitBytes int64) *L0Organizer
21252124
return o
21262125
}
21272126

2128-
// SublevelFiles returns the sublevels as LevelSlices.
2127+
// SublevelFiles returns the sublevels as LevelSlices. The returned value (both
2128+
// the slice and each LevelSlice) is immutable. The L0Organizer creates new
2129+
// slices every time L0 changes.
21292130
func (o *L0Organizer) SublevelFiles() []LevelSlice {
21302131
return o.l0Sublevels.Levels
21312132
}
@@ -2146,45 +2147,31 @@ func (o *L0Organizer) Update(
21462147
}
21472148
// If we only added tables, try to use addL0Files.
21482149
if len(deletedL0Tables) == 0 {
2149-
// Construct the file slice needed by addL0Files.
2150-
// TODO(radu): change addL0Files to do this internally.
2151-
files := make([]*TableMetadata, 0, len(addedL0Tables))
2152-
iter := newLevelMeta.Iter()
2153-
for t := iter.Last(); len(files) < len(addedL0Tables); t = iter.Prev() {
2154-
if t == nil || addedL0Tables[t.FileNum] == nil {
2155-
break
2156-
}
2157-
files = append(files, t)
2158-
}
2159-
if len(files) == len(addedL0Tables) {
2160-
slices.Reverse(files)
2161-
newSublevels, err := o.l0Sublevels.addL0Files(files, o.flushSplitBytes, newLevelMeta)
2162-
2163-
if err == nil {
2164-
// In invariants mode, sometimes rebuild from scratch to verify that
2165-
// AddL0Files did the right thing. Note that NewL0Sublevels updates
2166-
// fields in TableMetadata like L0Index, so we don't want to do this
2167-
// every time.
2168-
if invariants.Enabled && invariants.Sometimes(10) {
2169-
expectedSublevels, err := newL0Sublevels(newLevelMeta, o.cmp, o.formatKey, o.flushSplitBytes)
2170-
if err != nil {
2171-
panic(fmt.Sprintf("error when regenerating sublevels: %s", err))
2172-
}
2173-
s1 := describeSublevels(o.formatKey, false /* verbose */, expectedSublevels.Levels)
2174-
s2 := describeSublevels(o.formatKey, false /* verbose */, newSublevels.Levels)
2175-
if s1 != s2 {
2176-
// Add verbosity.
2177-
s1 := describeSublevels(o.formatKey, true /* verbose */, expectedSublevels.Levels)
2178-
s2 := describeSublevels(o.formatKey, true /* verbose */, newSublevels.Levels)
2179-
panic(fmt.Sprintf("incremental L0 sublevel generation produced different output than regeneration: %s != %s", s1, s2))
2180-
}
2150+
newSublevels, err := o.l0Sublevels.addL0Files(addedL0Tables, o.flushSplitBytes, newLevelMeta)
2151+
if err == nil {
2152+
// In invariants mode, sometimes rebuild from scratch to verify that
2153+
// AddL0Files did the right thing. Note that NewL0Sublevels updates
2154+
// fields in TableMetadata like L0Index, so we don't want to do this
2155+
// every time.
2156+
if invariants.Enabled && invariants.Sometimes(10) {
2157+
expectedSublevels, err := newL0Sublevels(newLevelMeta, o.cmp, o.formatKey, o.flushSplitBytes)
2158+
if err != nil {
2159+
panic(fmt.Sprintf("error when regenerating sublevels: %s", err))
2160+
}
2161+
s1 := describeSublevels(o.formatKey, false /* verbose */, expectedSublevels.Levels)
2162+
s2 := describeSublevels(o.formatKey, false /* verbose */, newSublevels.Levels)
2163+
if s1 != s2 {
2164+
// Add verbosity.
2165+
s1 := describeSublevels(o.formatKey, true /* verbose */, expectedSublevels.Levels)
2166+
s2 := describeSublevels(o.formatKey, true /* verbose */, newSublevels.Levels)
2167+
panic(fmt.Sprintf("incremental L0 sublevel generation produced different output than regeneration: %s != %s", s1, s2))
21812168
}
2182-
o.l0Sublevels = newSublevels
2183-
return
2184-
}
2185-
if !errors.Is(err, errInvalidL0SublevelsOpt) {
2186-
panic(errors.AssertionFailedf("error generating L0Sublevels: %s", err))
21872169
}
2170+
o.l0Sublevels = newSublevels
2171+
return
2172+
}
2173+
if !errors.Is(err, errInvalidL0SublevelsOpt) {
2174+
panic(errors.AssertionFailedf("error generating L0Sublevels: %s", err))
21882175
}
21892176
}
21902177
var err error
@@ -2194,8 +2181,8 @@ func (o *L0Organizer) Update(
21942181
}
21952182
}
21962183

2197-
// Reset the L0Organizer to reflect a given L0 level. Used for testing.
2198-
func (o *L0Organizer) Reset(levelMetadata *LevelMetadata) {
2184+
// ResetForTesting reinitializes the L0Organizer to reflect a given L0 level.
2185+
func (o *L0Organizer) ResetForTesting(levelMetadata *LevelMetadata) {
21992186
o.levelMetadata = *levelMetadata
22002187
var err error
22012188
o.l0Sublevels, err = newL0Sublevels(levelMetadata, o.cmp, o.formatKey, o.flushSplitBytes)

internal/manifest/l0_sublevels_test.go

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -232,7 +232,7 @@ func TestL0Sublevels(t *testing.T) {
232232
}
233233
explicitSublevels = [][]*TableMetadata{}
234234
sublevel := -1
235-
addedL0Files := make([]*TableMetadata, 0)
235+
addedL0Files := make(map[base.FileNum]*TableMetadata)
236236
for _, data := range strings.Split(td.Input, "\n") {
237237
data = strings.TrimSpace(data)
238238
switch data[:2] {
@@ -260,7 +260,7 @@ func TestL0Sublevels(t *testing.T) {
260260
}
261261
fileMetas[level] = append(fileMetas[level], meta)
262262
if level == 0 {
263-
addedL0Files = append(addedL0Files, meta)
263+
addedL0Files[meta.FileNum] = meta
264264
}
265265
if sublevel != -1 {
266266
for len(explicitSublevels) <= sublevel {
@@ -294,7 +294,6 @@ func TestL0Sublevels(t *testing.T) {
294294
levelMetadata := MakeLevelMetadata(base.DefaultComparer.Compare, 0, fileMetas[0])
295295
if initialize {
296296
if addL0FilesOpt {
297-
SortBySeqNum(addedL0Files)
298297
sublevels, err = sublevels.addL0Files(addedL0Files, int64(flushSplitMaxBytes), &levelMetadata)
299298
// Check if the output matches a full initialization.
300299
sublevels2, _ := newL0Sublevels(&levelMetadata, base.DefaultComparer.Compare, base.DefaultFormatter, int64(flushSplitMaxBytes))
@@ -507,7 +506,7 @@ func TestAddL0FilesEquivalence(t *testing.T) {
507506
// The outer loop runs once for each version edit. The inner loop(s) run
508507
// once for each file, or each file bound.
509508
for i := 0; i < 100; i++ {
510-
var filesToAdd []*TableMetadata
509+
filesToAdd := make(map[base.FileNum]*TableMetadata)
511510
numFiles := 1 + rng.IntN(9)
512511
keys := make([][]byte, 0, 2*numFiles)
513512
for j := 0; j < 2*numFiles; j++ {
@@ -539,7 +538,7 @@ func TestAddL0FilesEquivalence(t *testing.T) {
539538
)
540539
meta.InitPhysicalBacking()
541540
fileMetas = append(fileMetas, meta)
542-
filesToAdd = append(filesToAdd, meta)
541+
filesToAdd[meta.FileNum] = meta
543542
}
544543
if len(filesToAdd) == 0 {
545544
continue
@@ -556,7 +555,6 @@ func TestAddL0FilesEquivalence(t *testing.T) {
556555
// that of the previous L0Sublevels. So it must be called before
557556
// newL0Sublevels; calling it the other way around results in
558557
// out-of-bounds panics.
559-
SortBySeqNum(filesToAdd)
560558
s2, err = s2.addL0Files(filesToAdd, flushSplitMaxBytes, &levelMetadata)
561559
require.NoError(t, err)
562560
}

internal/manifest/version.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1214,7 +1214,7 @@ func NewVersionForTesting(
12141214
v.Levels[l].totalSize += f.Size
12151215
}
12161216
}
1217-
l0Organizer.Reset(&v.Levels[0])
1217+
l0Organizer.ResetForTesting(&v.Levels[0])
12181218
v.L0SublevelFiles = l0Organizer.SublevelFiles()
12191219
return v
12201220
}

internal/manifest/version_edit_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -461,7 +461,7 @@ func TestVersionEditApply(t *testing.T) {
461461
// Reinitialize the L0 organizer in case we want to use the same version
462462
// again (l0Organizer now reflects newv).
463463
l0Organizer = NewL0Organizer(base.DefaultComparer, flushSplitBytes)
464-
l0Organizer.Reset(&v.Levels[0])
464+
l0Organizer.ResetForTesting(&v.Levels[0])
465465
l0Organizers[name] = l0Organizer
466466

467467
return newv.DebugString()

0 commit comments

Comments
 (0)