Skip to content

Commit 2c57303

Browse files
committed
db: maintain a Version reference during compactions
Today, a compaction has an associated version (c.version) that is used sparingly in a few compaction edge cases. This version is never referenced and may become unreferenced and cleared while a compaction is running. This has generally been okay, because compaction picking must explicitly coordinate with running compactions so that sstables that are inputs to compactions aren't inputs to some other compactions. With the introduction of blob files (#112) and blob file rewriting, a compaction may need to use the Version's BlobFileSet to lookup the mapping from a BlobFileID to a physical disk file number. To ensure the BlobFileSet is still valid to read, the Version needs to remain referenced. This version ref-ing also allows us to remove some existing, subtle code that would reference an arbitrary latest version when a compaction began so that excises could read sstables without their removal.
1 parent 8e93f83 commit 2c57303

File tree

1 file changed

+96
-61
lines changed

1 file changed

+96
-61
lines changed

compaction.go

Lines changed: 96 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -369,6 +369,10 @@ func (c *compaction) userKeyBounds() base.UserKeyBounds {
369369

370370
type getValueSeparation func(JobID, *compaction, sstable.TableFormat) compact.ValueSeparation
371371

372+
// newCompaction constructs a compaction from the provided picked compaction.
373+
//
374+
// The compaction is created with a reference to its version that must be
375+
// released when the compaction is complete.
372376
func newCompaction(
373377
pc *pickedCompaction,
374378
opts *Options,
@@ -397,6 +401,21 @@ func newCompaction(
397401
grantHandle: grantHandle,
398402
tableFormat: tableFormat,
399403
}
404+
// Acquire a reference to the version to ensure that files and in-memory
405+
// version state necessary for reading files remain available. Ignoring
406+
// excises, this isn't strictly necessary for reading the sstables that are
407+
// inputs to the compaction because those files are 'marked as compacting'
408+
// and shouldn't be subject to any competing compactions. However with
409+
// excises, a concurrent excise may remove a compaction's file from the
410+
// Version and then cancel the compaction. The file shouldn't be physically
411+
// removed until the cancelled compaction stops reading it.
412+
//
413+
// Additionally, we need any blob files referenced by input sstables to
414+
// remain available, even if the blob file is rewritten. Maintaining a
415+
// reference ensures that all these files remain available for the
416+
// compaction's reads.
417+
c.version.Ref()
418+
400419
c.startLevel = &c.inputs[0]
401420
if pc.startLevel.l0SublevelInfo != nil {
402421
c.startLevel.l0SublevelInfo = pc.startLevel.l0SublevelInfo
@@ -501,6 +520,11 @@ func (c *compaction) maybeSwitchToMoveOrCopy(
501520
}
502521
}
503522

523+
// newDeleteOnlyCompaction constructs a delete-only compaction from the provided
524+
// inputs.
525+
//
526+
// The compaction is created with a reference to its version that must be
527+
// released when the compaction is complete.
504528
func newDeleteOnlyCompaction(
505529
opts *Options,
506530
cur *manifest.Version,
@@ -523,6 +547,20 @@ func newDeleteOnlyCompaction(
523547
exciseEnabled: exciseEnabled,
524548
grantHandle: noopGrantHandle{},
525549
}
550+
// Acquire a reference to the version to ensure that files and in-memory
551+
// version state necessary for reading files remain available. Ignoring
552+
// excises, this isn't strictly necessary for reading the sstables that are
553+
// inputs to the compaction because those files are 'marked as compacting'
554+
// and shouldn't be subject to any competing compactions. However with
555+
// excises, a concurrent excise may remove a compaction's file from the
556+
// Version and then cancel the compaction. The file shouldn't be physically
557+
// removed until the cancelled compaction stops reading it.
558+
//
559+
// Additionally, we need any blob files referenced by input sstables to
560+
// remain available, even if the blob file is rewritten. Maintaining a
561+
// reference ensures that all these files remain available for the
562+
// compaction's reads.
563+
c.version.Ref()
526564

527565
// Set c.smallest, c.largest.
528566
files := make([]iter.Seq[*manifest.TableMetadata], 0, len(inputs))
@@ -600,6 +638,17 @@ func adjustGrandparentOverlapBytesForFlush(c *compaction, flushingBytes uint64)
600638
}
601639
}
602640

641+
// newFlush creates the state necessary for a flush (modeled with the compaction
642+
// struct).
643+
//
644+
// newFlush takes the current Version in order to populate grandparent flushing
645+
// limits, but it does not reference the version.
646+
//
647+
// TODO(jackson): Consider maintaining a reference to the version anyways since
648+
// in the future in-memory Version state may only be available while a Version
649+
// is referenced (eg, if we start recycling B-Tree nodes once they're no longer
650+
// referenced). There's subtlety around unref'ing the version at the right
651+
// moment, so we defer it for now.
603652
func newFlush(
604653
opts *Options,
605654
cur *manifest.Version,
@@ -1341,7 +1390,7 @@ func (d *DB) runIngestFlush(c *compaction) (*manifest.VersionEdit, error) {
13411390

13421391
// Finding the target level for ingestion must use the latest version
13431392
// after the logLock has been acquired.
1344-
c.version = d.mu.versions.currentVersion()
1393+
version := d.mu.versions.currentVersion()
13451394

13461395
baseLevel := d.mu.versions.picker.getBaseLevel()
13471396
ve := &manifest.VersionEdit{}
@@ -1380,7 +1429,7 @@ func (d *DB) runIngestFlush(c *compaction) (*manifest.VersionEdit, error) {
13801429
logger: d.opts.Logger,
13811430
Category: categoryIngest,
13821431
},
1383-
v: c.version,
1432+
v: version,
13841433
}
13851434
replacedTables := make(map[base.TableNum][]manifest.NewTableEntry)
13861435
for _, file := range ingestFlushable.files {
@@ -1426,7 +1475,7 @@ func (d *DB) runIngestFlush(c *compaction) (*manifest.VersionEdit, error) {
14261475
if ingestFlushable.exciseSpan.Valid() {
14271476
exciseBounds := ingestFlushable.exciseSpan.UserKeyBounds()
14281477
// Iterate through all levels and find files that intersect with exciseSpan.
1429-
for layer, ls := range c.version.AllLevelsAndSublevels() {
1478+
for layer, ls := range version.AllLevelsAndSublevels() {
14301479
for m := range ls.Overlaps(d.cmp, ingestFlushable.exciseSpan.UserKeyBounds()).All() {
14311480
leftTable, rightTable, err := d.exciseTable(context.TODO(), exciseBounds, m, layer.Level(), tightExciseBounds)
14321481
if err != nil {
@@ -2406,23 +2455,43 @@ func (d *DB) compactionPprofLabels(c *compaction) pprof.LabelSet {
24062455
// compact runs one compaction and maybe schedules another call to compact.
24072456
func (d *DB) compact(c *compaction, errChannel chan error) {
24082457
pprof.Do(context.Background(), d.compactionPprofLabels(c), func(context.Context) {
2409-
d.mu.Lock()
2410-
c.grantHandle.Started()
2411-
if err := d.compact1(c, errChannel); err != nil {
2412-
d.handleCompactFailure(c, err)
2413-
}
2414-
if c.isDownload {
2415-
d.mu.compact.downloadingCount--
2416-
} else {
2417-
d.mu.compact.compactingCount--
2418-
}
2419-
delete(d.mu.compact.inProgress, c)
2420-
// Add this compaction's duration to the cumulative duration. NB: This
2421-
// must be atomic with the above removal of c from
2422-
// d.mu.compact.InProgress to ensure Metrics.Compact.Duration does not
2423-
// miss or double count a completing compaction's duration.
2424-
d.mu.compact.duration += d.timeNow().Sub(c.beganAt)
2425-
d.mu.Unlock()
2458+
func() {
2459+
d.mu.Lock()
2460+
defer d.mu.Unlock()
2461+
jobID := d.newJobIDLocked()
2462+
2463+
c.grantHandle.Started()
2464+
compactErr := d.compact1(jobID, c)
2465+
// The version stored in the compaction is ref'd when the
2466+
// compaction is created. We're responsible for un-refing it
2467+
// when the compaction is complete.
2468+
//
2469+
// Unreferencing the version may have accumulated obsolete
2470+
// files, so we schedule a deletion of obsolete files.
2471+
c.version.UnrefLocked()
2472+
d.deleteObsoleteFiles(jobID)
2473+
// We send on the error channel only after we've deleted
2474+
// obsolete files so that tests performing manual compactions
2475+
// block until the obsolete files are deleted, and the test
2476+
// observes the deletion.
2477+
if errChannel != nil {
2478+
errChannel <- compactErr
2479+
}
2480+
if compactErr != nil {
2481+
d.handleCompactFailure(c, compactErr)
2482+
}
2483+
if c.isDownload {
2484+
d.mu.compact.downloadingCount--
2485+
} else {
2486+
d.mu.compact.compactingCount--
2487+
}
2488+
delete(d.mu.compact.inProgress, c)
2489+
// Add this compaction's duration to the cumulative duration. NB: This
2490+
// must be atomic with the above removal of c from
2491+
// d.mu.compact.InProgress to ensure Metrics.Compact.Duration does not
2492+
// miss or double count a completing compaction's duration.
2493+
d.mu.compact.duration += d.timeNow().Sub(c.beganAt)
2494+
}()
24262495
// Done must not be called while holding any lock that needs to be
24272496
// acquired by Schedule. Also, it must be called after new Version has
24282497
// been installed, and metadata related to compactingCount and inProgress
@@ -2440,10 +2509,12 @@ func (d *DB) compact(c *compaction, errChannel chan error) {
24402509
// scheduled, so we also need to call maybeScheduleCompaction. And
24412510
// maybeScheduleCompaction encompasses all compactions, and not only those
24422511
// scheduled via the CompactionScheduler.
2443-
d.mu.Lock()
2444-
d.maybeScheduleCompaction()
2445-
d.mu.compact.cond.Broadcast()
2446-
d.mu.Unlock()
2512+
func() {
2513+
d.mu.Lock()
2514+
defer d.mu.Unlock()
2515+
d.maybeScheduleCompaction()
2516+
d.mu.compact.cond.Broadcast()
2517+
}()
24472518
})
24482519
}
24492520

@@ -2538,14 +2609,7 @@ func (d *DB) cleanupVersionEdit(ve *manifest.VersionEdit) {
25382609
//
25392610
// d.mu must be held when calling this, but the mutex may be dropped and
25402611
// re-acquired during the course of this method.
2541-
func (d *DB) compact1(c *compaction, errChannel chan error) (err error) {
2542-
if errChannel != nil {
2543-
defer func() {
2544-
errChannel <- err
2545-
}()
2546-
}
2547-
2548-
jobID := d.newJobIDLocked()
2612+
func (d *DB) compact1(jobID JobID, c *compaction) (err error) {
25492613
info := c.makeInfo(jobID)
25502614
d.opts.EventListener.CompactionBegin(info)
25512615
startTime := d.timeNow()
@@ -2610,7 +2674,6 @@ func (d *DB) compact1(c *compaction, errChannel chan error) (err error) {
26102674
d.updateReadStateLocked(d.opts.DebugCheck)
26112675
d.updateTableStatsLocked(ve.NewTables)
26122676
}
2613-
d.deleteObsoleteFiles(jobID)
26142677

26152678
return err
26162679
}
@@ -2689,13 +2752,6 @@ func (d *DB) runCopyCompaction(
26892752
newMeta.InitPhysicalBacking()
26902753
}
26912754

2692-
// Before dropping the db mutex, grab a ref to the current version. This
2693-
// prevents any concurrent excises from deleting files that this compaction
2694-
// needs to read/maintain a reference to.
2695-
vers := d.mu.versions.currentVersion()
2696-
vers.Ref()
2697-
defer vers.UnrefLocked()
2698-
26992755
// NB: The order here is reversed, lock after unlock. This is similar to
27002756
// runCompaction.
27012757
d.mu.Unlock()
@@ -3051,15 +3107,6 @@ func (d *DB) runCompaction(
30513107
}
30523108
switch c.kind {
30533109
case compactionKindDeleteOnly:
3054-
// Before dropping the db mutex, grab a ref to the current version. This
3055-
// prevents any concurrent excises from deleting files that this compaction
3056-
// needs to read/maintain a reference to.
3057-
//
3058-
// Note that delete-only compactions can call excise(), which needs to be able
3059-
// to read these files.
3060-
vers := d.mu.versions.currentVersion()
3061-
vers.Ref()
3062-
defer vers.UnrefLocked()
30633110
// Release the d.mu lock while doing I/O.
30643111
// Note the unusual order: Unlock and then Lock.
30653112
snapshots := d.mu.snapshots.toSlice()
@@ -3076,18 +3123,6 @@ func (d *DB) runCompaction(
30763123

30773124
snapshots := d.mu.snapshots.toSlice()
30783125

3079-
if c.flushing == nil {
3080-
// Before dropping the db mutex, grab a ref to the current version. This
3081-
// prevents any concurrent excises from deleting files that this compaction
3082-
// needs to read/maintain a reference to.
3083-
//
3084-
// Note that unlike user iterators, compactionIter does not maintain a ref
3085-
// of the version or read state.
3086-
vers := d.mu.versions.currentVersion()
3087-
vers.Ref()
3088-
defer vers.UnrefLocked()
3089-
}
3090-
30913126
// Release the d.mu lock while doing I/O.
30923127
// Note the unusual order: Unlock and then Lock.
30933128
d.mu.Unlock()

0 commit comments

Comments
 (0)