
Commit 97df4e1

db: compactions: calculate eventual output level
We improve the compaction code to check whether the levels below the output level have any overlap with the compaction bounds; when they don't, we adjust the output file size and other sstable writer options to correspond to the eventual level (after move compactions).
1 parent 437775a commit 97df4e1
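A minimal sketch of the rule this commit introduces, under stated assumptions: eventualOutputLevel here is an illustrative free function (not Pebble's struct field), hasOverlap stands in for Version.HasOverlap, and numLevels mirrors manifest.NumLevels from the diffs below.

// Sketch of the "eventual output level" rule: size compaction outputs for the
// deepest consecutive level below the picked output level that has no overlap
// with the compaction bounds, since move compactions will push them there.
package main

import "fmt"

const numLevels = 7 // mirrors manifest.NumLevels in Pebble

func eventualOutputLevel(outputLevel int, hasOverlap func(level int) bool) int {
	eventual := outputLevel
	// Intra-L0 compactions keep their picked level (see the TODO in the diff).
	if outputLevel == 0 {
		return eventual
	}
	// While the next level down has no overlap with the compaction bounds, any
	// output table is guaranteed to later be move-compacted into it, so size
	// the outputs for that deeper level instead.
	for eventual < numLevels-1 && !hasOverlap(eventual+1) {
		eventual++
	}
	return eventual
}

func main() {
	// Modeled on the testdata case at the end of this commit: a compaction into
	// L1 whose bounds overlap nothing in L2..L6, so outputs are sized for L6.
	noOverlapBelow := func(level int) bool { return false }
	fmt.Println(eventualOutputLevel(1, noOverlapBelow)) // prints 6
}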

6 files changed: 115 additions, 18 deletions


compaction.go

Lines changed: 40 additions & 11 deletions
@@ -270,6 +270,15 @@ type tableCompaction struct {

 	inputs []compactionLevel

+	// eventualOutputLevel is normally outputLevel.level, unless
+	// outputLevel.level+1 has no overlap with the compaction bounds (in which
+	// case it is the bottom-most consecutive level with no such overlap).
+	//
+	// Because of move compactions, we know that any sstables produced by this
+	// compaction will be later moved to eventualOutputLevel. So we use
+	// eventualOutputLevel when determining the target file size, compression
+	// options, etc.
+	eventualOutputLevel int
 	// maxOutputFileSize is the maximum size of an individual table created
 	// during compaction.
 	maxOutputFileSize uint64
@@ -280,9 +289,10 @@ type tableCompaction struct {
 	// The boundaries of the input data.
 	bounds base.UserKeyBounds

-	// grandparents are the tables in level+2 that overlap with the files being
-	// compacted. Used to determine output table boundaries. Do not assume that the actual files
-	// in the grandparent when this compaction finishes will be the same.
+	// grandparents are the tables in eventualOutputLevel+2 that overlap with the
+	// files being compacted. Used to determine output table boundaries. Do not
+	// assume that the actual files in the grandparent when this compaction
+	// finishes will be the same.
 	grandparents manifest.LevelSlice

 	delElision compact.TombstoneElision
@@ -572,7 +582,18 @@ func newCompaction(
 		grantHandle: grantHandle,
 	}

-	targetFileSize := opts.TargetFileSize(pc.outputLevel.level, pc.baseLevel)
+	// Determine eventual output level.
+	c.eventualOutputLevel = pc.outputLevel.level
+	// TODO(radu): for intra-L0 compactions, we could check if the compaction
+	// includes all L0 files within the bounds.
+	if pc.outputLevel.level != 0 {
+		for c.eventualOutputLevel < manifest.NumLevels-1 && !c.version.HasOverlap(c.eventualOutputLevel+1, c.bounds) {
+			// All output tables are guaranteed to be moved down.
+			c.eventualOutputLevel++
+		}
+	}
+
+	targetFileSize := opts.TargetFileSize(c.eventualOutputLevel, pc.baseLevel)
 	c.maxOutputFileSize = uint64(targetFileSize)
 	c.maxOverlapBytes = maxGrandparentOverlapBytes(targetFileSize)

@@ -607,8 +628,8 @@ func newCompaction(
 	}
 	// Compute the set of outputLevel+1 files that overlap this compaction (these
 	// are the grandparent sstables).
-	if c.outputLevel.level+1 < numLevels {
-		c.grandparents = c.version.Overlaps(max(c.outputLevel.level+1, pc.baseLevel), c.bounds)
+	if c.eventualOutputLevel < manifest.NumLevels-1 {
+		c.grandparents = c.version.Overlaps(max(c.eventualOutputLevel+1, pc.baseLevel), c.bounds)
 	}
 	c.delElision, c.rangeKeyElision = compact.SetupTombstoneElision(
 		c.comparer.Compare, c.version, pc.l0Organizer, c.outputLevel.level, c.bounds,
@@ -666,7 +687,10 @@ func (c *tableCompaction) maybeSwitchToMoveOrCopy(
 	// We avoid a move or copy if there is lots of overlapping grandparent data.
 	// Otherwise, the move could create a parent file that will require a very
 	// expensive merge later on.
-	if c.grandparents.AggregateSizeSum() > c.maxOverlapBytes {
+	//
+	// Note that if eventualOutputLevel != outputLevel, there are no
+	// "grandparents" on the output level.
+	if c.eventualOutputLevel == c.outputLevel.level && c.grandparents.AggregateSizeSum() > c.maxOverlapBytes {
 		return
 	}

@@ -840,13 +864,18 @@ func newFlush(
 		logger: opts.Logger,
 		inputs: []compactionLevel{{level: -1}, {level: 0}},
 		getValueSeparation: getValueSeparation,
-		maxOutputFileSize: math.MaxUint64,
-		maxOverlapBytes: math.MaxUint64,
-		grantHandle: noopGrantHandle{},
+		// TODO(radu): consider calculating the eventual output level for flushes.
+		// We expect the bounds to be very wide in practice, but perhaps we can do a
+		// finer-grained overlap analysis.
+		eventualOutputLevel: 0,
+		maxOutputFileSize: math.MaxUint64,
+		maxOverlapBytes: math.MaxUint64,
+		grantHandle: noopGrantHandle{},
 		metrics: compactionMetrics{
 			beganAt: beganAt,
 		},
 	}
+
 	c.flush.flushables = flushing
 	c.flush.l0Limits = l0Organizer.FlushSplitKeys()
 	c.startLevel = &c.inputs[0]
@@ -3440,7 +3469,7 @@ func (d *DB) compactAndWrite(
 		}
 		spanPolicyValid = true
 	}
-	writerOpts := d.makeWriterOptions(c.outputLevel.level)
+	writerOpts := d.makeWriterOptions(c.eventualOutputLevel)
 	if spanPolicy.ValueStoragePolicy.DisableSeparationBySuffix {
 		writerOpts.DisableValueBlocks = true
 	}

compaction_picker_test.go

Lines changed: 3 additions & 3 deletions
@@ -702,13 +702,13 @@ func TestCompactionPickerL0(t *testing.T) {
 	if ptc == nil {
 		return "no compaction"
 	}
-	c := newCompaction(ptc, opts, time.Now(), nil /* provider */, noopGrantHandle{}, sstable.TableFormatMinSupported, neverSeparateValues)
+	c := newCompaction(ptc, opts, time.Now(), nil /* provider */, noopGrantHandle{}, noSharedStorage, neverSeparateValues)
 	return fmt.Sprintf("%d", c.maxOutputFileSize)
 case "max-overlap-bytes":
 	if ptc == nil {
 		return "no compaction"
 	}
-	c := newCompaction(ptc, opts, time.Now(), nil /* provider */, noopGrantHandle{}, sstable.TableFormatMinSupported, neverSeparateValues)
+	c := newCompaction(ptc, opts, time.Now(), nil /* provider */, noopGrantHandle{}, noSharedStorage, neverSeparateValues)
 	return fmt.Sprintf("%d", c.maxOverlapBytes)
 }
 return fmt.Sprintf("unrecognized command: %s", td.Cmd)
@@ -1269,7 +1269,7 @@ func TestCompactionOutputFileSize(t *testing.T) {
 ptc := pc.(*pickedTableCompaction)
 fmt.Fprintf(&buf, "L%d -> L%d\n", ptc.startLevel.level, ptc.outputLevel.level)
 fmt.Fprintf(&buf, "L%d: %s\n", ptc.startLevel.level, tableNums(ptc.startLevel.files))
-c := newCompaction(ptc, opts, time.Now(), nil /* provider */, noopGrantHandle{}, sstable.TableFormatMinSupported, neverSeparateValues)
+c := newCompaction(ptc, opts, time.Now(), nil /* provider */, noopGrantHandle{}, noSharedStorage, neverSeparateValues)
 fmt.Fprintf(&buf, "maxOutputFileSize: %d\n", c.maxOutputFileSize)
 } else {
 return "nil"

data_test.go

Lines changed: 8 additions & 4 deletions
@@ -1429,31 +1429,35 @@ func runLayoutCmd(t *testing.T, td *datadriven.TestData, d *DB) string {
 func runPopulateCmd(t *testing.T, td *datadriven.TestData, b *Batch) {
 	var maxKeyLength, valLength int
 	var timestamps []int
+	var prefix string
 	td.ScanArgs(t, "keylen", &maxKeyLength)
 	td.MaybeScanArgs(t, "timestamps", &timestamps)
 	td.MaybeScanArgs(t, "vallen", &valLength)
+	td.MaybeScanArgs(t, "prefix", &prefix)
 	// Default to writing timestamps @1.
 	if len(timestamps) == 0 {
 		timestamps = append(timestamps, 1)
 	}

 	ks := testkeys.Alpha(maxKeyLength)
-	buf := make([]byte, ks.MaxLen()+testkeys.MaxSuffixLen)
+	buf := make([]byte, len(prefix)+ks.MaxLen()+testkeys.MaxSuffixLen)
+	copy(buf, prefix)
 	vbuf := make([]byte, valLength)
 	for i := uint64(0); i < ks.Count(); i++ {
 		for _, ts := range timestamps {
-			n := testkeys.WriteKeyAt(buf, ks, i, int64(ts))
+			n := testkeys.WriteKeyAt(buf[len(prefix):], ks, i, int64(ts))
+			key := buf[:len(prefix)+n]

 			// Default to using the key as the value, but if the user provided
 			// the vallen argument, generate a random value of the specified
 			// length.
-			value := buf[:n]
+			value := key
 			if valLength > 0 {
 				_, err := crand.Read(vbuf)
 				require.NoError(t, err)
 				value = vbuf
 			}
-			require.NoError(t, b.Set(buf[:n], value, nil))
+			require.NoError(t, b.Set(key, value, nil))
 		}
 	}
 }

internal/manifest/level_metadata.go

Lines changed: 7 additions & 0 deletions
@@ -415,6 +415,13 @@ func (ls LevelSlice) Overlaps(cmp Compare, bounds base.UserKeyBounds) LevelSlice
 	return newBoundedLevelSlice(startIter.iter.clone(), &startIter.iter, &endIter.iter)
 }

+// HasOverlap is equivalent to ls.Overlaps(cmp, bounds).Len() > 0 but is more efficient.
+func (ls LevelSlice) HasOverlap(cmp Compare, bounds base.UserKeyBounds) bool {
+	iter := ls.Iter()
+	t := iter.SeekGE(cmp, bounds.Start)
+	return t != nil && bounds.End.IsUpperBoundFor(cmp, t.Smallest().UserKey)
+}
+
 // KeyType is used to specify the type of keys we're looking for in
 // LevelIterator positioning operations. Files not containing any keys of the
 // desired type are skipped.

internal/manifest/version.go

Lines changed: 14 additions & 0 deletions
@@ -582,6 +582,20 @@ func (v *Version) Overlaps(level int, bounds base.UserKeyBounds) LevelSlice {
 	return v.Levels[level].Slice().Overlaps(v.cmp.Compare, bounds)
 }

+// HasOverlap is equivalent to v.Overlaps(level, bounds).Len() > 0 but is more
+// efficient.
+func (v *Version) HasOverlap(level int, bounds base.UserKeyBounds) bool {
+	if level == 0 {
+		for sublevel := range v.L0SublevelFiles {
+			if v.L0SublevelFiles[sublevel].HasOverlap(v.cmp.Compare, bounds) {
+				return true
+			}
+		}
+		return false
+	}
+	return v.Levels[level].Slice().HasOverlap(v.cmp.Compare, bounds)
+}
+
 // AllLevelsAndSublevels returns an iterator that produces a Layer, LevelSlice
 // pair for each L0 sublevel (from top to bottom) and each level below L0.
 func (v *Version) AllLevelsAndSublevels() iter.Seq2[Layer, LevelSlice] {
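As a side note on the design, the LevelSlice.HasOverlap check above needs only a single seek: find the first table whose range reaches the start bound, then test that table's smallest key against the end bound. Below is a small standalone model of that check, not Pebble's API; the table type, the hasOverlap helper, and the exclusive end bound are assumptions of this sketch.

package main

import (
	"fmt"
	"sort"
	"strings"
)

// table models an sstable's user-key range [smallest, largest] in a level
// below L0: tables are sorted by key and non-overlapping.
type table struct{ smallest, largest string }

// hasOverlap mirrors the SeekGE-based check: seek to the first table whose
// largest key is >= start, then report whether its smallest key falls below
// the (here exclusive) end bound.
func hasOverlap(level []table, start, end string) bool {
	i := sort.Search(len(level), func(i int) bool {
		return strings.Compare(level[i].largest, start) >= 0
	})
	return i < len(level) && strings.Compare(level[i].smallest, end) < 0
}

func main() {
	l6 := []table{{"a", "a"}}             // as in the testdata below: L6 holds only "a"
	fmt.Println(hasOverlap(l6, "b", "c")) // false: nothing overlaps bounds [b, c)
}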

testdata/compaction/l0_to_lbase_compaction

Lines changed: 43 additions & 0 deletions
@@ -119,3 +119,46 @@
 other files | 0 (0B) | 0 (0B)
 ----
 ----
+
+define target-file-sizes=(1, 1, 1, 1000) hide-size
+L0
+  ba#2,SET:v
+L0
+  bb#2,SET:v
+L0
+  ba#1,SET:v
+L0
+  bb#1,SET:v
+L4
+  a#3,SET:v
+L5
+  a#2,SET:v
+L6
+  a#1,SET:v
+----
+L0.1:
+  000004:[ba#2,SET-ba#2,SET]
+  000005:[bb#2,SET-bb#2,SET]
+L0.0:
+  000006:[ba#1,SET-ba#1,SET]
+  000007:[bb#1,SET-bb#1,SET]
+L4:
+  000008:[a#3,SET-a#3,SET]
+L5:
+  000009:[a#2,SET-a#2,SET]
+L6:
+  000010:[a#1,SET-a#1,SET]
+
+# The result of this compaction requires two files on all levels except L6. Even
+# though we are compacting into L1, we are using the L6 target file size,
+# knowing that the result will eventually be moved.
+compact b-c
+----
+L1:
+  000011:[ba#0,SET-bb#0,SET]
+L4:
+  000008:[a#3,SET-a#3,SET]
+L5:
+  000009:[a#2,SET-a#2,SET]
+L6:
+  000010:[a#1,SET-a#1,SET]
