Skip to content

Commit 8498142

Browse files
committed
manifest: don't compact blob files with uncertain outstanding references
When a virtual table is created from an existing sstable, the virtual table inherits the backing sstable's blob references. The BlobReference's ValueSize is computed by linearly interpolating according to the approximate virtual table size. This could be grossly inaccurate. In #5214, we began tracking a virtual table's blob references' backing value size and using this to determine a blob file's eligibility for blob file rewrite compactions. Using the backing's value size is pessimistic, which avoids fruitlessly rewriting a blob file that is actually fully referenced. The BackingValueSize introduced by #5214 is only populated for DBs at FormatBackingValueSize and later. We've observed instances (#5306) where blob file rewrite compactions repeatedly rewrite the same blob file due to this mismatch. This problem was exacerbated by the introduction of high-priority blob file rewrite compactions (#5258). This commit fixes this issue by considering ineligible for rewrite any blob files with outstanding references from virtual tables that don't have a BackingValueSize. Informs #5306.
1 parent 2309bb8 commit 8498142

File tree

2 files changed

+40
-26
lines changed

2 files changed

+40
-26
lines changed

internal/manifest/blob_metadata.go

Lines changed: 36 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -544,6 +544,18 @@ type currentBlobFile struct {
544544
// since virtual sstables contribute the referenced value size of the original
545545
// backing table.
546546
referencedValueSize uint64
547+
// virtualReferencesWithoutBackingValueSize is the count of virtual tables
548+
// referencing this blob file that do not have a BackingValueSize. The
549+
// BackingValueSize is only populated for format major version
550+
// FormatBackingValueSize and beyond. Virtual tables created before this
551+
// format major version have no accounting of the backing table's references
552+
// to this blob file, so the referencedValueSize is unreliable.
553+
//
554+
// If virtualReferencesWithoutBackingValueSize is positive, the blob file is
555+
// considered ineligible for blob rewrite compactions, because we cannot be
556+
// sure that rewriting the blob file will actually allow us to elide any
557+
// values.
558+
virtualReferencesWithoutBackingValueSize uint64
547559
// heapState holds a pointer to the heap that the blob file is in (if any)
548560
// and its index in the heap's items slice. If referencedValueSize is less
549561
// than metadata.Physical.ValueSize, the blob file belongs in one of the two
@@ -556,6 +568,16 @@ type currentBlobFile struct {
556568
}
557569
}
558570

571+
func (cbf *currentBlobFile) isEligibleForRewrite() bool {
572+
// The blob file must not be fully referenced. Additionally it must not have
573+
// an outstanding reference from a virtual table without a BackingValueSize,
574+
// which would cause the referencedValueSize to be unreliable (and we would
575+
// not be able to guarantee rewriting the blob file would actually elide any
576+
// values).
577+
return cbf.referencedValueSize < cbf.metadata.Physical.ValueSize &&
578+
cbf.virtualReferencesWithoutBackingValueSize == 0
579+
}
580+
559581
// BlobRewriteHeuristic configures the heuristic used to determine which blob
560582
// files should be rewritten and replaced in order to reduce value-separation
561583
// induced space amplification.
@@ -642,12 +664,13 @@ func (s *CurrentBlobFileSet) Init(bve *BulkVersionEdit, h BlobRewriteHeuristic)
642664
panic(errors.AssertionFailedf("pebble: referenced blob file %d not found", ref.FileID))
643665
}
644666
cbf.references[table] = struct{}{}
645-
refSize := ref.BackingValueSize
646-
if refSize == 0 {
647-
refSize = ref.ValueSize
667+
refSize := max(ref.BackingValueSize, ref.ValueSize)
668+
if table.Virtual && ref.BackingValueSize == 0 {
669+
cbf.virtualReferencesWithoutBackingValueSize++
648670
}
649671
cbf.referencedValueSize += refSize
650672
s.stats.ReferencedValueSize += ref.ValueSize
673+
s.stats.ReferencedBackingValueSize += refSize
651674
s.stats.ReferencesCount++
652675
}
653676
}
@@ -663,9 +686,7 @@ func (s *CurrentBlobFileSet) Init(bve *BulkVersionEdit, h BlobRewriteHeuristic)
663686
s.rewrite.candidates.items = make([]*currentBlobFile, 0, 16)
664687
s.rewrite.recentlyCreated.items = make([]*currentBlobFile, 0, 16)
665688
for _, cbf := range s.files {
666-
if cbf.referencedValueSize >= cbf.metadata.Physical.ValueSize {
667-
// The blob file is fully referenced. There's no need to track it in
668-
// either heap.
689+
if !cbf.isEligibleForRewrite() {
669690
continue
670691
}
671692

@@ -771,7 +792,7 @@ func (s *CurrentBlobFileSet) ApplyAndUpdateVersionEdit(ve *VersionEdit) error {
771792
// referenced. Additional references could have been removed while
772793
// the blob file rewrite was occurring. We need to re-add it to the
773794
// heap of recently created files.
774-
if cbf.referencedValueSize < cbf.metadata.Physical.ValueSize {
795+
if cbf.isEligibleForRewrite() {
775796
heap.Push(&s.rewrite.recentlyCreated, cbf)
776797
}
777798
continue
@@ -797,10 +818,9 @@ func (s *CurrentBlobFileSet) ApplyAndUpdateVersionEdit(ve *VersionEdit) error {
797818
return errors.AssertionFailedf("pebble: referenced blob file %d not found", ref.FileID)
798819
}
799820
cbf.references[e.Meta] = struct{}{}
800-
refSize := ref.BackingValueSize
801-
if refSize == 0 {
802-
// Fall back to using ValueSize to estimate reference size.
803-
refSize = ref.ValueSize
821+
refSize := max(ref.BackingValueSize, ref.ValueSize)
822+
if e.Meta.Virtual && ref.BackingValueSize == 0 {
823+
cbf.virtualReferencesWithoutBackingValueSize++
804824
}
805825
cbf.referencedValueSize += refSize
806826
s.stats.ReferencedValueSize += ref.ValueSize
@@ -830,10 +850,9 @@ func (s *CurrentBlobFileSet) ApplyAndUpdateVersionEdit(ve *VersionEdit) error {
830850
// Decrement the stats for this reference. We decrement even if the
831851
// table is being moved, because we incremented the stats when we
832852
// iterated over the version edit's new tables.
833-
refSize := ref.BackingValueSize
834-
if refSize == 0 {
835-
// Fall back to using ValueSize to estimate reference size.
836-
refSize = ref.ValueSize
853+
refSize := max(ref.BackingValueSize, ref.ValueSize)
854+
if meta.Virtual && ref.BackingValueSize == 0 {
855+
cbf.virtualReferencesWithoutBackingValueSize--
837856
}
838857
cbf.referencedValueSize -= refSize
839858
s.stats.ReferencedBackingValueSize -= refSize
@@ -880,8 +899,8 @@ func (s *CurrentBlobFileSet) ApplyAndUpdateVersionEdit(ve *VersionEdit) error {
880899
continue
881900
}
882901

883-
if cbf.referencedValueSize >= cbf.metadata.Physical.ValueSize {
884-
// This blob file is fully referenced.
902+
if !cbf.isEligibleForRewrite() {
903+
// This blob file is not eligible for rewrite.
885904
continue
886905
}
887906

testdata/compaction/value_separation

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -490,18 +490,13 @@ L6:
490490
Blob files:
491491
B000006 physical:{000006 size:[109 (109B)] vals:[18 (18B)]}
492492

493-
# Run a blob-rewrite compaction. It'll rewrite the blob file, but it won't
494-
# actually reclaim disk space. This is a known limitation due to the lack of
495-
# accurate blob value liveness for virtual sstables. See
496-
# https://github.com/cockroachdb/pebble/issues/4915.
493+
# Attempt to run a blob-rewrite compaction. We won't be able to guarantee that a
494+
# rewrite would reclaim disk space because of the virtual table's reference
495+
# (also, in reality it wouldn't), so we won't schedule the compaction.
497496

498497
run-blob-rewrite-compaction
499498
----
500-
L6:
501-
000007(000005):[a#10,SET-a#10,SET] seqnums:[10-12] points:[a#10,SET-a#10,SET] size:104(767) blobrefs:[(B000006: 2); depth:1]
502-
000008(000005):[c#12,SET-c#12,SET] seqnums:[10-12] points:[c#12,SET-c#12,SET] size:104(767) blobrefs:[(B000006: 2); depth:1]
503-
Blob files:
504-
B000006 physical:{000009 size:[110 (110B)] vals:[18 (18B)]}
499+
no blob file rewrite compaction
505500

506501
validate-blob-reference-index-block
507502
000007.sst

0 commit comments

Comments
 (0)