Skip to content

Commit 8875a68

Browse files
committed
db: separate mvcc garbage into blob files
This patch separates SET keys that have already appeared in the sstable into a blob file. Informs: #4424
1 parent e9c5152 commit 8875a68

File tree

11 files changed

+149
-44
lines changed

11 files changed

+149
-44
lines changed

blob_rewrite_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ func TestBlobRewrite(t *testing.T) {
120120
} else {
121121
ikv.V = base.MakeInPlaceValue([]byte(parts[1]))
122122
}
123-
require.NoError(t, vs.Add(tw, &ikv, false /* forceObsolete */))
123+
require.NoError(t, vs.Add(tw, &ikv, false /* forceObsolete */, func() bool { return false } /* isLikeyMVCCGarbage */))
124124
}
125125
return buf.String()
126126
case "close-output":

compaction_test.go

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -908,7 +908,7 @@ func TestCompaction(t *testing.T) {
908908
fmt.Fprintln(&compactionLog, info.String())
909909
},
910910
}
911-
reset := func(minVersion, maxVersion FormatMajorVersion) {
911+
reset := func(minVersion, maxVersion FormatMajorVersion, cmp *Comparer) {
912912
compactionLog.Reset()
913913
if d != nil {
914914
require.NoError(t, closeAllSnapshots(d))
@@ -924,6 +924,7 @@ func TestCompaction(t *testing.T) {
924924
EventListener: compactionLogEventListener,
925925
FormatMajorVersion: randVersion(minVersion, maxVersion),
926926
Logger: testutils.Logger{T: t},
927+
Comparer: cmp,
927928
}
928929
opts.WithFSDefaults()
929930
opts.Experimental.CompactionScheduler = NewConcurrencyLimitSchedulerWithNoPeriodicGrantingForTest()
@@ -971,13 +972,13 @@ func TestCompaction(t *testing.T) {
971972
d.mu.compact.compactingCount--
972973
}
973974

974-
runTest := func(t *testing.T, testData string, minVersion, maxVersion FormatMajorVersion, verbose bool) {
975-
reset(minVersion, maxVersion)
975+
runTest := func(t *testing.T, testData string, minVersion, maxVersion FormatMajorVersion, verbose bool, cmp *Comparer) {
976+
reset(minVersion, maxVersion, cmp)
976977
var ongoingCompaction *tableCompaction
977978
datadriven.RunTest(t, testData, func(t *testing.T, td *datadriven.TestData) string {
978979
switch td.Cmd {
979980
case "reset":
980-
reset(minVersion, maxVersion)
981+
reset(minVersion, maxVersion, cmp)
981982
return ""
982983

983984
case "batch":
@@ -1028,6 +1029,9 @@ func TestCompaction(t *testing.T) {
10281029
DisableAutomaticCompactions: true,
10291030
Logger: testutils.Logger{T: t},
10301031
}
1032+
if cmp != nil {
1033+
opts.Comparer = cmp
1034+
}
10311035
opts.WithFSDefaults()
10321036
opts.Experimental.CompactionScheduler = NewConcurrencyLimitSchedulerWithNoPeriodicGrantingForTest()
10331037
if d != nil {
@@ -1453,6 +1457,7 @@ func TestCompaction(t *testing.T) {
14531457
minVersion FormatMajorVersion // inclusive, FormatMinSupported if unspecified.
14541458
maxVersion FormatMajorVersion // inclusive, internalFormatNewest if unspecified.
14551459
verbose bool
1460+
cmp *Comparer
14561461
}
14571462
testConfigs := map[string]testConfig{
14581463
"singledel_set_with_del": {},
@@ -1483,6 +1488,12 @@ func TestCompaction(t *testing.T) {
14831488
maxVersion: FormatValueSeparation,
14841489
verbose: true,
14851490
},
1491+
"mvcc_garbage_blob": {
1492+
minVersion: FormatValueSeparation,
1493+
maxVersion: FormatValueSeparation,
1494+
verbose: true,
1495+
cmp: testkeys.Comparer,
1496+
},
14861497
"score_compaction_picked_before_manual": {
14871498
// Run at a specific version, so that a single sstable format is used,
14881499
// since the test prints the compaction log which includes file sizes.
@@ -1513,7 +1524,7 @@ func TestCompaction(t *testing.T) {
15131524
if maxVersion == 0 {
15141525
maxVersion = internalFormatNewest
15151526
}
1516-
runTest(t, path, minVersion, maxVersion, tc.verbose)
1527+
runTest(t, path, minVersion, maxVersion, tc.verbose, tc.cmp)
15171528
})
15181529
}
15191530

@@ -1592,14 +1603,14 @@ func TestCompactionDeleteOnlyHints(t *testing.T) {
15921603
// Collection of table stats can trigger compactions. As we want full
15931604
// control over when compactions are run, disable stats by default.
15941605
DisableTableStats: true,
1595-
//EventListener: &EventListener{
1606+
// EventListener: &EventListener{
15961607
// CompactionEnd: func(info CompactionInfo) {
15971608
// if compactInfo != nil {
15981609
// return
15991610
// }
16001611
// compactInfo = &info
16011612
// },
1602-
//},
1613+
// },
16031614
EventListener: &el,
16041615
FormatMajorVersion: internalFormatNewest,
16051616
Logger: testutils.Logger{T: t},

internal/compact/run.go

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ type ValueSeparation interface {
133133
EstimatedReferenceSize() uint64
134134
// Add adds the provided key-value pair to the provided sstable writer,
135135
// possibly separating the value into a blob file.
136-
Add(tw sstable.RawWriter, kv *base.InternalKV, forceObsolete bool) error
136+
Add(tw sstable.RawWriter, kv *base.InternalKV, forceObsolete bool, isLikelyMVCCGarbage func() bool) error
137137
// FinishOutput is called when a compaction is finishing an output sstable.
138138
// It returns the table's blob references, which will be added to the
139139
// table's TableMetadata, and stats and metadata describing a newly
@@ -292,8 +292,12 @@ func (r *Runner) writeKeysToTable(
292292
equalPrev := func(k []byte) bool {
293293
return tw.ComparePrev(k) == 0
294294
}
295+
prefixEqual := func(k []byte) bool {
296+
return tw.IsPrefixEqualPrev(k)
297+
}
295298
var pinnedKeySize, pinnedValueSize, pinnedCount uint64
296299
var iteratedKeys uint64
300+
var prevKeyKind base.InternalKeyKind
297301
kv := r.kv
298302
for ; kv != nil; kv = r.iter.Next() {
299303
iteratedKeys++
@@ -331,10 +335,17 @@ func (r *Runner) writeKeysToTable(
331335
}
332336

333337
valueLen := kv.V.Len()
338+
isLikelyMVCCGarbage := func() bool {
339+
return sstable.IsLikelyMVCCGarbage(kv.K.UserKey, prevKeyKind, kv.K.Kind(), valueLen, prefixEqual)
340+
}
341+
prevKeyKind = kv.K.Kind()
334342
// Add the value to the sstable, possibly separating its value into a
335343
// blob file. The ValueSeparation implementation is responsible for
336344
// writing the KV to the sstable.
337-
if err := valueSeparation.Add(tw, kv, r.iter.ForceObsoleteDueToRangeDel()); err != nil {
345+
// If the key might be garbage (all requirements of
346+
// sstable.IsLikelyMVCCGarbage are met), we eagerly separate the value
347+
// into a blob file.
348+
if err := valueSeparation.Add(tw, kv, r.iter.ForceObsoleteDueToRangeDel(), isLikelyMVCCGarbage); err != nil {
338349
return nil, err
339350
}
340351
if r.iter.SnapshotPinned() {
@@ -497,7 +508,7 @@ func (NeverSeparateValues) EstimatedReferenceSize() uint64 { return 0 }
497508

498509
// Add implements the ValueSeparation interface.
499510
func (NeverSeparateValues) Add(
500-
tw sstable.RawWriter, kv *base.InternalKV, forceObsolete bool,
511+
tw sstable.RawWriter, kv *base.InternalKV, forceObsolete bool, _ func() bool,
501512
) error {
502513
v, _, err := kv.Value(nil)
503514
if err != nil {

sstable/colblk_writer.go

Lines changed: 47 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,19 @@ func (w *RawColumnWriter) ComparePrev(k []byte) int {
256256
return int(w.dataBlock.KeyWriter.ComparePrev(k).UserKeyComparison)
257257
}
258258

259+
// IsPrefixEqualPrev compares the provided user key's prefix to the key
260+
// prefix of the last point key written to the writer.
261+
//
262+
// If no key has been written yet, IsPrefixEqualPrev returns false.
263+
//
264+
// Must not be called after Writer is closed.
265+
func (w *RawColumnWriter) IsPrefixEqualPrev(k []byte) bool {
266+
if w == nil || w.dataBlock.Rows() == 0 {
267+
return false
268+
}
269+
return w.dataBlock.KeyWriter.ComparePrev(k).PrefixEqual()
270+
}
271+
259272
// SetSnapshotPinnedProperties sets the properties for pinned keys. Should only
260273
// be used internally by Pebble.
261274
func (w *RawColumnWriter) SetSnapshotPinnedProperties(
@@ -621,30 +634,18 @@ func (w *RawColumnWriter) evaluatePoint(
621634
key.Pretty(w.comparer.FormatKey))
622635
}
623636

637+
prefixEqual := func(k []byte) bool {
638+
return w.IsPrefixEqualPrev(k)
639+
}
624640
// We might want to write this key's value to a value block if it has the
625641
// same prefix.
626642
//
627643
// We require:
628644
// . Value blocks to be enabled.
629-
// . The current key to have the same prefix as the previous key.
630-
// . The previous key to be a SET.
631-
// . The current key to be a SET.
632-
// . If there are bounds requiring some keys' values to be in-place, the
633-
// key must not fall within those bounds.
634-
// . The value to be sufficiently large. (Currently we simply require a
635-
// non-zero length, so all non-empty values are eligible for storage
636-
// out-of-band in a value block.)
637-
//
638-
// Use of 0 here is somewhat arbitrary. Given the minimum 3 byte encoding of
639-
// valueHandle, this should be > 3. But tiny values are common in test and
640-
// unlikely in production, so we use 0 here for better test coverage.
641-
const tinyValueThreshold = 0
645+
// . IsLikelyMVCCGarbage to be true; see comment for MVCC garbage criteria.
642646
useValueBlock := !w.opts.DisableValueBlocks &&
643-
eval.kcmp.PrefixEqual() &&
644-
prevKeyKind == InternalKeyKindSet &&
645-
keyKind == InternalKeyKindSet &&
646-
valueLen > tinyValueThreshold &&
647-
w.valueBlock != nil
647+
w.valueBlock != nil &&
648+
IsLikelyMVCCGarbage(key.UserKey, prevKeyKind, keyKind, valueLen, prefixEqual)
648649
if !useValueBlock {
649650
return eval, nil
650651
}
@@ -1270,3 +1271,31 @@ func (w *RawColumnWriter) copyProperties(props Properties) {
12701271
w.props.IndexSize = 0
12711272
w.props.IndexType = 0
12721273
}
1274+
1275+
// IsLikelyMVCCGarbage determines whether the given user key is likely MVCC
1276+
// garbage.
1277+
//
1278+
// We require:
1279+
//
1280+
// . The previous key to be a SET.
1281+
// . The current key to be a SET.
1282+
// . The value to be sufficiently large. (Currently we simply require a
1283+
// non-zero length, so all non-empty values are eligible for storage
1284+
// out-of-band in a value block.)
1285+
// . The current key to have the same prefix as the previous key.
1286+
//
1287+
// Use of 0 here is somewhat arbitrary. Given the minimum 3 byte encoding of
1288+
// valueHandle, this should be > 3. But tiny values are common in test and
1289+
// unlikely in production, so we use 0 here for better test coverage.
1290+
func IsLikelyMVCCGarbage(
1291+
k []byte,
1292+
prevKeyKind, keyKind base.InternalKeyKind,
1293+
valueLen int,
1294+
prefixEqual func(k []byte) bool,
1295+
) bool {
1296+
const tinyValueThreshold = 0
1297+
return prevKeyKind == InternalKeyKindSet &&
1298+
keyKind == InternalKeyKindSet &&
1299+
valueLen > tinyValueThreshold &&
1300+
prefixEqual(k)
1301+
}

sstable/rowblk_writer.go

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -458,7 +458,7 @@ type dataBlockBuf struct {
458458
}
459459

460460
func (d *dataBlockBuf) clear() {
461-
//d.blockBuf.clear()
461+
// d.blockBuf.clear()
462462
d.dataBlock.Reset()
463463

464464
d.uncompressed = nil
@@ -1395,6 +1395,19 @@ func (w *RawRowWriter) ComparePrev(k []byte) int {
13951395
return w.compare(k, w.dataBlockBuf.dataBlock.CurUserKey())
13961396
}
13971397

1398+
// IsPrefixEqualPrev compares the provided user key's prefix to the key
1399+
// prefix of the last point key written to the writer.
1400+
//
1401+
// If no key has been written yet, IsPrefixEqualPrev returns false.
1402+
//
1403+
// Must not be called after Writer is closed.
1404+
func (w *RawRowWriter) IsPrefixEqualPrev(k []byte) bool {
1405+
if w == nil || w.dataBlockBuf.dataBlock.EntryCount() == 0 {
1406+
return false
1407+
}
1408+
return bytes.Equal(w.split.Prefix(k), w.split.Prefix(w.dataBlockBuf.dataBlock.CurUserKey()))
1409+
}
1410+
13981411
// EncodeSpan encodes the keys in the given span. The span can contain either
13991412
// only RANGEDEL keys or only range keys.
14001413
//

sstable/writer.go

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -337,9 +337,9 @@ type RawWriter interface {
337337
// EstimatedSize returns the estimated size of the sstable being written if
338338
// a call to Close() was made without adding additional keys.
339339
EstimatedSize() uint64
340-
// ComparePrev compares the provided user to the last point key written to the
341-
// writer. The returned value is equivalent to Compare(key, prevKey) where
342-
// prevKey is the last point key written to the writer.
340+
// ComparePrev compares the provided user key to the last point key written
341+
// to the writer. The returned value is equivalent to Compare(key, prevKey)
342+
// where prevKey is the last point key written to the writer.
343343
//
344344
// If no key has been written yet, ComparePrev returns +1.
345345
//
@@ -354,6 +354,13 @@ type RawWriter interface {
354354
// Metadata returns the metadata for the finished sstable. Only valid to
355355
// call after the sstable has been finished.
356356
Metadata() (*WriterMetadata, error)
357+
// IsPrefixEqualPrev compares the provided user key's prefix to the key
358+
// prefix of the last point key written to the writer.
359+
//
360+
// If no key has been written yet, IsPrefixEqualPrev returns false.
361+
//
362+
// Must not be called after Writer is closed.
363+
IsPrefixEqualPrev(k []byte) bool
357364

358365
// rewriteSuffixes rewrites the table's data blocks to all contain the
359366
// provided suffix. It's specifically used for the implementation of

testdata/compaction/l0_to_lbase_compaction

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@ L6:
2929
000006:[itbr@1#159646,SET-rmbh@1#319226,SET]
3030
000007:[rmbi@1#319227,SET-zzzz@1#475263,SET]
3131

32-
3332
metrics
3433
----
3534
----
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# Set the minimum size for a separated value to 5.
2+
3+
define value-separation=(true, 5, 3, 0s, 1.0)
4+
----
5+
6+
batch
7+
set bar bar
8+
set foo foo
9+
set fuzz fuzz
10+
set yaya yaya
11+
----
12+
13+
batch
14+
set yay@3 a
15+
set yay@2 ab
16+
set zoo@3 b
17+
set zoo@2 ba
18+
----
19+
20+
# This flush *should* write a blob file for our MVCC garbage values, containing
21+
# 2 values: "ab" and "ba" - totaling 4 bytes of logical values.
22+
23+
flush
24+
----
25+
L0.0:
26+
000005:[bar#10,SET-zoo@2#17,SET] seqnums:[10-17] points:[bar#10,SET-zoo@2#17,SET] size:834 blobrefs:[(B000006: 4); depth:1]
27+
Blob files:
28+
B000006 physical:{000006 size:[94 (94B)] vals:[4 (4B)]}

testdata/compaction/value_separation

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -215,7 +215,7 @@ set w world
215215
----
216216

217217
# This flush *should* write a blob file, containing 2 values: "hello" and
218-
# "world" totalling 10 bytes of logical values.
218+
# "world" totaling 10 bytes of logical values.
219219

220220
flush
221221
----

value_separation.go

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,10 @@ func uniqueInputBlobMetadatas(
172172
}
173173

174174
// writeNewBlobFiles implements the strategy and mechanics for separating values
175-
// into external blob files.
175+
// into external blob files. We will always separate potential MVCC garbage
176+
// values into this external blob file. MVCC garbage values are determined on a
177+
// best-effort basis; see comments in sstable.IsLikelyMVCCGarbage for the
178+
// exact criteria we use.
176179
type writeNewBlobFiles struct {
177180
comparer *base.Comparer
178181
// newBlobObject constructs a new blob object for use in the compaction.
@@ -197,7 +200,7 @@ type writeNewBlobFiles struct {
197200
// short attribute extractor returns an error.
198201
invalidValueCallback func(userKey []byte, value []byte, err error)
199202

200-
// Current blob writer state
203+
// Current blob writer state.
201204
writer *blob.FileWriter
202205
objMeta objstorage.ObjectMetadata
203206

@@ -233,7 +236,7 @@ func (vs *writeNewBlobFiles) EstimatedReferenceSize() uint64 {
233236
// Add adds the provided key-value pair to the sstable, possibly separating the
234237
// value into a blob file.
235238
func (vs *writeNewBlobFiles) Add(
236-
tw sstable.RawWriter, kv *base.InternalKV, forceObsolete bool,
239+
tw sstable.RawWriter, kv *base.InternalKV, forceObsolete bool, isLikelyMVCCGarbage func() bool,
237240
) error {
238241
// We always fetch the value if we're rewriting blob files. We want to
239242
// replace any references to existing blob files with references to new blob
@@ -246,10 +249,14 @@ func (vs *writeNewBlobFiles) Add(
246249
vs.buf = v[:0]
247250
}
248251

249-
// Values that are too small are never separated.
250-
if len(v) < vs.minimumSize {
252+
// Values that are too small are never separated; however, MVCC keys are
253+
// separated if they are a SET key kind, as long as the value is not empty.
254+
//
255+
// TODO(annie): Also allow SetWithDelete keys to be separated.
256+
if len(v) < vs.minimumSize && !isLikelyMVCCGarbage() {
251257
return tw.Add(kv.K, v, forceObsolete)
252258
}
259+
253260
// Merge and deletesized keys are never separated.
254261
switch kv.K.Kind() {
255262
case base.InternalKeyKindMerge, base.InternalKeyKindDeleteSized:
@@ -405,7 +412,7 @@ func (vs *preserveBlobReferences) EstimatedReferenceSize() uint64 {
405412
// Add implements compact.ValueSeparation. This implementation will write
406413
// existing blob references to the output table.
407414
func (vs *preserveBlobReferences) Add(
408-
tw sstable.RawWriter, kv *base.InternalKV, forceObsolete bool,
415+
tw sstable.RawWriter, kv *base.InternalKV, forceObsolete bool, _ func() bool,
409416
) error {
410417
if !kv.V.IsBlobValueHandle() {
411418
// If the value is not already a blob handle (either it's in-place or in

0 commit comments

Comments
 (0)