Skip to content

Commit 01636b7

Browse files
committed
sstable: allow SetWithDelete keys to be considered as MVCC garbage
This patch allows the `SetWithDelete` key kind to be eligible for MVCC garbage consideration. This effectively allows `SetWithDelete` keys to be separated into value blocks and blob files. Fixes: #4424
1 parent 19f9afc commit 01636b7

File tree

5 files changed

+75
-34
lines changed

5 files changed

+75
-34
lines changed

internal/compact/run.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -338,7 +338,6 @@ func (r *Runner) writeKeysToTable(
338338
isLikelyMVCCGarbage := func() bool {
339339
return sstable.IsLikelyMVCCGarbage(kv.K.UserKey, prevKeyKind, kv.K.Kind(), valueLen, prefixEqual)
340340
}
341-
prevKeyKind = kv.K.Kind()
342341
// Add the value to the sstable, possibly separating its value into a
343342
// blob file. The ValueSeparation implementation is responsible for
344343
// writing the KV to the sstable.
@@ -348,6 +347,7 @@ func (r *Runner) writeKeysToTable(
348347
if err := valueSeparation.Add(tw, kv, r.iter.ForceObsoleteDueToRangeDel(), isLikelyMVCCGarbage); err != nil {
349348
return nil, err
350349
}
350+
prevKeyKind = kv.K.Kind()
351351
if r.iter.SnapshotPinned() {
352352
// The kv pair we just added to the sstable was only surfaced by
353353
// the compaction iterator because an open snapshot prevented

sstable/colblk_writer.go

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1277,8 +1277,8 @@ func (w *RawColumnWriter) copyProperties(props Properties) {
12771277
//
12781278
// We require:
12791279
//
1280-
// . The previous key to be a SET.
1281-
// . The current key to be a SET.
1280+
// . The previous key to be a SET/SETWITHDEL.
1281+
// . The current key to be a SET/SETWITHDEL.
12821282
// . The value to be sufficiently large. (Currently we simply require a
12831283
// non-zero length, so all non-empty values are eligible for storage
12841284
// out-of-band in a value block.)
@@ -1294,8 +1294,11 @@ func IsLikelyMVCCGarbage(
12941294
prefixEqual func(k []byte) bool,
12951295
) bool {
12961296
const tinyValueThreshold = 0
1297-
return prevKeyKind == InternalKeyKindSet &&
1298-
keyKind == InternalKeyKindSet &&
1297+
isSetStarKind := func(k base.InternalKeyKind) bool {
1298+
return k == InternalKeyKindSet || k == InternalKeyKindSetWithDelete
1299+
}
1300+
return isSetStarKind(prevKeyKind) &&
1301+
isSetStarKind(keyKind) &&
12991302
valueLen > tinyValueThreshold &&
13001303
prefixEqual(k)
13011304
}

sstable/testdata/writer_value_blocks

Lines changed: 45 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,73 +1,95 @@
11
# Size of value index is 3 bytes plus 5 + 5 = 10 bytes of trailer of the value
2-
# block and value index block. So size 18 - 13 = 5 size of the value in the
2+
# block and value index block. So size 17 - 13 = 4 is the size of the value in the
33
# value block.
4+
# N.B. The only key kind eligible for separation is `SET`.
45
build table-format=Pebble,v4
56
a@2.SET.1:a2
67
b@5.SET.7:b5
78
b@4.DEL.3:
89
b@3.SET.2:bat3
9-
b@2.SET.1:vbat2
10+
b@2.SETWITHDEL.1:bat2
11+
b@1.SET.1:bat1
12+
b@0.SET.0:bat0
1013
----
11-
value-blocks: num-values 1, num-blocks: 1, size: 18
14+
value-blocks: num-values 1, num-blocks: 1, size: 17
1215

1316
scan-raw
1417
----
1518
a@2#1,SET:in-place a2, same-pre false
1619
b@5#7,SET:in-place b5, same-pre false
1720
b@4#3,DEL:
1821
b@3#2,SET:in-place bat3, same-pre false
19-
b@2#1,SET:value-handle len 5 block 0 offset 0, att 5, same-pre true
22+
b@2#1,SETWITHDEL:bat2
23+
b@1#1,SET:in-place bat1, same-pre false
24+
b@0#0,SET:value-handle len 4 block 0 offset 0, att 4, same-pre true
2025

2126
scan
2227
----
2328
a@2#1,SET:a2
2429
b@5#7,SET:b5
2530
b@4#3,DEL:
2631
b@3#2,SET:bat3
27-
b@2#1,SET:vbat2
32+
b@2#1,SETWITHDEL:bat2
33+
b@1#1,SET:bat1
34+
b@0#0,SET:bat0
2835

2936
scan-cloned-lazy-values
3037
----
3138
0(in-place: len 2): a2
3239
1(in-place: len 2): b5
3340
2(in-place: len 0):
3441
3(in-place: len 4): bat3
35-
4(lazy: len 5, attr: 5): vbat2
42+
4(in-place: len 4): bat2
43+
5(in-place: len 4): bat1
44+
6(lazy: len 4, attr: 4): bat0
3645

3746
# Repeat the above test with (Pebble,v5) [columnar blocks].
47+
# N.B. The key kinds eligible for separation when writing columnar blocks are
48+
# `SET` and `SETWITHDEL`.
3849

50+
# Since we end up separating the SETWITHDEL key, this means 2 more values are
51+
# stored (each with a 4 byte value). This increases the size of the value block
52+
# by 8 bytes, making the total size 25.
3953
build table-format=Pebble,v5
4054
a@2.SET.1:a2
4155
b@5.SET.7:b5
4256
b@4.DEL.3:
4357
b@3.SET.2:bat3
44-
b@2.SET.1:vbat2
58+
b@2.SETWITHDEL.1:bat2
59+
b@1.SET.1:bat1
60+
b@0.SET.0:bat0
4561
----
46-
value-blocks: num-values 1, num-blocks: 1, size: 18
62+
value-blocks: num-values 3, num-blocks: 1, size: 25
4763

4864
scan
4965
----
5066
a@2#1,SET:a2
5167
b@5#7,SET:b5
5268
b@4#3,DEL:
5369
b@3#2,SET:bat3
54-
b@2#1,SET:vbat2
70+
b@2#1,SETWITHDEL:bat2
71+
b@1#1,SET:bat1
72+
b@0#0,SET:bat0
5573

5674
scan-cloned-lazy-values
5775
----
5876
0(in-place: len 2): a2
5977
1(in-place: len 2): b5
6078
2(in-place: len 0):
6179
3(in-place: len 4): bat3
62-
4(lazy: len 5, attr: 5): vbat2
80+
4(lazy: len 4, attr: 4): bat2
81+
5(lazy: len 4, attr: 4): bat1
82+
6(lazy: len 4, attr: 4): bat0
6383

6484
# Same data as previous, with disable-value-blocks set to true
6585
build disable-value-blocks=true table-format=Pebble,v4
6686
a@2.SET.1:a2
6787
b@5.SET.7:b5
6888
b@4.DEL.3:
6989
b@3.SET.2:bat3
70-
b@2.SET.1:vbat2
90+
b@2.SETWITHDEL.1:bat2
91+
b@1.SET.1:bat1
92+
b@0.SET.0:bat0
7193
----
7294
value-blocks: num-values 0, num-blocks: 0, size: 0
7395

@@ -77,15 +99,19 @@ a@2#1,SET:in-place a2, same-pre false
7799
b@5#7,SET:in-place b5, same-pre false
78100
b@4#3,DEL:
79101
b@3#2,SET:in-place bat3, same-pre false
80-
b@2#1,SET:in-place vbat2, same-pre true
102+
b@2#1,SETWITHDEL:bat2
103+
b@1#1,SET:in-place bat1, same-pre false
104+
b@0#0,SET:in-place bat0, same-pre true
81105

82106
scan
83107
----
84108
a@2#1,SET:a2
85109
b@5#7,SET:b5
86110
b@4#3,DEL:
87111
b@3#2,SET:bat3
88-
b@2#1,SET:vbat2
112+
b@2#1,SETWITHDEL:bat2
113+
b@1#1,SET:bat1
114+
b@0#0,SET:bat0
89115

90116
# Same as above but with (Pebble,v5) [columnar blocks].
91117

@@ -94,7 +120,9 @@ a@2.SET.1:a2
94120
b@5.SET.7:b5
95121
b@4.DEL.3:
96122
b@3.SET.2:bat3
97-
b@2.SET.1:vbat2
123+
b@2.SETWITHDEL.1:bat2
124+
b@1.SET.1:bat1
125+
b@0.SET.0:bat0
98126
----
99127
value-blocks: num-values 0, num-blocks: 0, size: 0
100128

@@ -104,7 +132,9 @@ a@2#1,SET:a2
104132
b@5#7,SET:b5
105133
b@4#3,DEL:
106134
b@3#2,SET:bat3
107-
b@2#1,SET:vbat2
135+
b@2#1,SETWITHDEL:bat2
136+
b@1#1,SET:bat1
137+
b@0#0,SET:bat0
108138

109139
# Size of value index is 3 bytes plus 5 + 5 = 10 bytes of trailer of the value
110140
# block and value index block. So size 33 - 13 = 20 is the total size of the

testdata/compaction/mvcc_garbage_blob

Lines changed: 22 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -3,26 +3,36 @@
33
define value-separation=(true, 5, 3, 0s, 1.0)
44
----
55

6-
batch
7-
set bar bar
8-
set foo foo
9-
set fuzz fuzz
10-
set yaya yaya
11-
----
12-
136
batch
147
set yay@3 a
158
set yay@2 ab
16-
set zoo@3 b
17-
set zoo@2 ba
9+
set zoo@4 b
10+
set zoo@3 ba
11+
del zoo@2
12+
set zoo@2 bag
13+
set zoo@1 bah
1814
----
1915

2016
# This flush *should* write a blob file for our MVCC garbage values, containing
21-
# 2 values: "ab" and "ba" - totaling 4 bytes of logical values.
17+
# 4 values: "ab", "ba", "bag", "bah" - totaling 10 bytes of logical values.
18+
# N.B. `del zoo@2, set zoo@2 bag` transforms into `setwithdel zoo@2 bag`.
19+
flush
20+
----
21+
L0.0:
22+
000005:[yay@3#10,SET-zoo@1#16,SET] seqnums:[10-16] points:[yay@3#10,SET-zoo@1#16,SET] size:827 blobrefs:[(B000006: 10); depth:1]
23+
Blob files:
24+
B000006 physical:{000006 size:[102 (102B)] vals:[10 (10B)]}
25+
26+
batch
27+
del yuumi@2
28+
set yuumi@1 ba
29+
----
2230

2331
flush
2432
----
33+
L0.1:
34+
000008:[yuumi@2#17,DEL-yuumi@1#18,SET] seqnums:[17-18] points:[yuumi@2#17,DEL-yuumi@1#18,SET] size:707
2535
L0.0:
26-
000005:[bar#10,SET-zoo@2#17,SET] seqnums:[10-17] points:[bar#10,SET-zoo@2#17,SET] size:834 blobrefs:[(B000006: 4); depth:1]
36+
000005:[yay@3#10,SET-zoo@1#16,SET] seqnums:[10-16] points:[yay@3#10,SET-zoo@1#16,SET] size:827 blobrefs:[(B000006: 10); depth:1]
2737
Blob files:
28-
B000006 physical:{000006 size:[94 (94B)] vals:[4 (4B)]}
38+
B000006 physical:{000006 size:[102 (102B)] vals:[10 (10B)]}

value_separation.go

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -251,8 +251,6 @@ func (vs *writeNewBlobFiles) Add(
251251

252252
// Values that are too small are never separated; however, MVCC keys are
253253
// separated if they are a SET or SETWITHDEL key kind, as long as the value is not empty.
254-
//
255-
// TODO(annie): Also allow SetWithDelete keys to be separated.
256254
if len(v) < vs.minimumSize && !isLikelyMVCCGarbage() {
257255
return tw.Add(kv.K, v, forceObsolete)
258256
}

0 commit comments

Comments
 (0)