diff --git a/storage/engine/gc.go b/storage/engine/gc.go index 542ac1024eaf..31f6eab18690 100644 --- a/storage/engine/gc.go +++ b/storage/engine/gc.go @@ -44,7 +44,18 @@ func MakeGarbageCollector(now roachpb.Timestamp, policy config.GCPolicy) Garbage // garbage collection policy for batches of values for the same key. // Returns the timestamp including, and after which, all values should // be garbage collected. If no values should be GC'd, returns -// roachpb.ZeroTimestamp. +// roachpb.ZeroTimestamp. keys must be in descending time order. +// Values deleted at or before the returned timestamp can be deleted without +// invalidating any reads in the time interval (gc.expiration, \infinity). +// +// The GC keeps all values (including deletes) above the expiration time, plus +// the first value before or at the expiration time. This allows reads to be +// guaranteed as described above. However if this were the only rule, then +// if the most recent write was a delete, it would never be removed. Thus, +// when a deleted value is the most recent before expiration, it can be +// deleted. This would still allow for the tombstone bugs in #6227, so in +// the future we will add checks that disallow writes before the last GC +// expiration time. func (gc GarbageCollector) Filter(keys []MVCCKey, values [][]byte) roachpb.Timestamp { if gc.policy.TTLSeconds <= 0 { return roachpb.ZeroTimestamp @@ -54,34 +65,31 @@ func (gc GarbageCollector) Filter(keys []MVCCKey, values [][]byte) roachpb.Times } // Loop over values. All should be MVCC versions. + var i int + var key MVCCKey delTS := roachpb.ZeroTimestamp - survivors := false - for i, key := range keys { + for i, key = range keys { if !key.IsValue() { log.Errorf("unexpected MVCC metadata encountered: %q", key) return roachpb.ZeroTimestamp } - deleted := len(values[i]) == 0 - if i == 0 { - // If the first value isn't a deletion tombstone, don't consider - // it for GC. It should always survive if non-deleted. - if !deleted { - survivors = true - continue - } + if gc.expiration.Less(key.Timestamp) { + continue } - // If we encounter a version older than our GC timestamp, mark for deletion. - if key.Timestamp.Less(gc.expiration) { + // Now key.Timestamp is <= gc.expiration, but the key-value pair is still + // "visible" at timestamp gc.expiration (and up to the next version). + if deleted := len(values[i]) == 0; deleted { + // We don't have to keep a delete visible (since GCing it does not change + // the outcome of the read). Note however that we can't touch deletes at + // higher timestamps immediately preceding this one, since they're above + // gc.expiration and are needed for correctness; see #6227. delTS = key.Timestamp - break - } else if !deleted { - survivors = true + } else if i+1 < len(keys) { + // Otherwise mark the previous timestamp for deletion (since it won't ever + // be returned for reads at gc.expiration and up). + delTS = keys[i+1].Timestamp } - } - // If there are no non-deleted survivors, return timestamp of first key - // to delete all entries. - if !survivors { - return keys[0].Timestamp + break } return delTS } diff --git a/storage/engine/gc_test.go b/storage/engine/gc_test.go index 3f3a6defcd0a..fa2ffcc12f7d 100644 --- a/storage/engine/gc_test.go +++ b/storage/engine/gc_test.go @@ -58,12 +58,13 @@ func TestGarbageCollectorFilter(t *testing.T) { expDelTS roachpb.Timestamp }{ {gcA, makeTS(0, 0), aKeys, [][]byte{n, n, n}, roachpb.ZeroTimestamp}, - {gcA, makeTS(0, 0), aKeys, [][]byte{d, d, d}, makeTS(2E9, 0)}, + {gcA, makeTS(0, 0), aKeys, [][]byte{d, d, d}, roachpb.ZeroTimestamp}, {gcB, makeTS(0, 0), bKeys, [][]byte{n, n}, roachpb.ZeroTimestamp}, - {gcB, makeTS(0, 0), bKeys, [][]byte{d, d}, makeTS(2E9, 0)}, + {gcB, makeTS(0, 0), bKeys, [][]byte{d, d}, roachpb.ZeroTimestamp}, {gcA, makeTS(1E9, 0), aKeys, [][]byte{n, n, n}, roachpb.ZeroTimestamp}, {gcB, makeTS(1E9, 0), bKeys, [][]byte{n, n}, roachpb.ZeroTimestamp}, {gcA, makeTS(2E9, 0), aKeys, [][]byte{n, n, n}, roachpb.ZeroTimestamp}, + {gcA, makeTS(2E9, 0), aKeys, [][]byte{d, d, d}, makeTS(1E9, 0)}, {gcB, makeTS(2E9, 0), bKeys, [][]byte{n, n}, roachpb.ZeroTimestamp}, {gcA, makeTS(3E9, 0), aKeys, [][]byte{n, n, n}, makeTS(1E9, 1)}, {gcA, makeTS(3E9, 0), aKeys, [][]byte{d, n, n}, makeTS(2E9, 0)}, diff --git a/storage/gc_queue_test.go b/storage/gc_queue_test.go index b02578ba5532..f479224c50ca 100644 --- a/storage/gc_queue_test.go +++ b/storage/gc_queue_test.go @@ -162,6 +162,7 @@ func TestGCQueueProcess(t *testing.T) { ts1 := makeTS(now-2*24*60*60*1E9+1, 0) // 2d old (add one nanosecond so we're not using zero timestamp) ts2 := makeTS(now-25*60*60*1E9, 0) // GC will occur at time=25 hours + ts2m1 := ts2.Prev() // ts2 - 1 so we have something not right at the GC time ts3 := makeTS(now-intentAgeThreshold.Nanoseconds(), 0) // 2h old ts4 := makeTS(now-(intentAgeThreshold.Nanoseconds()-1), 0) // 2h-1ns old ts5 := makeTS(now-1E9, 0) // 1s old @@ -174,6 +175,8 @@ func TestGCQueueProcess(t *testing.T) { key7 := roachpb.Key("g") key8 := roachpb.Key("h") key9 := roachpb.Key("i") + key10 := roachpb.Key("j") + key11 := roachpb.Key("k") data := []struct { key roachpb.Key @@ -181,13 +184,13 @@ func TestGCQueueProcess(t *testing.T) { del bool txn bool }{ - // For key1, we expect first two values to GC. + // For key1, we expect first value to GC. {key1, ts1, false, false}, {key1, ts2, false, false}, {key1, ts5, false, false}, - // For key2, we expect all values to GC, because most recent is deletion. + // For key2, we expect values to GC, even though most recent is deletion. {key2, ts1, false, false}, - {key2, ts2, false, false}, + {key2, ts2m1, false, false}, // use a value < the GC time to verify it's kept {key2, ts5, true, false}, // For key3, we expect just ts1 to GC, because most recent deletion is intent. {key3, ts1, false, false}, @@ -198,7 +201,7 @@ func TestGCQueueProcess(t *testing.T) { {key4, ts2, false, false}, // For key5, expect all values to GC (most recent value deleted). {key5, ts1, false, false}, - {key5, ts2, true, false}, + {key5, ts2, true, false}, // deleted, so GC // For key6, expect no values to GC because most recent value is intent. {key6, ts1, false, false}, {key6, ts5, false, true}, @@ -209,7 +212,17 @@ func TestGCQueueProcess(t *testing.T) { {key8, ts2, false, false}, {key8, ts3, true, true}, // For key9, resolve naked intent with no remaining values. - {key9, ts3, true, false}, + {key9, ts3, false, true}, + // For key10, GC ts1 because it's a delete but not ts3 because it's above the threshold. + {key10, ts1, true, false}, + {key10, ts3, true, false}, + {key10, ts4, false, false}, + {key10, ts5, false, false}, + // For key11, we can't GC anything because ts1 isn't a delete. + {key11, ts1, false, false}, + {key11, ts3, true, false}, + {key11, ts4, true, false}, + {key11, ts5, true, false}, } for i, datum := range data { @@ -260,6 +273,9 @@ func TestGCQueueProcess(t *testing.T) { ts roachpb.Timestamp }{ {key1, ts5}, + {key1, ts2}, + {key2, ts5}, + {key2, ts2m1}, {key3, roachpb.ZeroTimestamp}, {key3, ts5}, {key3, ts2}, @@ -271,6 +287,13 @@ func TestGCQueueProcess(t *testing.T) { {key7, ts4}, {key7, ts2}, {key8, ts2}, + {key10, ts5}, + {key10, ts4}, + {key10, ts3}, + {key11, ts5}, + {key11, ts4}, + {key11, ts3}, + {key11, ts1}, } // Read data directly from engine to avoid intent errors from MVCC. kvs, err := engine.Scan(tc.store.Engine(), engine.MakeMVCCMetadataKey(key1),