Skip to content

Commit e4c51dc

Browse files
committed
db: permit flushable-ingest when ElevateWriteStallThresholdForFailover is true
That is, we ignore the length of the flushableList which normally decides whether a flushable-ingest is permitted. Fixes #4944
1 parent b9786c4 commit e4c51dc

File tree

4 files changed

+75
-13
lines changed

4 files changed

+75
-13
lines changed

db.go

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2630,15 +2630,6 @@ func (d *DB) maybeInduceWriteStall(b *Batch) {
26302630
for i := range d.mu.mem.queue {
26312631
size += d.mu.mem.queue[i].totalBytes()
26322632
}
2633-
// If ElevateWriteStallThresholdForFailover is true, we give an
2634-
// unlimited memory budget for memtables. This is simpler than trying to
2635-
// configure an explicit value, given that memory resources can vary.
2636-
// When using WAL failover in CockroachDB, an OOM risk is worth
2637-
// tolerating for workloads that have a strict latency SLO. Also, an
2638-
// unlimited budget here does not mean that the disk stall in the
2639-
// primary will go unnoticed until the OOM -- CockroachDB is monitoring
2640-
// disk stalls, and we expect it to fail the node after ~60s if the
2641-
// primary is stalled.
26422633
if size >= uint64(d.opts.MemTableStopWritesThreshold)*d.opts.MemTableSize &&
26432634
!d.mu.log.manager.ElevateWriteStallThresholdForFailover() {
26442635
// We have filled up the current memtable, but already queued memtables

ingest.go

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1592,7 +1592,19 @@ func (d *DB) ingest(ctx context.Context, args ingestArgs) (IngestOperationStats,
15921592
// files.
15931593
hasRemoteFiles := len(shared) > 0 || len(external) > 0
15941594
canIngestFlushable := d.FormatMajorVersion() >= FormatFlushableIngest &&
1595-
(len(d.mu.mem.queue) < d.opts.MemTableStopWritesThreshold) &&
1595+
// We require that either the queue of flushables is below the
1596+
// stop-writes threshold (note that this is typically a conservative
1597+
// check, since not every element of this queue will contribute the full
1598+
// memtable memory size that could result in a write stall), or WAL
1599+
// failover is permitting an unlimited queue without causing a write
1600+
// stall. The latter condition is important to avoid delays in
1601+
// visibility of concurrent writes that happen to get a sequence number
1602+
// after this ingest and then must wait for this ingest that is itself
1603+
// waiting on a large flush. See
1604+
// https://github.com/cockroachdb/pebble/issues/4944 for an illustration
1605+
// of this problem.
1606+
(len(d.mu.mem.queue) < d.opts.MemTableStopWritesThreshold ||
1607+
d.mu.log.manager.ElevateWriteStallThresholdForFailover()) &&
15961608
!d.opts.Experimental.DisableIngestAsFlushable() && !hasRemoteFiles &&
15971609
(!args.ExciseSpan.Valid() || d.FormatMajorVersion() >= FormatFlushableIngestExcises)
15981610

testdata/flushable_ingest

Lines changed: 53 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,44 @@ set d 1
2727
blockFlush
2828
----
2929

30+
# One WAL, corresponding to the mutable memtable containing the set.
31+
ls
32+
----
33+
000002.log
34+
LOCK
35+
MANIFEST-000001
36+
OPTIONS-000003
37+
ext
38+
ext1
39+
ext2
40+
ext3
41+
marker.format-version.000011.024
42+
marker.manifest.000001.MANIFEST-000001
43+
44+
# Ingest can complete despite the flush being blocked.
3045
ingest ext1 ext2 ext3
3146
----
3247

48+
lsm
49+
----
50+
51+
# Two more WALs, one for the flushable ingest, and one for the mutable
52+
# memtable.
53+
ls
54+
----
55+
000002.log
56+
000004.sst
57+
000005.sst
58+
000006.sst
59+
000007.log
60+
000008.log
61+
LOCK
62+
MANIFEST-000001
63+
OPTIONS-000003
64+
ext
65+
marker.format-version.000011.024
66+
marker.manifest.000001.MANIFEST-000001
67+
3368
allowFlush
3469
----
3570

@@ -45,9 +80,10 @@ a:1
4580
b:1
4681
d:1
4782

48-
# We expect 1 WAL for an immutable memtable, 1 file for the ingested ssts,
49-
# one for the mutable memtable. We also expect 3 ssts corresponding to the
50-
# ingested files.
83+
# We expect 1 WAL for an immutable memtable, 1 WAL for the ingested ssts, one
84+
# for the mutable memtable. We also expect 3 ssts corresponding to the
85+
# ingested files. The assumption here is that this command runs before the
86+
# flush completes.
5187
ls
5288
----
5389
000002.log
@@ -163,6 +199,20 @@ blockFlush
163199
ingest ext4
164200
----
165201

202+
ls
203+
----
204+
000002.log
205+
000004.sst
206+
000005.log
207+
000006.log
208+
LOCK
209+
MANIFEST-000001
210+
OPTIONS-000003
211+
ext
212+
ext5
213+
marker.format-version.000011.024
214+
marker.manifest.000001.MANIFEST-000001
215+
166216
allowFlush
167217
----
168218

wal/wal.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -349,6 +349,15 @@ type Manager interface {
349349
// ElevateWriteStallThresholdForFailover returns true if the caller should
350350
// use a high write stall threshold because the WALs are being written to
351351
// the secondary dir.
352+
//
353+
// In practice, if this value is true, we give an unlimited memory budget
354+
// for memtables. This is simpler than trying to configure an explicit
355+
// value, given that memory resources can vary. When using WAL failover in
356+
// CockroachDB, an OOM risk is worth tolerating for workloads that have a
357+
// strict latency SLO. Also, an unlimited budget here does not mean that the
358+
// disk stall in the primary will go unnoticed until the OOM -- CockroachDB
359+
// is monitoring disk stalls, and we expect it to fail the node after ~60s
360+
// if the primary is stalled.
352361
ElevateWriteStallThresholdForFailover() bool
353362
// Stats returns the latest Stats.
354363
Stats() Stats

0 commit comments

Comments
 (0)