diff --git a/pkg/base/config.go b/pkg/base/config.go index e29640195d43..889b2dca964d 100644 --- a/pkg/base/config.go +++ b/pkg/base/config.go @@ -268,21 +268,22 @@ var ( // defaultRaftMaxInflightBytes specifies the maximum aggregate byte size of // Raft log entries that a leader will send to a follower without hearing - // responses. + // responses. As such, it also bounds the amount of replication data buffered + // in memory on the receiver. Individual messages can still exceed this limit + // (consider the default command size limit at 64 MB). // - // Previously it was assumed that RaftMaxInflightMsgs * RaftMaxSizePerMsg is a - // proxy for the actual max inflight traffic. However, RaftMaxSizePerMsg is - // not a hard limit, it's rather a "target" size for the message, so the - // actual inflight bytes could exceed this product by a large factor. - // RaftMaxInflightBytes is a more accurate limit, and should be used in - // conjunction with the two. + // Normally, RaftMaxInflightMsgs * RaftMaxSizePerMsg will bound this at 4 MB + // (128 messages at 32 KB each). However, individual messages are allowed to + // exceed the 32 KB limit, typically large AddSSTable commands that can be + // around 10 MB each. To prevent followers running out of memory, we place an + // additional total byte limit of 32 MB, which is 8 times more than normal. + // This was found to significantly reduce OOM incidence during RESTORE with + // overloaded disks. // - // TODO(#90314): lower this limit to something close to max rates observed in - // healthy clusters. Currently, this is a conservatively large multiple of - // defaultRaftMaxInflightMsgs * defaultRaftMaxSizePerMsg, so that we don't - // abruptly break the previous assumption and cut off traffic. + // Per the bandwidth-delay product, this will limit per-range throughput to + // 64 MB/s at 500 ms RTT, 320 MB/s at 100 ms RTT, and 3.2 GB/s at 10 ms RTT. defaultRaftMaxInflightBytes = envutil.EnvOrDefaultBytes( - "COCKROACH_RAFT_MAX_INFLIGHT_BYTES", 256<<20 /* 256 MB */) + "COCKROACH_RAFT_MAX_INFLIGHT_BYTES", 32<<20 /* 32 MB */) ) // Config is embedded by server.Config. A base config is not meant to be used @@ -531,6 +532,8 @@ type RaftConfig struct { // RaftMaxInflightBytes controls the maximum aggregate byte size of Raft log // entries that a leader will send to a follower without hearing responses. + // As such, it also bounds the amount of replication data buffered in memory + // on the receiver. // // Normally RaftMaxSizePerMsg * RaftMaxInflightMsgs is the actual limit. But // the RaftMaxSizePerMsg is soft, and Raft may send individual messages diff --git a/pkg/base/config_test.go b/pkg/base/config_test.go index 2ea4e519128e..ba1b459eee8c 100644 --- a/pkg/base/config_test.go +++ b/pkg/base/config_test.go @@ -135,12 +135,7 @@ func TestRaftMaxInflightBytes(t *testing.T) { maxInfl uint64 want uint64 }{ - // If any of these tests fail, sync the corresponding default values with - // config.go, and update the comments that reason about default values. - {want: 256 << 20}, // assert 255 MB is still default - {maxMsgs: 128, want: 256 << 20}, // assert 128 is still default - {msgSize: 32 << 10, want: 256 << 20}, // assert 32 KB is still default - + {want: 32 << 20}, // default {maxMsgs: 1 << 30, want: 1 << 45}, // large maxMsgs {msgSize: 1 << 50, want: 1 << 57}, // large msgSize diff --git a/pkg/base/testdata/raft_config b/pkg/base/testdata/raft_config index a3a7e0fd9a37..ce105930a741 100644 --- a/pkg/base/testdata/raft_config +++ b/pkg/base/testdata/raft_config @@ -13,7 +13,7 @@ echo RaftMaxSizePerMsg: (uint64) 32768, RaftMaxCommittedSizePerReady: (uint64) 67108864, RaftMaxInflightMsgs: (int) 128, - RaftMaxInflightBytes: (uint64) 268435456, + RaftMaxInflightBytes: (uint64) 33554432, RaftDelaySplitToSuppressSnapshot: (time.Duration) 45s } RaftHeartbeatInterval: 1s