From 8136e8ec9847418384e0900f80d3f5320ca70b52 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 13 Apr 2023 13:11:46 +0000 Subject: [PATCH] base: reduce `RaftMaxInflightBytes` to 32 MB This patch reduces the default `RaftMaxInflightBytes` from 256 MB to 32 MB, to reduce the out-of-memory incidence during bulk operations like `RESTORE` on clusters with overloaded disks. `RaftMaxInflightBytes` specifies the maximum aggregate byte size of Raft log entries that a leader will send to a follower without hearing responses. As such, it also bounds the amount of replication data buffered in memory on the receiver. Individual messages can still exceed this limit (consider the default command size limit at 64 MB). Normally, `RaftMaxInflightMsgs` * `RaftMaxSizePerMsg` will bound this at 4 MB (128 messages at 32 KB each). However, individual messages are allowed to exceed the 32 KB limit, typically large AddSSTable commands that can be around 10 MB each. To prevent followers running out of memory, we place an additional total byte limit of 32 MB, which is 8 times more than normal. A survey of CC clusters over the past 30 days showed that, excluding a single outlier cluster, the total outstanding `raft.rcvd.queued_bytes` of any individual node never exceeded 500 MB, and was roughly 0 across all clusters for the majority of time. Epic: none Release note (ops change): the amount of replication traffic in flight from a single Raft leader to a follower has been reduced from 256 MB to 32 MB, in order to reduce the chance of running out of memory during bulk write operations. This can be controlled via the environment variable `COCKROACH_RAFT_MAX_INFLIGHT_BYTES`. --- pkg/base/config.go | 27 +++++++++++++++------------ pkg/base/config_test.go | 7 +------ pkg/base/testdata/raft_config | 2 +- 3 files changed, 17 insertions(+), 19 deletions(-) diff --git a/pkg/base/config.go b/pkg/base/config.go index e29640195d43..889b2dca964d 100644 --- a/pkg/base/config.go +++ b/pkg/base/config.go @@ -268,21 +268,22 @@ var ( // defaultRaftMaxInflightBytes specifies the maximum aggregate byte size of // Raft log entries that a leader will send to a follower without hearing - // responses. + // responses. As such, it also bounds the amount of replication data buffered + // in memory on the receiver. Individual messages can still exceed this limit + // (consider the default command size limit at 64 MB). // - // Previously it was assumed that RaftMaxInflightMsgs * RaftMaxSizePerMsg is a - // proxy for the actual max inflight traffic. However, RaftMaxSizePerMsg is - // not a hard limit, it's rather a "target" size for the message, so the - // actual inflight bytes could exceed this product by a large factor. - // RaftMaxInflightBytes is a more accurate limit, and should be used in - // conjunction with the two. + // Normally, RaftMaxInflightMsgs * RaftMaxSizePerMsg will bound this at 4 MB + // (128 messages at 32 KB each). However, individual messages are allowed to + // exceed the 32 KB limit, typically large AddSSTable commands that can be + // around 10 MB each. To prevent followers running out of memory, we place an + // additional total byte limit of 32 MB, which is 8 times more than normal. + // This was found to significantly reduce OOM incidence during RESTORE with + // overloaded disks. // - // TODO(#90314): lower this limit to something close to max rates observed in - // healthy clusters. Currently, this is a conservatively large multiple of - // defaultRaftMaxInflightMsgs * defaultRaftMaxSizePerMsg, so that we don't - // abruptly break the previous assumption and cut off traffic. + // Per the bandwidth-delay product, this will limit per-range throughput to + // 64 MB/s at 500 ms RTT, 320 MB/s at 100 ms RTT, and 3.2 GB/s at 10 ms RTT. defaultRaftMaxInflightBytes = envutil.EnvOrDefaultBytes( - "COCKROACH_RAFT_MAX_INFLIGHT_BYTES", 256<<20 /* 256 MB */) + "COCKROACH_RAFT_MAX_INFLIGHT_BYTES", 32<<20 /* 32 MB */) ) // Config is embedded by server.Config. A base config is not meant to be used @@ -531,6 +532,8 @@ type RaftConfig struct { // RaftMaxInflightBytes controls the maximum aggregate byte size of Raft log // entries that a leader will send to a follower without hearing responses. + // As such, it also bounds the amount of replication data buffered in memory + // on the receiver. // // Normally RaftMaxSizePerMsg * RaftMaxInflightMsgs is the actual limit. But // the RaftMaxSizePerMsg is soft, and Raft may send individual messages diff --git a/pkg/base/config_test.go b/pkg/base/config_test.go index 2ea4e519128e..ba1b459eee8c 100644 --- a/pkg/base/config_test.go +++ b/pkg/base/config_test.go @@ -135,12 +135,7 @@ func TestRaftMaxInflightBytes(t *testing.T) { maxInfl uint64 want uint64 }{ - // If any of these tests fail, sync the corresponding default values with - // config.go, and update the comments that reason about default values. - {want: 256 << 20}, // assert 255 MB is still default - {maxMsgs: 128, want: 256 << 20}, // assert 128 is still default - {msgSize: 32 << 10, want: 256 << 20}, // assert 32 KB is still default - + {want: 32 << 20}, // default {maxMsgs: 1 << 30, want: 1 << 45}, // large maxMsgs {msgSize: 1 << 50, want: 1 << 57}, // large msgSize diff --git a/pkg/base/testdata/raft_config b/pkg/base/testdata/raft_config index a3a7e0fd9a37..ce105930a741 100644 --- a/pkg/base/testdata/raft_config +++ b/pkg/base/testdata/raft_config @@ -13,7 +13,7 @@ echo RaftMaxSizePerMsg: (uint64) 32768, RaftMaxCommittedSizePerReady: (uint64) 67108864, RaftMaxInflightMsgs: (int) 128, - RaftMaxInflightBytes: (uint64) 268435456, + RaftMaxInflightBytes: (uint64) 33554432, RaftDelaySplitToSuppressSnapshot: (time.Duration) 45s } RaftHeartbeatInterval: 1s