storage: add WAL failover

Introduce support for configuring a multi-store CockroachDB node to failover a store's write-ahead log (WAL) to another store's data directory. Failing over the write-ahead log may allow some operations against a store to continue to complete despite temporary unavailability of the underlying storage. Customers must opt into WAL failover by passing `--wal-failover=among-stores` to `cockroach start` or setting the env var `COCKROACH_WAL_FAILOVER=among-stores`. On start, cockroach will assign each store another store to be its failover destination. Cockroach will begin monitoring the latency of all WAL writes. If latency to the WAL exceeds the value of the storage.wal_failover.unhealthy_op_threshold cluster setting, Cockroach will attempt to write WAL entries to its secondary store's volume. If a user wishes to disable WAL failover, they must restart the node setting `--wal-failover=disabled`. Close #119418. Informs cockroachdb/pebble#3230 Epic: CRDB-35401 Release note (ops change): Introduces a new start option (--wal-failover or COCKROACH_WAL_FAILOVER env var) to opt into failing over WALs between stores in multi-store nodes. Introduces a new storage.wal_failover.unhealthy_op_threshold cluster setting for configuring the latency threshold at which a WAL write is considered unhealthy.
cockroachdb · Mar 15, 2024 · 96de542 · 96de542
1 parent 61f1491
commit 96de542
Show file tree

Hide file tree

Showing 8 changed files with 194 additions and 1 deletion.
diff --git a/docs/generated/settings/settings.html b/docs/generated/settings/settings.html
@@ -282,6 +282,7 @@
 <tr><td><div id="setting-storage-max-sync-duration" class="anchored"><code>storage.max_sync_duration</code></div></td><td>duration</td><td><code>20s</code></td><td>maximum duration for disk operations; any operations that take longer than this setting trigger a warning log entry or process crash</td><td>Serverless/Dedicated/Self-Hosted (read-only)</td></tr>
 <tr><td><div id="setting-storage-max-sync-duration-fatal-enabled" class="anchored"><code>storage.max_sync_duration.fatal.enabled</code></div></td><td>boolean</td><td><code>true</code></td><td>if true, fatal the process when a disk operation exceeds storage.max_sync_duration</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
 <tr><td><div id="setting-storage-value-blocks-enabled" class="anchored"><code>storage.value_blocks.enabled</code></div></td><td>boolean</td><td><code>true</code></td><td>set to true to enable writing of value blocks in sstables</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
+<tr><td><div id="setting-storage-wal-failover-unhealthy-op-threshold" class="anchored"><code>storage.wal_failover.unhealthy_op_threshold</code></div></td><td>duration</td><td><code>100ms</code></td><td>the latency of a WAL write considered unhealthy and triggers a failover to a secondary WAL location</td><td>Dedicated/Self-Hosted</td></tr>
 <tr><td><div id="setting-timeseries-storage-enabled" class="anchored"><code>timeseries.storage.enabled</code></div></td><td>boolean</td><td><code>true</code></td><td>if set, periodic timeseries data is stored within the cluster; disabling is not recommended unless you are storing the data elsewhere</td><td>Dedicated/Self-Hosted</td></tr>
 <tr><td><div id="setting-timeseries-storage-resolution-10s-ttl" class="anchored"><code>timeseries.storage.resolution_10s.ttl</code></div></td><td>duration</td><td><code>240h0m0s</code></td><td>the maximum age of time series data stored at the 10 second resolution. Data older than this is subject to rollup and deletion.</td><td>Serverless/Dedicated/Self-Hosted (read-only)</td></tr>
 <tr><td><div id="setting-timeseries-storage-resolution-30m-ttl" class="anchored"><code>timeseries.storage.resolution_30m.ttl</code></div></td><td>duration</td><td><code>2160h0m0s</code></td><td>the maximum age of time series data stored at the 30 minute resolution. Data older than this is subject to deletion.</td><td>Serverless/Dedicated/Self-Hosted (read-only)</td></tr>

diff --git a/pkg/base/config.go b/pkg/base/config.go
@@ -25,6 +25,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/util/envutil"
 	"github.com/cockroachdb/cockroach/pkg/util/mon"
 	"github.com/cockroachdb/cockroach/pkg/util/retry"
+	"github.com/cockroachdb/errors"
 	"github.com/cockroachdb/redact"
 )
 
@@ -837,6 +838,64 @@ type TempStorageConfig struct {
 	Settings *cluster.Settings
 }
 
+// WALFailoverMode configures a node's stores behavior under high write latency
+// to their write-ahead logs.
+type WALFailoverMode int8
+
+const (
+	// WALFailoverDefault leaves the WAL failover configuration unspecified. Today
+	// this is interpreted as FailoverDisabled but future releases may default to
+	// another mode.
+	WALFailoverDefault WALFailoverMode = iota
+	// WALFailoverDisabled leaves WAL failover disabled. Commits to the storage
+	// engine observe the latency of a store's primary WAL directly.
+	WALFailoverDisabled
+	// WALFailoverAmongStores enables WAL failover among multiple stores within a
+	// node. This setting has no effect if the node has a single store. When a
+	// storage engine observes high latency writing to its WAL, it may
+	// transparently failover to an arbitrary, predetermined other store's data
+	// directory. If successful in syncing log entries to the other store's
+	// volume, the batch commit latency is insulated from the effects of momentary
+	// disk stalls.
+	WALFailoverAmongStores
+)
+
+// Type implements the pflag.Value interface.
+func (m *WALFailoverMode) Type() string { return "string" }
+
+// String implements fmt.Stringer.
+func (m *WALFailoverMode) String() string {
+	return redact.StringWithoutMarkers(m)
+}
+
+// SafeFormat implements the refact.SafeFormatter interface.
+func (m *WALFailoverMode) SafeFormat(p redact.SafePrinter, _ rune) {
+	switch *m {
+	case WALFailoverDefault:
+		// Empty
+	case WALFailoverDisabled:
+		p.SafeString("disabled")
+	case WALFailoverAmongStores:
+		p.SafeString("among-stores")
+	default:
+		p.Printf("<unknown WALFailoverMode %d>", int8(*m))
+	}
+}
+
+// Set implements the pflag.Value interface.
+func (m *WALFailoverMode) Set(s string) error {
+	switch s {
+	case "disabled":
+		*m = WALFailoverDisabled
+	case "among-stores":
+		*m = WALFailoverAmongStores
+	default:
+		return errors.Newf("invalid --wal-failover setting: %s "+
+			"(possible values: disabled, among-stores)", s)
+	}
+	return nil
+}
+
 // ExternalIODirConfig describes various configuration options pertaining
 // to external storage implementations.
 // TODO(adityamaru): Rename ExternalIODirConfig to ExternalIOConfig because it

diff --git a/pkg/cli/cliflags/flags.go b/pkg/cli/cliflags/flags.go
@@ -1001,6 +1001,23 @@ which use 'cockroach-data-tenant-X' for tenant 'X')
 `,
 	}
 
+	WALFailover = FlagInfo{
+		Name:   "wal-failover",
+		EnvVar: "COCKROACH_WAL_FAILOVER",
+		Description: `
+Configures the use and behavior of WAL failover. Defaults to "disabled".
+The value "among-stores" enables automatic failover to another store's
+data directory if a WAL write does not complete within the configured
+threshold. For example:
+<PRE>
+
+  --wal-failover=among-stores
+
+</PRE>
+See the storage.wal_failover.unhealthy_op_threshold cluster setting.
+`,
+	}
+
 	StorageEngine = FlagInfo{
 		Name: "storage-engine",
 		Description: `

diff --git a/pkg/cli/flags.go b/pkg/cli/flags.go
@@ -472,6 +472,7 @@ func init() {
 
 		cliflagcfg.VarFlag(f, &storeSpecs, cliflags.Store)
 		cliflagcfg.VarFlag(f, &serverCfg.StorageEngine, cliflags.StorageEngine)
+		cliflagcfg.VarFlag(f, &serverCfg.WALFailover, cliflags.WALFailover)
 		cliflagcfg.StringFlag(f, &serverCfg.SharedStorage, cliflags.SharedStorage)
 		cliflagcfg.VarFlag(f, &serverCfg.SecondaryCache, cliflags.SecondaryCache)
 		cliflagcfg.VarFlag(f, &serverCfg.MaxOffset, cliflags.MaxOffset)

diff --git a/pkg/server/config.go b/pkg/server/config.go
@@ -226,6 +226,10 @@ type BaseConfig struct {
 	// Stores is specified to enable durable key-value storage.
 	Stores base.StoreSpecList
 
+	// WALFailover enables and configures automatic WAL failover when latency to
+	// a store's primary WAL increases.
+	WALFailover base.WALFailoverMode
+
 	// SharedStorage is specified to enable disaggregated shared storage.
 	SharedStorage                    string
 	EarlyBootExternalStorageAccessor *cloud.EarlyBootExternalStorageAccessor
@@ -307,6 +311,7 @@ func (cfg *BaseConfig) SetDefaults(
 	cfg.DisableMaxOffsetCheck = false
 	cfg.DefaultZoneConfig = zonepb.DefaultZoneConfig()
 	cfg.StorageEngine = storage.DefaultStorageEngine
+	cfg.WALFailover = base.WALFailoverDefault
 	cfg.TestingInsecureWebAccess = disableWebLogin
 	cfg.Stores = base.StoreSpecList{
 		Specs: []base.StoreSpec{storeSpec},
@@ -764,6 +769,7 @@ func (cfg *Config) CreateEngines(ctx context.Context) (Engines, error) {
 		log.Eventf(ctx, "initializing %+v", spec)
 
 		storageConfigOpts := []storage.ConfigOption{
+			walFailoverConfig,
 			storage.Attributes(spec.Attributes),
 			storage.If(storeKnobs.SmallEngineBlocks, storage.BlockSize(1)),
 		}

diff --git a/pkg/storage/BUILD.bazel b/pkg/storage/BUILD.bazel
@@ -92,6 +92,7 @@ go_library(
         "@com_github_cockroachdb_pebble//replay",
         "@com_github_cockroachdb_pebble//sstable",
         "@com_github_cockroachdb_pebble//vfs",
+        "@com_github_cockroachdb_pebble//wal",
         "@com_github_cockroachdb_redact//:redact",
         "@com_github_cockroachdb_redact//interfaces",
         "@com_github_dustin_go_humanize//:go-humanize",

diff --git a/pkg/storage/open.go b/pkg/storage/open.go
@@ -11,14 +11,20 @@
 package storage
 
 import (
+	"cmp"
 	"context"
+	"slices"
+	"time"
 
+	"github.com/cockroachdb/cockroach/pkg/base"
 	"github.com/cockroachdb/cockroach/pkg/cloud"
+	"github.com/cockroachdb/cockroach/pkg/clusterversion"
 	"github.com/cockroachdb/cockroach/pkg/roachpb"
 	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
 	"github.com/cockroachdb/cockroach/pkg/storage/fs"
 	"github.com/cockroachdb/errors"
 	"github.com/cockroachdb/pebble"
+	"github.com/cockroachdb/pebble/wal"
 )
 
 // A ConfigOption may be passed to Open to configure the storage engine.
@@ -204,6 +210,100 @@ func LBaseMaxBytes(v int64) ConfigOption {
 	}
 }
 
+// WALFailover configures automatic failover of the engine's write-ahead log to
+// another volume in the event the WAL becomes blocked on a write that does not
+// complete within a reasonable duration.
+func WALFailover(mode base.WALFailoverMode, storeEnvs fs.Envs) ConfigOption {
+	// If the user specified no WAL failover setting, we default to disabling WAL
+	// failover and assume that the previous process did not have WAL failover
+	// enabled (so there's no need to populate Options.WALRecoveryDirs). If an
+	// operator had WAL failover enabled and now wants to disable it, they must
+	// explicitly set --wal-failover=disabled for the next process.
+	if mode == base.WALFailoverDefault || len(storeEnvs) == 1 {
+		return func(cfg *engineConfig) error { return nil }
+	}
+	// mode == WALFailoverDisabled or WALFailoverAmongStores.
+
+	// For each store, we need to determine which store is its secondary for the
+	// purpose of WALs. Even if failover is disabled, it's possible that it wasn't
+	// when the previous process ran, and the secondary's wal dir may have WALs
+	// that need to be replayed.
+	//
+	// To assign secondaries, we sort by path and dictate that the next store in
+	// the slice is the secondary. Note that in-memory stores may not have unique
+	// paths, in which case we fall back to using the ordering of the store flags
+	// (which falls out of the use of a stable sort).
+	//
+	// TODO(jackson): Using the path is a simple way to assign secondaries, but
+	// it's not resilient to changing between absolute and relative paths,
+	// introducing symlinks, etc. Since we have the fs.Envs already available, we
+	// could peek into the data directories, find the most recent OPTIONS file and
+	// parse out the previous secondary if any. If we had device nos and inodes
+	// available, we could deterministically sort by those instead.
+	sortedEnvs := slices.Clone(storeEnvs)
+	slices.SortStableFunc(sortedEnvs, func(a, b *fs.Env) int {
+		return cmp.Compare(a.Dir, b.Dir)
+	})
+
+	indexOfEnv := func(e *fs.Env) (int, bool) {
+		for i := range sortedEnvs {
+			if sortedEnvs[i] == e {
+				return i, true
+			}
+		}
+		return 0, false
+	}
+	return func(cfg *engineConfig) error {
+		// Find the Env being opened in the slice of sorted envs.
+		idx, ok := indexOfEnv(cfg.Env)
+		if !ok {
+			panic(errors.AssertionFailedf("storage: opening a store with an unrecognized filesystem Env (dir=%s)", cfg.Env.Dir))
+		}
+		failoverIdx := (idx + 1) % len(sortedEnvs)
+		secondaryEnv := sortedEnvs[failoverIdx]
+		// Ref once to ensure the secondary Env isn't closed before this Engine has
+		// been closed if the secondary's corresponding Engine is closed first.
+		secondaryEnv.Ref()
+		cfg.onClose = append(cfg.onClose, func(p *Pebble) {
+			// Release the reference.
+			secondaryEnv.Close()
+		})
+
+		secondaryFS := secondaryEnv.DefaultFS
+		secondary := wal.Dir{
+			FS: secondaryFS,
+			// Use auxiliary/wals-among-stores within the other stores directory.
+			Dirname: secondaryFS.PathJoin(secondaryEnv.Dir, base.AuxiliaryDir, "wals-among-stores"),
+		}
+
+		if mode == base.WALFailoverAmongStores {
+			cfg.Opts.WALFailover = &pebble.WALFailoverOptions{
+				Secondary: secondary,
+				FailoverOptions: wal.FailoverOptions{
+					// Leave most the options to their defaults, but
+					// UnhealthyOperationLatencyThreshold should be pulled from the
+					// cluster setting.
+					UnhealthyOperationLatencyThreshold: func() (time.Duration, bool) {
+						// WAL failover requires 24.1 to be finalized first. Otherwise, we might
+						// write WALs to a secondary, downgrade to a previous version's binary and
+						// blindly miss WALs. The second return value indicates whether the
+						// WAL manager is allowed to failover to the secondary.
+						//
+						// NB: We do not use settings.Version.IsActive because we do not have a
+						// guarantee that the cluster version has been initialized.
+						failoverOK := cfg.Settings.Version.ActiveVersionOrEmpty(context.TODO()).IsActive(clusterversion.V24_1Start)
+						return walFailoverUnhealthyOpThreshold.Get(&cfg.Settings.SV), failoverOK
+					},
+				},
+			}
+			return nil
+		}
+		// mode == WALFailoverDisabled
+		cfg.Opts.WALRecoveryDirs = append(cfg.Opts.WALRecoveryDirs, secondary)
+		return nil
+	}
+}
+
 // PebbleOptions contains Pebble-specific options in the same format as a
 // Pebble OPTIONS file. For example:
 // [Options]
@@ -240,7 +340,7 @@ type engineConfig struct {
 // Open opens a new Pebble storage engine, reading and writing data to the
 // provided fs.Env, configured with the provided options.
 //
-// If succesful, the returned Engine takes ownership over the provided fs.Env's
+// If successful, the returned Engine takes ownership over the provided fs.Env's
 // reference. When the Engine is closed, the fs.Env is closed once too. If the
 // Env must be retained beyond the Engine's lifetime, the caller should Ref() it
 // first.

diff --git a/pkg/storage/pebble.go b/pkg/storage/pebble.go
@@ -205,6 +205,14 @@ var SingleDeleteCrashOnIneffectual = settings.RegisterBoolSetting(
 	settings.WithVisibility(settings.Reserved),
 )
 
+var walFailoverUnhealthyOpThreshold = settings.RegisterDurationSetting(
+	settings.SystemOnly,
+	"storage.wal_failover.unhealthy_op_threshold",
+	"the latency of a WAL write considered unhealthy and triggers a failover to a secondary WAL location",
+	100*time.Millisecond,
+	settings.WithPublic,
+)
+
 // ShouldUseEFOS returns true if either of the UseEFOS or UseExciseForSnapshots
 // cluster settings are enabled, and EventuallyFileOnlySnapshots must be used
 // to guarantee snapshot-like semantics.