Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

storage: prevent node from restarting after consistency check fatal #42401

Merged
merged 2 commits into from Nov 18, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
47 changes: 47 additions & 0 deletions pkg/base/store_spec.go
Expand Up @@ -13,7 +13,9 @@ package base
import (
"bytes"
"fmt"
"io/ioutil"
"net"
"os"
"path/filepath"
"regexp"
"sort"
Expand Down Expand Up @@ -357,6 +359,51 @@ func (ssl StoreSpecList) String() string {
return buffer.String()
}

// AuxiliaryDir is the path of the auxiliary dir relative to an engine.Engine's
// root directory. It must not be changed without a proper migration.
const AuxiliaryDir = "auxiliary"

// PreventedStartupFile is the filename (relative to 'dir') used for files that
// can block server startup.
func PreventedStartupFile(dir string) string {
return filepath.Join(dir, "_CRITICAL_ALERT.txt")
}

// GetPreventedStartupMessage attempts to read the PreventedStartupFile for each
// store directory and returns their concatenated contents. These files
// typically request operator intervention after a corruption event by
// preventing the affected node(s) from starting back up.
func (ssl StoreSpecList) GetPreventedStartupMessage() (string, error) {
var buf strings.Builder
for _, ss := range ssl.Specs {
path := ss.PreventedStartupFile()
if path == "" {
continue
}
b, err := ioutil.ReadFile(path)
if err != nil {
if !os.IsNotExist(err) {
return "", err
}
continue
}
fmt.Fprintf(&buf, "From %s:\n\n", path)
_, _ = buf.Write(b)
fmt.Fprintln(&buf)
}
return buf.String(), nil
}

// PreventedStartupFile returns the path to a file which, if it exists, should
// prevent the server from starting up. Returns an empty string for in-memory
// engines.
func (ss StoreSpec) PreventedStartupFile() string {
if ss.InMemory {
return ""
}
return PreventedStartupFile(filepath.Join(ss.Path, AuxiliaryDir))
}

// Type returns the underlying type in string form. This is part of pflag's
// value interface.
func (ssl *StoreSpecList) Type() string {
Expand Down
38 changes: 38 additions & 0 deletions pkg/base/store_spec_test.go
Expand Up @@ -12,13 +12,17 @@ package base_test

import (
"fmt"
"io/ioutil"
"os"
"path/filepath"
"reflect"
"testing"

"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/testutils"
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
"github.com/stretchr/testify/require"
)

// TestNewStoreSpec verifies that the --store arguments are correctly parsed
Expand Down Expand Up @@ -217,3 +221,37 @@ func TestJoinListType(t *testing.T) {
})
}
}

func TestStoreSpecListPreventedStartupMessage(t *testing.T) {
defer leaktest.AfterTest(t)()

dir, cleanup := testutils.TempDir(t)
defer cleanup()

boomStoreDir := filepath.Join(dir, "boom")
boomAuxDir := filepath.Join(boomStoreDir, base.AuxiliaryDir)
okStoreDir := filepath.Join(dir, "ok")
okAuxDir := filepath.Join(okStoreDir, base.AuxiliaryDir)

for _, sd := range []string{boomAuxDir, okAuxDir} {
require.NoError(t, os.MkdirAll(sd, 0755))
}

ssl := base.StoreSpecList{
Specs: []base.StoreSpec{
{Path: "foo", InMemory: true},
{Path: okStoreDir},
{Path: boomStoreDir},
},
}

msg, err := ssl.GetPreventedStartupMessage()
require.NoError(t, err)
require.Empty(t, msg)

require.NoError(t, ioutil.WriteFile(ssl.Specs[2].PreventedStartupFile(), []byte("boom"), 0644))

msg, err = ssl.GetPreventedStartupMessage()
require.NoError(t, err)
require.Contains(t, msg, "boom")
}
8 changes: 7 additions & 1 deletion pkg/cli/start.go
Expand Up @@ -437,13 +437,19 @@ func runStart(cmd *cobra.Command, args []string, disableReplication bool) error
// First things first: if the user wants background processing,
// relinquish the terminal ASAP by forking and exiting.
//
// If executing in the backround, the function returns ok == true in
// If executing in the background, the function returns ok == true in
// the parent process (regardless of err) and the parent exits at
// this point.
if ok, err := maybeRerunBackground(); ok {
return err
}

if s, err := serverCfg.Stores.GetPreventedStartupMessage(); err != nil {
return err
} else if s != "" {
log.Fatal(context.Background(), s)
}

// Set up the signal handlers. This also ensures that any of these
// signals received beyond this point do not interrupt the startup
// sequence until the point signals are checked below.
Expand Down
7 changes: 7 additions & 0 deletions pkg/storage/consistency_queue_test.go
Expand Up @@ -14,6 +14,7 @@ import (
"bytes"
"context"
"fmt"
"io/ioutil"
"math/rand"
"path/filepath"
"testing"
Expand All @@ -35,6 +36,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
"github.com/cockroachdb/cockroach/pkg/util/uuid"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)

// TestConsistencyQueueRequiresLive verifies the queue will not
Expand Down Expand Up @@ -356,6 +358,11 @@ func TestCheckConsistencyInconsistent(t *testing.T) {
assert.Equal(t, roachpb.CheckConsistencyResponse_RANGE_INCONSISTENT, resp.Result[0].Status)
assert.Contains(t, resp.Result[0].Detail, `[minority]`)
assert.Contains(t, resp.Result[0].Detail, `stats`)

// A death rattle should have been written on s2 (store index 1).
b, err := ioutil.ReadFile(base.PreventedStartupFile(mtc.stores[1].Engine().GetAuxiliaryDir()))
require.NoError(t, err)
require.NotEmpty(t, b)
}

// TestConsistencyQueueRecomputeStats is an end-to-end test of the mechanism CockroachDB
Expand Down
2 changes: 1 addition & 1 deletion pkg/storage/engine/pebble.go
Expand Up @@ -279,7 +279,7 @@ func NewPebble(ctx context.Context, cfg PebbleConfig) (*Pebble, error) {
return nil, err
}
} else {
auxDir = cfg.Opts.FS.PathJoin(cfg.Dir, "auxiliary")
auxDir = cfg.Opts.FS.PathJoin(cfg.Dir, base.AuxiliaryDir)
if err := cfg.Opts.FS.MkdirAll(auxDir, 0755); err != nil {
return nil, err
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/storage/engine/rocksdb.go
Expand Up @@ -528,7 +528,7 @@ func NewRocksDB(cfg RocksDBConfig, cache RocksDBCache) (*RocksDB, error) {
cache: cache.ref(),
}

if err := r.setAuxiliaryDir(filepath.Join(cfg.Dir, "auxiliary")); err != nil {
if err := r.setAuxiliaryDir(filepath.Join(cfg.Dir, base.AuxiliaryDir)); err != nil {
return nil, err
}

Expand Down
23 changes: 23 additions & 0 deletions pkg/storage/replica_corruption.go
Expand Up @@ -12,7 +12,11 @@ package storage

import (
"context"
"fmt"
"io/ioutil"
"os"

"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/util/log"
)
Expand Down Expand Up @@ -51,6 +55,25 @@ func (r *Replica) setCorruptRaftMuLocked(
log.ErrorfDepth(ctx, 1, "stalling replica due to: %s", cErr.ErrorMsg)
cErr.Processed = true
r.mu.destroyStatus.Set(cErr, destroyReasonRemoved)

auxDir := r.store.engine.GetAuxiliaryDir()
_ = os.MkdirAll(auxDir, 0755)
path := base.PreventedStartupFile(auxDir)

preventStartupMsg := fmt.Sprintf(`ATTENTION:

this node is terminating because replica %s detected an inconsistent state.
Please contact the CockroachDB support team. It is not necessarily safe
to replace this node; cluster data may still be at risk of corruption.

A file preventing this node from restarting was placed at:
%s
`, r, path)

if err := ioutil.WriteFile(path, []byte(preventStartupMsg), 0644); err != nil {
log.Warning(ctx, err)
}

log.FatalfDepth(ctx, 1, "replica is corrupted: %s", cErr)
return roachpb.NewError(cErr)
}
29 changes: 25 additions & 4 deletions pkg/storage/replica_proposal.go
Expand Up @@ -13,12 +13,14 @@ package storage
import (
"context"
"fmt"
"io/ioutil"
"os"
"path/filepath"
"strings"
"time"
"unsafe"

"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/keys"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
Expand Down Expand Up @@ -256,14 +258,33 @@ func (r *Replica) computeChecksumPostApply(ctx context.Context, cc storagepb.Com
// holder and thus won't be printed to the logs. Since we're already
// in a goroutine that's about to end, simply sleep for a few seconds
// and then terminate.
auxDir := r.store.engine.GetAuxiliaryDir()
_ = os.MkdirAll(auxDir, 0755)
path := base.PreventedStartupFile(auxDir)

preventStartupMsg := fmt.Sprintf(`ATTENTION:

this node is terminating because a replica inconsistency was detected between %s
and its other replicas. Please check your cluster-wide log files for more
information and contact the CockroachDB support team. It is not necessarily safe
to replace this node; cluster data may still be at risk of corruption.

A checkpoints directory to aid (expert) debugging should be present in:
%s

A file preventing this node from restarting was placed at:
%s
`, r, auxDir, path)

if err := ioutil.WriteFile(path, []byte(preventStartupMsg), 0644); err != nil {
log.Warning(ctx, err)
}

if p := r.store.cfg.TestingKnobs.ConsistencyTestingKnobs.OnBadChecksumFatal; p != nil {
p(*r.store.Ident)
} else {
time.Sleep(10 * time.Second)
log.Fatalf(r.AnnotateCtx(context.Background()),
"this node is terminating because a replica inconsistency was detected "+
"between %s and its other replicas. Please check your cluster-wide log files for "+
"more information and contact the CockroachDB support team.", r)
log.Fatalf(r.AnnotateCtx(context.Background()), preventStartupMsg)
}
}

Expand Down
5 changes: 5 additions & 0 deletions pkg/storage/replica_test.go
Expand Up @@ -16,6 +16,7 @@ import (
"fmt"
"math"
"math/rand"
"os"
"reflect"
"regexp"
"sort"
Expand Down Expand Up @@ -6409,6 +6410,10 @@ func TestReplicaCorruption(t *testing.T) {
t.Fatalf("unexpected error: %s", pErr)
}

// Should have laid down marker file to prevent startup.
_, err := os.Stat(base.PreventedStartupFile(tc.engine.GetAuxiliaryDir()))
require.NoError(t, err)

// Should have triggered fatal error.
if exitStatus != 255 {
t.Fatalf("unexpected exit status %d", exitStatus)
Expand Down