backupccl: check mismatched cluster regions on backup and restore
Previously, if the regions of a restoring cluster did not match those
of the backup cluster, the data would be restored as if the zone
configurations did not exist, because we did not check the regions on
behalf of users.

This patch checks the regions used by the backup during restore
planning, so users are made aware of any mismatched regions between
the backup and restore clusters. If a region is missing, users can
either update the cluster localities or restore with the
`skip_localities_check` option to continue.
Serverless users can run restore with `skip_localities_check`
specified; the check itself is only supported on the system tenant.

Release note (enterprise change): Previously, if the regions of a
restoring cluster did not match those of the backup cluster, the data
would be restored as if the zone configurations did not exist.
This patch checks the regions before restoring, so users are made
aware of any mismatched regions between the backup and restore
clusters. If a region is missing, users can either update the cluster
localities or restore with the `skip_localities_check` option to
continue.
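
For instance, the mismatched-region restore exercised in the new
testdata below could presumably be unblocked by passing the new
option (a sketch, not part of this commit; the database name and
nodelocal path are taken from the testdata):

RESTORE DATABASE d FROM 'nodelocal://0/database_backup/' WITH skip_localities_check;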
Elliebababa committed May 7, 2021
1 parent df6ab49 commit 41a08b1
Showing 12 changed files with 269 additions and 31 deletions.
2 changes: 2 additions & 0 deletions docs/generated/sql/bnf/stmt_block.bnf
@@ -1071,6 +1071,7 @@ unreserved_keyword ::=
| 'SHOW'
| 'SIMPLE'
| 'SKIP'
| 'SKIP_LOCALITIES_CHECK'
| 'SKIP_MISSING_FOREIGN_KEYS'
| 'SKIP_MISSING_SEQUENCES'
| 'SKIP_MISSING_SEQUENCE_OWNERS'
@@ -2025,6 +2026,7 @@ restore_options ::=
| 'SKIP_MISSING_SEQUENCE_OWNERS'
| 'SKIP_MISSING_VIEWS'
| 'DETACHED'
| 'SKIP_LOCALITIES_CHECK'

scrub_option_list ::=
( scrub_option ) ( ( ',' scrub_option ) )*
4 changes: 4 additions & 0 deletions pkg/ccl/backupccl/BUILD.bazel
@@ -34,6 +34,7 @@ go_library(
"//pkg/ccl/backupccl/backupresolver",
"//pkg/ccl/storageccl",
"//pkg/ccl/utilccl",
"//pkg/config/zonepb",
"//pkg/featureflag",
"//pkg/gossip",
"//pkg/jobs",
@@ -48,6 +49,8 @@
"//pkg/roachpb",
"//pkg/scheduledjobs",
"//pkg/security",
"//pkg/server/serverpb",
"//pkg/server/status/statuspb:statuspb_go_proto",
"//pkg/server/telemetry",
"//pkg/settings",
"//pkg/settings/cluster",
@@ -141,6 +144,7 @@ go_test(
"//pkg/base",
"//pkg/blobs",
"//pkg/ccl/kvccl",
"//pkg/ccl/multiregionccl",
"//pkg/ccl/partitionccl",
"//pkg/ccl/storageccl",
"//pkg/ccl/utilccl",
58 changes: 53 additions & 5 deletions pkg/ccl/backupccl/backup_test.go
@@ -35,6 +35,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/blobs"
_ "github.com/cockroachdb/cockroach/pkg/ccl/kvccl"
_ "github.com/cockroachdb/cockroach/pkg/ccl/multiregionccl"
_ "github.com/cockroachdb/cockroach/pkg/ccl/partitionccl"
"github.com/cockroachdb/cockroach/pkg/ccl/storageccl"
"github.com/cockroachdb/cockroach/pkg/config"
@@ -102,6 +103,33 @@ type datadrivenTestState struct {
cleanupFns []func()
}

var localityCfgs = map[string]roachpb.Locality{
"us-east-1": {
Tiers: []roachpb.Tier{
{Key: "region", Value: "us-east-1"},
{Key: "availability-zone", Value: "us-east1"},
},
},
"us-west-1": {
Tiers: []roachpb.Tier{
{Key: "region", Value: "us-west-1"},
{Key: "availability-zone", Value: "us-west1"},
},
},
"eu-central-1": {
Tiers: []roachpb.Tier{
{Key: "region", Value: "eu-central-1"},
{Key: "availability-zone", Value: "eu-central-1"},
},
},
"eu-north-1": {
Tiers: []roachpb.Tier{
{Key: "region", Value: "eu-north-1"},
{Key: "availability-zone", Value: "eu-north-1"},
},
},
}

func (d *datadrivenTestState) cleanup(ctx context.Context) {
for _, db := range d.sqlDBs {
db.Close()
@@ -115,7 +143,10 @@ func (d *datadrivenTestState) cleanup(ctx context.Context) {
}

func (d *datadrivenTestState) addServer(
t *testing.T, name, iodir, tempCleanupFrequency string, allowImplicitAccess bool,
t *testing.T,
name, iodir, tempCleanupFrequency string,
allowImplicitAccess bool,
localities string,
) error {
var tc serverutils.TestClusterInterface
var cleanup func()
@@ -134,10 +165,24 @@
sql.TempObjectCleanupInterval.Override(&settings.SV, duration)
params.ServerArgs.Settings = settings
}

clusterSize := singleNode

if localities != "" {
cfgs := strings.Split(localities, ",")
clusterSize = len(cfgs)
serverArgsPerNode := make(map[int]base.TestServerArgs)
for i, cfg := range cfgs {
param := params.ServerArgs
param.Locality = localityCfgs[cfg]
serverArgsPerNode[i] = param
}
params.ServerArgsPerNode = serverArgsPerNode
}
if iodir == "" {
_, tc, _, iodir, cleanup = backupRestoreTestSetupWithParams(t, singleNode, 0, InitManualReplication, params)
_, tc, _, iodir, cleanup = backupRestoreTestSetupWithParams(t, clusterSize, 0, InitManualReplication, params)
} else {
_, tc, _, cleanup = backupRestoreTestSetupEmptyWithParams(t, singleNode, iodir, InitManualReplication, params)
_, tc, _, cleanup = backupRestoreTestSetupEmptyWithParams(t, clusterSize, iodir, InitManualReplication, params)
}
d.servers[name] = tc.Server(0)
d.dataDirs[name] = iodir
@@ -219,7 +264,7 @@ func TestBackupRestoreDataDriven(t *testing.T) {
ds = newDatadrivenTestState()
return ""
case "new-server":
var name, shareDirWith, iodir, tempCleanupFrequency string
var name, shareDirWith, iodir, tempCleanupFrequency, localities string
var allowImplicitAccess bool
d.ScanArgs(t, "name", &name)
if d.HasArg("share-io-dir") {
@@ -234,8 +279,11 @@
if d.HasArg("temp-cleanup-freq") {
d.ScanArgs(t, "temp-cleanup-freq", &tempCleanupFrequency)
}
if d.HasArg("localities") {
d.ScanArgs(t, "localities", &localities)
}
lastCreatedServer = name
err := ds.addServer(t, name, iodir, tempCleanupFrequency, allowImplicitAccess)
err := ds.addServer(t, name, iodir, tempCleanupFrequency, allowImplicitAccess, localities)
if err != nil {
return err.Error()
}
2 changes: 1 addition & 1 deletion pkg/ccl/backupccl/backupresolver/targets.go
@@ -38,7 +38,7 @@ type DescriptorsMatched struct {
RequestedDBs []catalog.DatabaseDescriptor
}

// CheckExpansions determines if matched targets are covered by the specified
// CheckExpansions determines if matched targets are covered by the specified
// descriptors.
func (d DescriptorsMatched) CheckExpansions(coveredDBs []descpb.ID) error {
covered := make(map[descpb.ID]bool)
15 changes: 15 additions & 0 deletions pkg/ccl/backupccl/helpers_test.go
@@ -65,6 +65,14 @@ func backupRestoreTestSetupWithParams(
dir, dirCleanupFn := testutils.TempDir(t)
params.ServerArgs.ExternalIODir = dir
params.ServerArgs.UseDatabase = "data"
if len(params.ServerArgsPerNode) > 0 {
for i := range params.ServerArgsPerNode {
param := params.ServerArgsPerNode[i]
param.ExternalIODir = dir
param.UseDatabase = "data"
params.ServerArgsPerNode[i] = param
}
}

tc = testcluster.StartTestCluster(t, clusterSize, params)
init(tc)
@@ -240,6 +248,13 @@ func backupRestoreTestSetupEmptyWithParams(
ctx = context.Background()

params.ServerArgs.ExternalIODir = dir
if len(params.ServerArgsPerNode) > 0 {
for i := range params.ServerArgsPerNode {
param := params.ServerArgsPerNode[i]
param.ExternalIODir = dir
params.ServerArgsPerNode[i] = param
}
}
tc = testcluster.StartTestCluster(t, clusterSize, params)
init(tc)

93 changes: 93 additions & 0 deletions pkg/ccl/backupccl/restore_planning.go
@@ -10,20 +10,25 @@ package backupccl

import (
"context"
"fmt"
"go/constant"
"net/url"
"path"
"sort"
"strconv"
"strings"

"github.com/cockroachdb/cockroach/pkg/ccl/storageccl"
"github.com/cockroachdb/cockroach/pkg/ccl/utilccl"
"github.com/cockroachdb/cockroach/pkg/config/zonepb"
"github.com/cockroachdb/cockroach/pkg/featureflag"
"github.com/cockroachdb/cockroach/pkg/jobs"
"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
"github.com/cockroachdb/cockroach/pkg/keys"
"github.com/cockroachdb/cockroach/pkg/kv"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/server/serverpb"
"github.com/cockroachdb/cockroach/pkg/server/status/statuspb"
"github.com/cockroachdb/cockroach/pkg/server/telemetry"
"github.com/cockroachdb/cockroach/pkg/settings"
"github.com/cockroachdb/cockroach/pkg/sql"
@@ -65,6 +70,7 @@ const (
restoreOptSkipMissingSequences = "skip_missing_sequences"
restoreOptSkipMissingSequenceOwners = "skip_missing_sequence_owners"
restoreOptSkipMissingViews = "skip_missing_views"
restoreOptSkipLocalitiesCheck = "skip_localities_check"

// The temporary database system tables will be restored into for full
// cluster backups.
@@ -1631,6 +1637,90 @@ func checkPrivilegesForRestore(
return nil
}

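// findNodeOfRegion reports whether any node in the cluster has a store
// whose locality satisfies a "region" constraint for the given region.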
func findNodeOfRegion(nodes []statuspb.NodeStatus, region descpb.RegionName) bool {
constraint := zonepb.Constraint{
Type: zonepb.Constraint_REQUIRED,
Key: "region",
Value: string(region),
}
for _, n := range nodes {
for _, store := range n.StoreStatuses {
if zonepb.StoreMatchesConstraint(store.Desc, constraint) {
return true
}
}
}
return false
}

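// checkClusterRegions verifies that every region referenced by the
// backup's multi-region enum descriptors is present on at least one
// node of the restoring cluster, unless skip_localities_check is set.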
func checkClusterRegions(
ctx context.Context,
typesByID map[descpb.ID]*typedesc.Mutable,
ss serverpb.OptionalNodesStatusServer,
skipLocalitiesCheck bool,
codec keys.SQLCodec,
) error {

if skipLocalitiesCheck {
return nil
}

regionSet := make(map[descpb.RegionName]struct{})
for _, typ := range typesByID {
typeDesc := typedesc.NewBuilder(typ.TypeDesc()).BuildImmutableType()
if typeDesc.GetKind() == descpb.TypeDescriptor_MULTIREGION_ENUM {
regionNames, err := typeDesc.RegionNames()
if err != nil {
return err
}
for _, region := range regionNames {
if _, ok := regionSet[region]; !ok {
regionSet[region] = struct{}{}
}
}
}
}

if len(regionSet) == 0 {
return nil
}

var nodeStatusServer serverpb.NodesStatusServer
var err error
if nodeStatusServer, err = ss.OptionalNodesStatusServer(sql.MultitenancyZoneCfgIssueNo); err != nil {
if !codec.ForSystemTenant() {
hintMsg := fmt.Sprintf("only the system tenant supports localities check for restore, otherwise option %q is required", restoreOptSkipLocalitiesCheck)
return errors.WithHint(err, hintMsg)
}
return err
}

var nodesResponse *serverpb.NodesResponse
nodesResponse, err = nodeStatusServer.Nodes(ctx, &serverpb.NodesRequest{})
if err != nil {
return err
}

missingRegion := make([]string, 0)
for region := range regionSet {
if nodeFound := findNodeOfRegion(nodesResponse.Nodes, region); !nodeFound {
missingRegion = append(missingRegion, string(region))
}
}

if len(missingRegion) > 0 {
mismatchErr := errors.Newf("detected a mismatch in regions between the restore cluster and the backup cluster, "+
"missing regions detected: %s.", strings.Join(missingRegion, ", "))
hintsMsg := fmt.Sprintf("there are two ways you can resolve this issue: "+
"1) update the cluster to which you're restoring to ensure that the regions present on the nodes' "+
"--locality flags match those present in the backup image, or "+
"2) restore with the %q option", restoreOptSkipLocalitiesCheck)
return errors.WithHint(mismatchErr, hintsMsg)
}

return nil
}

func doRestorePlan(
ctx context.Context,
restoreStmt *tree.Restore,
@@ -1821,6 +1911,9 @@
typesByID[desc.ID] = desc
}
}
if err := checkClusterRegions(ctx, typesByID, p.ExecCfg().NodesStatusServer, restoreStmt.Options.SkipLocalitiesCheck, p.ExecCfg().Codec); err != nil {
return err
}
filteredTablesByID, err := maybeFilterMissingViews(
tablesByID,
typesByID,
54 changes: 54 additions & 0 deletions pkg/ccl/backupccl/testdata/backup-restore/multiregion
@@ -0,0 +1,54 @@
new-server name=s1 allow-implicit-access localities=us-east-1,us-west-1,eu-central-1
----

exec-sql
CREATE DATABASE d PRIMARY REGION "us-east-1" REGIONS "us-west-1", "eu-central-1";
CREATE TABLE d.t (x INT);
INSERT INTO d.t VALUES (1), (2), (3);
----

query-sql
SELECT region FROM [SHOW REGIONS FROM DATABASE d];
----
eu-central-1
us-east-1
us-west-1

exec-sql
BACKUP DATABASE d TO 'nodelocal://1/database_backup/';
----

exec-sql
BACKUP TO 'nodelocal://1/full_cluster_backup/';
----

# A new cluster with the same locality settings.
new-server name=s2 share-io-dir=s1 allow-implicit-access localities=us-east-1,us-west-1,eu-central-1
----

exec-sql
RESTORE FROM 'nodelocal://0/full_cluster_backup/';
----

exec-sql
DROP DATABASE d;
----

exec-sql
RESTORE DATABASE d FROM 'nodelocal://0/database_backup/';
----


# A new cluster with different localities settings.
new-server name=s3 share-io-dir=s1 allow-implicit-access localities=us-east-1,eu-central-1,eu-north-1
----

exec-sql
RESTORE DATABASE d FROM 'nodelocal://0/database_backup/';
----
pq: detected a mismatch in regions between the restore cluster and the backup cluster, missing regions detected: us-west-1.

exec-sql
RESTORE FROM 'nodelocal://0/full_cluster_backup/';
----
pq: detected a mismatch in regions between the restore cluster and the backup cluster, missing regions detected: us-west-1.