Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
116784: regionliveness: add support for sqlinstances and recovery r=fqazi a=fqazi

This PR completes the regionliveness survivability goal work by implementing the following:

1. Logic to bound the sqlliveness reviewals based on the unavailable_at time set on a region
2. Updating the sqlinstance allocation logic to take into account regionliveness by working on live regions only.
3. Updating the change feed initial scan on sqlinstances to work on a per-region basis and consider region liveness.
4. Updating the sqlinstance allocation logic to recover from region failures by cleaning up system.leases, system.sqlinstances, and system.sqllivness after a failure.
5. A new roachtest focused on setting up a physical cluster and simulating failure scenarios for nightly builds.
6. A synthetic test for simulating region failures and recovery from them

This PR is stacked on top of:  #115568, so the first 4 commits should be ignored.

informs: #103727
EPIC: CC-24173

118578: roachtest: fix typo in query_comparison_util.go r=mgartner a=mgartner

Epic: None

Release note: None

Co-authored-by: Faizan Qazi <faizan@cockroachlabs.com>
Co-authored-by: Marcus Gartner <marcus@cockroachlabs.com>
  • Loading branch information
3 people committed Feb 1, 2024
3 parents 829c925 + bacfa57 + ca7cb95 commit 7b66dc4
Show file tree
Hide file tree
Showing 18 changed files with 701 additions and 123 deletions.
4 changes: 4 additions & 0 deletions pkg/ccl/multiregionccl/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ go_test(
"//pkg/kv",
"//pkg/kv/kvbase",
"//pkg/kv/kvclient/kvcoord",
"//pkg/kv/kvpb",
"//pkg/kv/kvserver",
"//pkg/kv/kvserver/allocator/allocatorimpl",
"//pkg/roachpb",
Expand All @@ -75,6 +76,7 @@ go_test(
"//pkg/sql/catalog/descs",
"//pkg/sql/catalog/desctestutils",
"//pkg/sql/catalog/lease",
"//pkg/sql/catalog/systemschema",
"//pkg/sql/catalog/typedesc",
"//pkg/sql/enum",
"//pkg/sql/execinfra",
Expand All @@ -86,6 +88,7 @@ go_test(
"//pkg/sql/sem/tree",
"//pkg/sql/sqlinstance/instancestorage",
"//pkg/sql/sqlliveness",
"//pkg/sql/sqlliveness/slbase",
"//pkg/sql/sqlliveness/slstorage",
"//pkg/sql/sqltestutils",
"//pkg/testutils",
Expand All @@ -96,6 +99,7 @@ go_test(
"//pkg/testutils/sqlutils",
"//pkg/testutils/testcluster",
"//pkg/util",
"//pkg/util/ctxgroup",
"//pkg/util/leaktest",
"//pkg/util/log",
"//pkg/util/quotapool",
Expand Down
88 changes: 81 additions & 7 deletions pkg/ccl/multiregionccl/regionliveness_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,24 +20,31 @@ import (
"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/ccl/multiregionccl/multiregionccltestutils"
"github.com/cockroachdb/cockroach/pkg/kv"
"github.com/cockroachdb/cockroach/pkg/kv/kvpb"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/sql"
"github.com/cockroachdb/cockroach/pkg/sql/catalog"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/descpb"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/descs"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/lease"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/systemschema"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/typedesc"
"github.com/cockroachdb/cockroach/pkg/sql/isql"
"github.com/cockroachdb/cockroach/pkg/sql/regionliveness"
"github.com/cockroachdb/cockroach/pkg/sql/regions"
"github.com/cockroachdb/cockroach/pkg/sql/sqlinstance/instancestorage"
"github.com/cockroachdb/cockroach/pkg/sql/sqlliveness/slbase"
"github.com/cockroachdb/cockroach/pkg/testutils"
"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
"github.com/cockroachdb/cockroach/pkg/testutils/skip"
"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
"github.com/cockroachdb/cockroach/pkg/util/ctxgroup"
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/retry"
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
"github.com/cockroachdb/errors"
"github.com/stretchr/testify/require"
)
Expand Down Expand Up @@ -196,7 +203,7 @@ func TestRegionLivenessProber(t *testing.T) {
return errors.AssertionFailedf("removed region detected %s", expectedRegions[1])
}
// Similarly query the unavailable physcial regions
unavailablePhysicalRegions, err := regionProber.QueryUnavailablePhysicalRegions(ctx, txn)
unavailablePhysicalRegions, err := regionProber.QueryUnavailablePhysicalRegions(ctx, txn, true /*filterAvailable*/)
if err != nil {
return err
}
Expand Down Expand Up @@ -259,16 +266,43 @@ func TestRegionLivenessProberForLeases(t *testing.T) {
detectLeaseWait.Swap(false)
})()

var keyToBlockMu syncutil.Mutex
var keyToBlock roachpb.Key
recoveryBlock := make(chan struct{})
recoveryStart := make(chan struct{})
clusterKnobs := base.TestingKnobs{
Store: &kvserver.StoreTestingKnobs{
TestingRequestFilter: func(ctx context.Context, request *kvpb.BatchRequest) *kvpb.Error {
// Slow any deletes to the regionliveness table, so that the recovery
// protocol on heartbeat never succeeds.
deleteRequest := request.Requests[0].GetDelete()
if deleteRequest == nil {
return nil
}
keyToBlockMu.Lock()
keyPrefix := keyToBlock
keyToBlockMu.Unlock()
if keyPrefix == nil || !deleteRequest.Key[:len(keyPrefix)].Equal(keyPrefix) {
return nil
}
recoveryStart <- struct{}{}
<-recoveryBlock
return nil
},
},
}
testCluster, _, cleanup := multiregionccltestutils.TestingCreateMultiRegionClusterWithRegionList(t,
expectedRegions,
1,
base.TestingKnobs{},
clusterKnobs,
multiregionccltestutils.WithSettings(makeSettings()))
defer cleanup()

id, err := roachpb.MakeTenantID(11)
require.NoError(t, err)
for i, s := range testCluster.Servers {
var tenant serverutils.ApplicationLayerInterface
var tenantDB *gosql.DB
tenantArgs := base.TestTenantArgs{
Settings: makeSettings(),
TenantID: id,
Expand All @@ -285,13 +319,16 @@ func TestRegionLivenessProberForLeases(t *testing.T) {
if targetCount.Add(1) != 1 {
return
}
keyToBlockMu.Lock()
keyToBlock = tenant.Codec().TablePrefix(uint32(systemschema.RegionLivenessTable.GetID()))
keyToBlockMu.Unlock()
time.Sleep(testingRegionLivenessProbeTimeout)
}
},
},
},
}
tenant, tenantDB := serverutils.StartTenant(t, s, tenantArgs)
tenant, tenantDB = serverutils.StartTenant(t, s, tenantArgs)
tenantSQL = append(tenantSQL, tenantDB)
tenants = append(tenants, tenant)
// Before the other tenants are added we need to configure the system database,
Expand All @@ -313,6 +350,8 @@ func TestRegionLivenessProberForLeases(t *testing.T) {
// Create a new table and have it used on all nodes.
_, err = tenantSQL[0].Exec("CREATE TABLE t1(j int)")
require.NoError(t, err)
_, err = tenantSQL[0].Exec("CREATE TABLE t2(j int)")
require.NoError(t, err)
for _, c := range tenantSQL {
_, err = c.Exec("SELECT * FROM t1")
require.NoError(t, err)
Expand Down Expand Up @@ -347,11 +386,8 @@ func TestRegionLivenessProberForLeases(t *testing.T) {
}
return nil
})

require.NoError(t, tx.Rollback())

// Validate we can have a "dropped" region and the query won't fail.
lm := tenants[0].LeaseManager().(*lease.Manager)
lm := tenants[1].LeaseManager().(*lease.Manager)
cachedDatabaseRegions, err := regions.NewCachedDatabaseRegions(ctx, tenants[0].DB(), lm)
require.NoError(t, err)
regions.TestingModifyRegionEnum(cachedDatabaseRegions, func(descriptor catalog.TypeDescriptor) catalog.TypeDescriptor {
Expand All @@ -368,4 +404,42 @@ func TestRegionLivenessProberForLeases(t *testing.T) {
regionliveness.RegionLivenessProbeTimeout.Override(ctx, &ts.ClusterSettings().SV, testingRegionLivenessProbeTimeoutLong)
}
require.NoError(t, lm.WaitForNoVersion(ctx, descpb.ID(tableID), cachedDatabaseRegions, retry.Options{}))
grp := ctxgroup.WithContext(ctx)
grp.GoCtx(func(ctx context.Context) error {
_, err = tx.Exec("INSERT INTO t2 VALUES(5)")
return err
})
// Add a new region which will execute a recovery and clean up dead rows.
keyToBlockMu.Lock()
keyToBlock = nil
keyToBlockMu.Unlock()
tenantArgs := base.TestTenantArgs{
Settings: makeSettings(),
TenantID: id,
Locality: testCluster.Servers[0].Locality(),
}
_, newRegionSQL := serverutils.StartTenant(t, testCluster.Servers[0], tenantArgs)
tr := sqlutils.MakeSQLRunner(newRegionSQL)
// Validate everything was cleaned bringing up a new node in the down region.
require.Equalf(t,
tr.QueryStr(t, "SELECT * FROM system.region_liveness"),
[][]string{},
"expected no unavaialble regions.")
require.Equalf(t,
tr.QueryStr(t, "SELECT count(*) FROM system.sql_instances WHERE session_id IS NOT NULL"),
[][]string{{"3"}},
"extra sql instances are being used.")
require.Equalf(t,
tr.QueryStr(t, "SELECT count(*) FROM system.sqlliveness"),
[][]string{{"3"}},
"extra sql sessions detected.")
require.NoError(t, err)
// Validate that the stuck query will fail once we recover.
<-recoveryStart
time.Sleep(slbase.DefaultTTL.Default())
recoveryBlock <- struct{}{}
require.NoError(t, grp.Wait())
_, err = tx.Exec("INSERT INTO t2 VALUES(5)")
require.ErrorContainsf(t, grp.Wait(), "context canceled", "connection should have been dropped, node is dead.")
require.ErrorContainsf(t, tx.Commit(), "TransactionRetryWithProtoRefreshError: TransactionRetryError: retry txn", "connection should have been dropped, node is dead.")
}
42 changes: 36 additions & 6 deletions pkg/cmd/roachtest/tests/acceptance.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,14 @@ package tests

import (
"context"
"strings"
"time"

"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
"github.com/cockroachdb/errors"
)

func registerAcceptance(r registry.Registry) {
Expand All @@ -25,9 +28,12 @@ func registerAcceptance(r registry.Registry) {
fn func(ctx context.Context, t test.Test, c cluster.Cluster)
skip string
numNodes int
nodeRegions []string
timeout time.Duration
encryptionSupport registry.EncryptionSupport
defaultLeases bool
requiresLicense bool
disallowLocal bool
}{
registry.OwnerKV: {
{name: "decommission-self", fn: runDecommissionSelf},
Expand All @@ -53,6 +59,16 @@ func registerAcceptance(r registry.Registry) {
name: "multitenant",
fn: runAcceptanceMultitenant,
},
{
name: "multitenant-multiregion",
fn: runAcceptanceMultitenantMultiRegion,
numNodes: 9,
nodeRegions: []string{"us-west1-b", "us-west1-b", "us-west1-b",
"us-west1-b", "us-west1-b", "us-west1-b",
"us-east1-b", "us-east1-b", "us-east1-b"},
requiresLicense: true,
disallowLocal: true,
},
},
registry.OwnerObsInf: {
{name: "status-server", fn: runStatusServer},
Expand Down Expand Up @@ -99,27 +115,41 @@ func registerAcceptance(r registry.Registry) {
numNodes = tc.numNodes
}

spec := registry.TestSpec{
var extraOptions []spec.Option
if tc.nodeRegions != nil {
// Sanity: Ensure the region counts are sane.
if len(tc.nodeRegions) != numNodes {
panic(errors.AssertionFailedf("region list doesn't match number of nodes"))
}
extraOptions = append(extraOptions, spec.Geo())
extraOptions = append(extraOptions, spec.GCEZones(strings.Join(tc.nodeRegions, ",")))
}

testSpec := registry.TestSpec{
Name: "acceptance/" + tc.name,
Owner: owner,
Cluster: r.MakeClusterSpec(numNodes),
Cluster: r.MakeClusterSpec(numNodes, extraOptions...),
Skip: tc.skip,
EncryptionSupport: tc.encryptionSupport,
Timeout: 10 * time.Minute,
CompatibleClouds: registry.AllExceptAWS,
Suites: registry.Suites(registry.Nightly, registry.Quick),
RequiresLicense: tc.requiresLicense,
}

if tc.timeout != 0 {
spec.Timeout = tc.timeout
testSpec.Timeout = tc.timeout
}
if !tc.defaultLeases {
spec.Leases = registry.MetamorphicLeases
testSpec.Leases = registry.MetamorphicLeases
}
if tc.disallowLocal {
testSpec.CompatibleClouds = testSpec.CompatibleClouds.NoLocal()
}
spec.Run = func(ctx context.Context, t test.Test, c cluster.Cluster) {
testSpec.Run = func(ctx context.Context, t test.Test, c cluster.Cluster) {
tc.fn(ctx, t, c)
}
r.Add(spec)
r.Add(testSpec)
}
}
}

0 comments on commit 7b66dc4

Please sign in to comment.