Skip to content

Commit

Permalink
pkg/server: support tenant auto-upgrade
Browse files Browse the repository at this point in the history
Previously, tenant upgrades in UA required a user to issue a `SET CLUSTER SETTING version =`
statement to finalize an upgrade. This UX differs from single-tenant SH/Dedicated
deployments: the latter have auto upgrade, which starts an attempt to finalize the cluster
version after every node startup, in case the node was started with a new binary version
that all nodes now support upgrading to.

In UA, we have two differences:

1. What to upgrade?
    - In a multi-tenant deployment, the storage and sql layers are upgraded separately.
    - The storage layer upgrade finalization is still handled by the existing auto upgrade logic.
    - In this change, we ensure that the sql layer is also auto-upgraded when possible.
2. When to upgrade?
    - In a single-tenant deployment, all layers share the same binary version and cluster version.
      Hence, an upgrade attempt is only needed when a new node starts to ensure that the cluster is
      auto-upgraded if the new binary version supports an upgrade.
    - In a multi-tenant deployment, in addition to the condition above, the sql server upgrade is
      also constrained by the storage cluster version. It is possible for all SQL instances to have
      binary versions that support an upgrade but the upgrade will still be blocked by the storage
      cluster version if it’s equal to the current tenant cluster version.

This code change does the following:

1. Adds logic to run a SQL server upgrade attempt (mostly adopted from the original auto upgrade code)
   within the following ordered constraints (previously we merged #98830 to make getting the binary
   versions of instances easier):
    - Ensure that the upgrade is not blocked by the secondary tenant's setting of preserve_downgrade_option
      or an all-tenant override of that value.
    - Exit if tenant cluster version is equal to the minimum instance binary version [upgrade already completed].
    - Upgrade to storage cluster version if the binary version of all SQL instances supports that.
    - Exit if storage cluster version is less than the minimum instance binary version [upgrade blocked
      due to low storage cluster version].
    - Upgrade to the minimum instance binary version.

2. Runs the logic above when a SQL server is started.
    - This covers the case where a SQL server binary upgrade allows for an upgrade to the
      tenant cluster version.

3. Checks for change in storage cluster version every 10 seconds and starts an upgrade attempt if
   it was changed.
    - This covers the case where the binary versions of all SQL instances allow for an upgrade
      but it’s blocked due to the storage cluster version.

Release note: None
Epic: CRDB-20860
  • Loading branch information
healthy-pod committed Oct 6, 2023
1 parent 43e9e39 commit 7b8cb86
Show file tree
Hide file tree
Showing 9 changed files with 516 additions and 19 deletions.
4 changes: 4 additions & 0 deletions pkg/ccl/kvccl/kvtenantccl/upgradeccl/BUILD.bazel
Expand Up @@ -7,6 +7,7 @@ go_test(
"tenant_upgrade_test.go",
],
args = ["-test.timeout=295s"],
shard_count = 4,
tags = ["ccl_test"],
deps = [
"//pkg/base",
Expand All @@ -19,12 +20,15 @@ go_test(
"//pkg/server",
"//pkg/settings/cluster",
"//pkg/spanconfig",
"//pkg/sql/sem/eval",
"//pkg/sql/sqlinstance/instancestorage",
"//pkg/sql/sqlliveness/slinstance",
"//pkg/testutils/serverutils",
"//pkg/testutils/skip",
"//pkg/testutils/sqlutils",
"//pkg/upgrade",
"//pkg/upgrade/upgradebase",
"//pkg/util",
"//pkg/util/leaktest",
"//pkg/util/log",
"//pkg/util/randutil",
Expand Down
227 changes: 227 additions & 0 deletions pkg/ccl/kvccl/kvtenantccl/upgradeccl/tenant_upgrade_test.go
Expand Up @@ -21,18 +21,233 @@ import (
"github.com/cockroachdb/cockroach/pkg/server"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/spanconfig"
"github.com/cockroachdb/cockroach/pkg/sql/sem/eval"
"github.com/cockroachdb/cockroach/pkg/sql/sqlinstance/instancestorage"
"github.com/cockroachdb/cockroach/pkg/sql/sqlliveness/slinstance"
"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
"github.com/cockroachdb/cockroach/pkg/testutils/skip"
"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
"github.com/cockroachdb/cockroach/pkg/upgrade"
"github.com/cockroachdb/cockroach/pkg/upgrade/upgradebase"
"github.com/cockroachdb/cockroach/pkg/util"
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/stop"
"github.com/stretchr/testify/require"
)

// TestTenantAutoUpgradePreservesDowngrade starts a host server and a
// shared-process tenant at v0, sets cluster.preserve_downgrade_option on the
// tenant, finalizes the host cluster at clusterversion.TestingBinaryVersion,
// and then waits for the TenantAutoUpgradeInfo testing knob channel to report
// server.UpgradeDisabledByConfiguration.
func TestTenantAutoUpgradePreservesDowngrade(t *testing.T) {
	defer leaktest.AfterTest(t)()
	defer log.Scope(t).Close(t)
	skip.UnderStressRace(t)

	// v0 is hard-coded because clusterversion.TestingBinaryMinSupportedVersion is `v22.2` at the
	// time of writing and it does not support shared process tenants. We should update v0 to be
	// clusterversion.TestingBinaryMinSupportedVersion when it is bumped to `v23.1`.
	v0 := clusterversion.V23_1
	ctx := context.Background()
	settings := cluster.MakeTestingClusterSettingsWithVersions(
		clusterversion.TestingBinaryVersion,
		clusterversion.ByKey(v0),
		false, // initializeVersion
	)
	// Initialize the version to v0.
	require.NoError(t, clusterversion.Initialize(ctx,
		clusterversion.ByKey(v0), &settings.SV))

	ts := serverutils.StartServerOnly(t, base.TestServerArgs{
		DefaultTestTenant: base.TestControlsTenantsExplicitly,
		Settings:          settings,
		Knobs: base.TestingKnobs{
			Server: &server.TestingKnobs{
				DisableAutomaticVersionUpgrade: make(chan struct{}),
				BinaryVersionOverride:          clusterversion.ByKey(v0),
				BootstrapVersionKeyOverride:    v0,
			},
			SQLEvalContext: &eval.TestingKnobs{
				// When the host binary version is not equal to its cluster version, tenant logical version is set
				// to the host's minimum supported binary version. We need this override to ensure that the tenant is
				// created at v0.
				TenantLogicalVersionKeyOverride: v0,
			},
		},
	})
	defer ts.Stopper().Stop(ctx)
	sysDB := sqlutils.MakeSQLRunner(ts.SQLConn(t, ""))

	expectedInitialTenantVersion := clusterversion.ByKey(v0)

	tenantSettings := cluster.MakeTestingClusterSettingsWithVersions(
		clusterversion.TestingBinaryVersion,
		clusterversion.ByKey(v0),
		false, // initializeVersion
	)
	require.NoError(t, clusterversion.Initialize(ctx,
		expectedInitialTenantVersion, &tenantSettings.SV))

	// The channel is handed to the tenant through the TenantAutoUpgradeInfo
	// knob; the test reads auto-upgrade statuses from it below.
	upgradeInfoCh := make(chan struct {
		Status    int
		UpgradeTo roachpb.Version
	}, 1)
	mkTenant := func(t *testing.T, name string) (tenantDB *gosql.DB) {
		tenantArgs := base.TestSharedProcessTenantArgs{
			TenantName: roachpb.TenantName(name),
			Knobs: base.TestingKnobs{
				Server: &server.TestingKnobs{
					TenantAutoUpgradeInfo:       upgradeInfoCh,
					BootstrapVersionKeyOverride: v0,
					BinaryVersionOverride:       clusterversion.ByKey(v0),
				},
			},
		}
		_, tenantDB, err := ts.TenantController().StartSharedProcessTenant(ctx, tenantArgs)
		require.NoError(t, err)
		return tenantDB
	}

	// Create a shared process tenant and its SQL server.
	const tenantName = "marhaba-crdb"
	tenantDB := mkTenant(t, tenantName)
	tenantRunner := sqlutils.MakeSQLRunner(tenantDB)

	// Ensure that the tenant works.
	tenantRunner.Exec(t, "CREATE TABLE t (i INT PRIMARY KEY)")
	tenantRunner.Exec(t, "INSERT INTO t VALUES (1), (2)")

	// Set cluster.preserve_downgrade_option setting for the tenant to prevent auto upgrade.
	tenantRunner.Exec(t, "SET CLUSTER SETTING cluster.preserve_downgrade_option = $1", clusterversion.ByKey(v0).String())

	// Upgrade the host cluster.
	sysDB.Exec(t,
		"SET CLUSTER SETTING version = $1",
		clusterversion.TestingBinaryVersion.String())

	// Ensure that the tenant still works.
	tenantRunner.CheckQueryResults(t, "SELECT * FROM t", [][]string{{"1"}, {"2"}})

	// Use the same race-aware timeout as TestTenantAutoUpgrade:
	// skip.UnderStressRace only skips stress+race runs, so plain race builds
	// still execute this test and need the longer wait.
	succeedsSoon := 20 * time.Second
	if util.RaceEnabled {
		succeedsSoon = 60 * time.Second
	}
	// Wait for auto upgrade status to be received by the testing knob. Statuses
	// other than UpgradeDisabledByConfiguration are ignored; a fresh timer is
	// armed on every iteration, so the timeout bounds the gap between statuses.
	for {
		select {
		case upgradeInfo := <-upgradeInfoCh:
			if int(server.UpgradeDisabledByConfiguration) == upgradeInfo.Status {
				return
			}
		case <-time.After(succeedsSoon):
			t.Fatalf("failed to receive the right auto upgrade status after %d seconds", int(succeedsSoon.Seconds()))
		}
	}
}

// TestTenantAutoUpgrade starts a host server and a shared-process tenant at
// v0, finalizes the host cluster at clusterversion.TestingBinaryVersion, and
// then waits for the TenantAutoUpgradeInfo testing knob channel to deliver a
// server.UpgradeAllowed status whose target is that same version.
func TestTenantAutoUpgrade(t *testing.T) {
	defer leaktest.AfterTest(t)()
	defer log.Scope(t).Close(t)
	skip.UnderStressRace(t)

	ctx := context.Background()

	// The starting version key is pinned to V23_1 rather than
	// clusterversion.TestingBinaryMinSupportedVersion: at the time of writing
	// the latter is `v22.2`, which does not support shared process tenants.
	// Switch to TestingBinaryMinSupportedVersion once it is bumped to `v23.1`.
	startKey := clusterversion.V23_1
	startVersion := clusterversion.ByKey(startKey)
	finalVersion := clusterversion.TestingBinaryVersion

	// Host settings: binary at the testing version, cluster version
	// explicitly initialized to the starting version.
	hostSettings := cluster.MakeTestingClusterSettingsWithVersions(
		clusterversion.TestingBinaryVersion,
		startVersion,
		false, // initializeVersion
	)
	require.NoError(t, clusterversion.Initialize(ctx, startVersion, &hostSettings.SV))

	hostKnobs := base.TestingKnobs{
		Server: &server.TestingKnobs{
			DisableAutomaticVersionUpgrade: make(chan struct{}),
			BinaryVersionOverride:          startVersion,
			BootstrapVersionKeyOverride:    startKey,
		},
		SQLEvalContext: &eval.TestingKnobs{
			// If the host's binary version differs from its cluster version,
			// the tenant logical version defaults to the host's minimum
			// supported binary version. Override it so the tenant is created
			// at the starting version.
			TenantLogicalVersionKeyOverride: startKey,
		},
	}
	hostServer := serverutils.StartServerOnly(t, base.TestServerArgs{
		DefaultTestTenant: base.TestControlsTenantsExplicitly,
		Settings:          hostSettings,
		Knobs:             hostKnobs,
	})
	defer hostServer.Stopper().Stop(ctx)
	systemRunner := sqlutils.MakeSQLRunner(hostServer.SQLConn(t, ""))

	tenantSettings := cluster.MakeTestingClusterSettingsWithVersions(
		clusterversion.TestingBinaryVersion,
		startVersion,
		false, // initializeVersion
	)
	require.NoError(t, clusterversion.Initialize(ctx, startVersion, &tenantSettings.SV))

	// autoUpgradeInfo is an alias (not a new type) so the channel keeps the
	// exact anonymous-struct element type used when it is assigned to the knob.
	type autoUpgradeInfo = struct {
		Status    int
		UpgradeTo roachpb.Version
	}
	autoUpgradeInfoCh := make(chan autoUpgradeInfo, 1)

	// Start a shared process tenant together with its SQL server.
	_, tenantConn, err := hostServer.TenantController().StartSharedProcessTenant(
		ctx,
		base.TestSharedProcessTenantArgs{
			TenantName: roachpb.TenantName("hola-crdb"),
			Knobs: base.TestingKnobs{
				Server: &server.TestingKnobs{
					TenantAutoUpgradeInfo:                          autoUpgradeInfoCh,
					AllowTenantAutoUpgradeOnInternalVersionChanges: true,
					BootstrapVersionKeyOverride:                    startKey,
					BinaryVersionOverride:                          startVersion,
				},
			},
		},
	)
	require.NoError(t, err)
	tenantRunner := sqlutils.MakeSQLRunner(tenantConn)

	// Sanity check: the tenant can serve DDL and DML.
	tenantRunner.Exec(t, "CREATE TABLE t (i INT PRIMARY KEY)")
	tenantRunner.Exec(t, "INSERT INTO t VALUES (1), (2)")

	// Finalize the host cluster at the final version.
	systemRunner.Exec(t, "SET CLUSTER SETTING version = $1", finalVersion.String())

	// The tenant must remain usable after the host upgrade.
	tenantRunner.CheckQueryResults(t, "SELECT * FROM t", [][]string{{"1"}, {"2"}})

	waitTimeout := 20 * time.Second
	if util.RaceEnabled {
		waitTimeout = 60 * time.Second
	}

	// Drain statuses from the knob channel until one reports that an upgrade
	// to the final version is allowed. A new timer is armed for every
	// iteration of the loop.
	for matched := false; !matched; {
		select {
		case info := <-autoUpgradeInfoCh:
			matched = info.Status == int(server.UpgradeAllowed) && info.UpgradeTo == finalVersion
		case <-time.After(waitTimeout):
			t.Fatalf("failed to receive the right auto upgrade status after %d seconds", int(waitTimeout.Seconds()))
		}
	}
}

// TestTenantUpgrade exercises the case where a system tenant is in a
// non-finalized version state and creates a tenant. The test ensures
// that the newly created tenant begins in that same version.
Expand All @@ -48,6 +263,7 @@ import (
func TestTenantUpgrade(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)
skip.UnderStressRace(t)
ctx := context.Background()

v1 := clusterversion.TestingBinaryMinSupportedVersion
Expand Down Expand Up @@ -93,6 +309,9 @@ func TestTenantUpgrade(t *testing.T) {
TestingKnobs: base.TestingKnobs{
// Make the upgrade faster by accelerating jobs.
JobsTestingKnobs: jobs.NewTestingKnobsWithShortIntervals(),
Server: &server.TestingKnobs{
DisableAutomaticVersionUpgrade: make(chan struct{}),
},
},
Settings: settings,
}
Expand Down Expand Up @@ -129,6 +348,11 @@ func TestTenantUpgrade(t *testing.T) {
t.Log("restart the tenant")
tenantServer.AppStopper().Stop(ctx)
tenantServer, err := ts.TenantController().StartTenant(ctx, base.TestTenantArgs{
TestingKnobs: base.TestingKnobs{
Server: &server.TestingKnobs{
DisableAutomaticVersionUpgrade: make(chan struct{}),
},
},
TenantID: roachpb.MustMakeTenantID(initialTenantID),
})
require.NoError(t, err)
Expand Down Expand Up @@ -246,6 +470,9 @@ func TestTenantUpgradeFailure(t *testing.T) {
SpanConfig: &spanconfig.TestingKnobs{
ManagerDisableJobCreation: true,
},
Server: &server.TestingKnobs{
DisableAutomaticVersionUpgrade: make(chan struct{}),
},
UpgradeManager: &upgradebase.TestingKnobs{
DontUseJobs: true,
RegistryOverride: func(v roachpb.Version) (upgradebase.Upgrade, bool) {
Expand Down
Expand Up @@ -150,6 +150,9 @@ func runTest(t *testing.T, variant sharedtestutil.TestVariant, test sharedtestut
tenantArgs := base.TestTenantArgs{
TenantID: id,
TestingKnobs: base.TestingKnobs{
Server: &server.TestingKnobs{
DisableAutomaticVersionUpgrade: make(chan struct{}),
},
JobsTestingKnobs: jobs.NewTestingKnobsWithShortIntervals(),
UpgradeManager: &upgradebase.TestingKnobs{
InterlockPausePoint: test.PausePoint,
Expand Down Expand Up @@ -283,6 +286,11 @@ func runTest(t *testing.T, variant sharedtestutil.TestVariant, test sharedtestut
Stopper: otherServerStopper,
TenantID: tenantID,
Settings: otherServerSettings,
TestingKnobs: base.TestingKnobs{
Server: &server.TestingKnobs{
DisableAutomaticVersionUpgrade: make(chan struct{}),
},
},
})

var otherTenantRunner *sqlutils.SQLRunner
Expand Down
1 change: 1 addition & 0 deletions pkg/server/BUILD.bazel
Expand Up @@ -75,6 +75,7 @@ go_library(
"stop_trigger.go",
"tcp_keepalive_manager.go",
"tenant.go",
"tenant_auto_upgrade.go",
"tenant_migration.go",
"testing_knobs.go",
"testserver.go",
Expand Down

0 comments on commit 7b8cb86

Please sign in to comment.