Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

pkg/server: support tenant auto-upgrade #102427

Merged
merged 1 commit into from Oct 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/generated/settings/settings-for-tenants.txt
Expand Up @@ -27,6 +27,7 @@ changefeed.sink_io_workers integer 0 the number of workers used by changefeeds w
cloudstorage.azure.concurrent_upload_buffers integer 1 controls the number of concurrent buffers that will be used by the Azure client when uploading chunks.Each buffer can buffer up to cloudstorage.write_chunk.size of memory during an upload application
cloudstorage.http.custom_ca string custom root CA (appended to system's default CAs) for verifying certificates when interacting with HTTPS storage application
cloudstorage.timeout duration 10m0s the timeout for import/export storage operations application
cluster.auto_upgrade.enabled boolean true disable automatic cluster version upgrade until reset application
cluster.organization string organization name system-visible
cluster.preserve_downgrade_option string disable (automatic or manual) cluster version upgrade from the specified version until reset application
diagnostics.forced_sql_stat_reset.interval duration 2h0m0s interval after which the reported SQL Stats are reset even if not collected by telemetry reporter. It has a max value of 24H. application
Expand Down
1 change: 1 addition & 0 deletions docs/generated/settings/settings.html
Expand Up @@ -32,6 +32,7 @@
<tr><td><div id="setting-cloudstorage-azure-concurrent-upload-buffers" class="anchored"><code>cloudstorage.azure.concurrent_upload_buffers</code></div></td><td>integer</td><td><code>1</code></td><td>controls the number of concurrent buffers that will be used by the Azure client when uploading chunks.Each buffer can buffer up to cloudstorage.write_chunk.size of memory during an upload</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
<tr><td><div id="setting-cloudstorage-http-custom-ca" class="anchored"><code>cloudstorage.http.custom_ca</code></div></td><td>string</td><td><code></code></td><td>custom root CA (appended to system&#39;s default CAs) for verifying certificates when interacting with HTTPS storage</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
<tr><td><div id="setting-cloudstorage-timeout" class="anchored"><code>cloudstorage.timeout</code></div></td><td>duration</td><td><code>10m0s</code></td><td>the timeout for import/export storage operations</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
<tr><td><div id="setting-cluster-auto-upgrade-enabled" class="anchored"><code>cluster.auto_upgrade.enabled</code></div></td><td>boolean</td><td><code>true</code></td><td>disable automatic cluster version upgrade until reset</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
<tr><td><div id="setting-cluster-organization" class="anchored"><code>cluster.organization</code></div></td><td>string</td><td><code></code></td><td>organization name</td><td>Serverless/Dedicated/Self-Hosted (read-only)</td></tr>
<tr><td><div id="setting-cluster-preserve-downgrade-option" class="anchored"><code>cluster.preserve_downgrade_option</code></div></td><td>string</td><td><code></code></td><td>disable (automatic or manual) cluster version upgrade from the specified version until reset</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
<tr><td><div id="setting-diagnostics-active-query-dumps-enabled" class="anchored"><code>diagnostics.active_query_dumps.enabled</code></div></td><td>boolean</td><td><code>true</code></td><td>experimental: enable dumping of anonymized active queries to disk when node is under memory pressure</td><td>Dedicated/Self-Hosted</td></tr>
Expand Down
4 changes: 4 additions & 0 deletions pkg/ccl/kvccl/kvtenantccl/upgradeccl/BUILD.bazel
Expand Up @@ -7,6 +7,7 @@ go_test(
"tenant_upgrade_test.go",
],
args = ["-test.timeout=295s"],
shard_count = 4,
tags = ["ccl_test"],
deps = [
"//pkg/base",
Expand All @@ -19,12 +20,15 @@ go_test(
"//pkg/server",
"//pkg/settings/cluster",
"//pkg/spanconfig",
"//pkg/sql/sem/eval",
"//pkg/sql/sqlinstance/instancestorage",
"//pkg/sql/sqlliveness/slinstance",
"//pkg/testutils/serverutils",
"//pkg/testutils/skip",
"//pkg/testutils/sqlutils",
"//pkg/upgrade",
"//pkg/upgrade/upgradebase",
"//pkg/util",
"//pkg/util/leaktest",
"//pkg/util/log",
"//pkg/util/randutil",
Expand Down
228 changes: 228 additions & 0 deletions pkg/ccl/kvccl/kvtenantccl/upgradeccl/tenant_upgrade_test.go
Expand Up @@ -11,6 +11,7 @@ package upgradeccl_test
import (
"context"
gosql "database/sql"
"fmt"
"testing"
"time"

Expand All @@ -21,18 +22,233 @@ import (
"github.com/cockroachdb/cockroach/pkg/server"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/spanconfig"
"github.com/cockroachdb/cockroach/pkg/sql/sem/eval"
"github.com/cockroachdb/cockroach/pkg/sql/sqlinstance/instancestorage"
"github.com/cockroachdb/cockroach/pkg/sql/sqlliveness/slinstance"
"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
"github.com/cockroachdb/cockroach/pkg/testutils/skip"
"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
"github.com/cockroachdb/cockroach/pkg/upgrade"
"github.com/cockroachdb/cockroach/pkg/upgrade/upgradebase"
"github.com/cockroachdb/cockroach/pkg/util"
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/stop"
"github.com/stretchr/testify/require"
)

// TestTenantAutoUpgradeRespectsAutoUpgradeEnabledSetting starts a host server
// at v0, creates a shared-process tenant, sets cluster.auto_upgrade.enabled to
// false inside the tenant, upgrades the host cluster, and then waits for the
// TenantAutoUpgradeInfo testing knob to report
// server.UpgradeDisabledByConfiguration.
func TestTenantAutoUpgradeRespectsAutoUpgradeEnabledSetting(t *testing.T) {
	defer leaktest.AfterTest(t)()
	defer log.Scope(t).Close(t)
	skip.UnderStressRace(t)

	// v0 is hard-coded because clusterversion.TestingBinaryMinSupportedVersion is `v22.2` at the
	// time of writing and it does not support shared process tenants. We should update v0 to be
	// clusterversion.TestingBinaryMinSupportedVersion when it is bumped to `v23.1`.
	v0 := clusterversion.V23_1
	ctx := context.Background()
	settings := cluster.MakeTestingClusterSettingsWithVersions(
		clusterversion.TestingBinaryVersion,
		clusterversion.ByKey(v0),
		false, // initializeVersion
	)
	// Initialize the version to v0.
	require.NoError(t, clusterversion.Initialize(ctx,
		clusterversion.ByKey(v0), &settings.SV))

	ts := serverutils.StartServerOnly(t, base.TestServerArgs{
		DefaultTestTenant: base.TestControlsTenantsExplicitly,
		Settings:          settings,
		Knobs: base.TestingKnobs{
			Server: &server.TestingKnobs{
				DisableAutomaticVersionUpgrade: make(chan struct{}),
				BinaryVersionOverride:          clusterversion.ByKey(v0),
				BootstrapVersionKeyOverride:    v0,
			},
			SQLEvalContext: &eval.TestingKnobs{
				// When the host binary version is not equal to its cluster version, tenant logical version is set
				// to the host's minimum supported binary version. We need this override to ensure that the tenant is
				// created at v0.
				TenantLogicalVersionKeyOverride: v0,
			},
		},
	})
	defer ts.Stopper().Stop(ctx)
	sysDB := sqlutils.MakeSQLRunner(ts.SQLConn(t, ""))

	expectedInitialTenantVersion := clusterversion.ByKey(v0)

	// NOTE(review): tenantSettings is initialized here but is not passed to the
	// tenant args below; confirm whether it should be wired in or dropped.
	tenantSettings := cluster.MakeTestingClusterSettingsWithVersions(
		clusterversion.TestingBinaryVersion,
		clusterversion.ByKey(v0),
		false, // initializeVersion
	)
	require.NoError(t, clusterversion.Initialize(ctx,
		expectedInitialTenantVersion, &tenantSettings.SV))

	// The tenant server publishes auto-upgrade status values on this channel
	// via the TenantAutoUpgradeInfo testing knob.
	upgradeInfoCh := make(chan struct {
		Status    int
		UpgradeTo roachpb.Version
	}, 1)
	mkTenant := func(t *testing.T, name string) (tenantDB *gosql.DB) {
		tenantArgs := base.TestSharedProcessTenantArgs{
			TenantName: roachpb.TenantName(name),
			Knobs: base.TestingKnobs{
				Server: &server.TestingKnobs{
					TenantAutoUpgradeInfo:       upgradeInfoCh,
					BootstrapVersionKeyOverride: v0,
					BinaryVersionOverride:       clusterversion.ByKey(v0),
				},
			},
		}
		_, tenantDB, err := ts.TenantController().StartSharedProcessTenant(ctx, tenantArgs)
		require.NoError(t, err)
		return tenantDB
	}

	// Create a shared process tenant and its SQL server.
	const tenantName = "marhaba-crdb"
	tenantDB := mkTenant(t, tenantName)
	tenantRunner := sqlutils.MakeSQLRunner(tenantDB)

	// Ensure that the tenant works.
	tenantRunner.Exec(t, "CREATE TABLE t (i INT PRIMARY KEY)")
	tenantRunner.Exec(t, "INSERT INTO t VALUES (1), (2)")

	// Disable cluster.auto_upgrade.enabled setting for the tenant to prevent auto upgrade.
	tenantRunner.Exec(t, fmt.Sprintf("SET CLUSTER SETTING %s = false", clusterversion.AutoUpgradeEnabled.Name()))

	// Upgrade the host cluster.
	sysDB.Exec(t,
		"SET CLUSTER SETTING version = $1",
		clusterversion.TestingBinaryVersion.String())

	// Ensure that the tenant still works.
	tenantRunner.CheckQueryResults(t, "SELECT * FROM t", [][]string{{"1"}, {"2"}})

	// Wait for auto upgrade status to be received by the testing knob. Race
	// builds get a wider window, matching TestTenantAutoUpgrade: this test is
	// only skipped under stress-race, so it still runs under plain race.
	succeedsSoon := 20 * time.Second
	if util.RaceEnabled {
		succeedsSoon = 60 * time.Second
	}
	// The timeout is re-armed on every loop iteration, so it bounds the gap
	// between two consecutive statuses rather than the total wait.
	for {
		select {
		case upgradeInfo := <-upgradeInfoCh:
			if int(server.UpgradeDisabledByConfiguration) == upgradeInfo.Status {
				return
			}
		case <-time.After(succeedsSoon):
			t.Fatalf("failed to receive the right auto upgrade status after %d seconds", int(succeedsSoon.Seconds()))
		}
	}
}

// TestTenantAutoUpgrade starts a host server at v0, creates a shared-process
// tenant, upgrades the host cluster to the testing binary version, and then
// waits for the TenantAutoUpgradeInfo testing knob to report
// server.UpgradeAllowed with the testing binary version as the target.
func TestTenantAutoUpgrade(t *testing.T) {
	defer leaktest.AfterTest(t)()
	defer log.Scope(t).Close(t)
	skip.UnderStressRace(t)

	// autoUpgradeInfo names the payload sent on the TenantAutoUpgradeInfo knob
	// channel. It is an alias, so it remains identical to the anonymous struct
	// type.
	type autoUpgradeInfo = struct {
		Status    int
		UpgradeTo roachpb.Version
	}

	ctx := context.Background()

	// v0 is pinned to V23_1: clusterversion.TestingBinaryMinSupportedVersion is
	// `v22.2` at the time of writing, which does not support shared process
	// tenants. Switch v0 to clusterversion.TestingBinaryMinSupportedVersion once
	// that is bumped to `v23.1`.
	v0 := clusterversion.V23_1
	startVersion := clusterversion.ByKey(v0)
	targetVersion := clusterversion.TestingBinaryVersion

	// Host settings start out at v0 and are initialized explicitly.
	hostSettings := cluster.MakeTestingClusterSettingsWithVersions(
		targetVersion, startVersion, false /* initializeVersion */)
	require.NoError(t, clusterversion.Initialize(ctx, startVersion, &hostSettings.SV))

	hostKnobs := base.TestingKnobs{
		Server: &server.TestingKnobs{
			DisableAutomaticVersionUpgrade: make(chan struct{}),
			BinaryVersionOverride:          startVersion,
			BootstrapVersionKeyOverride:    v0,
		},
		SQLEvalContext: &eval.TestingKnobs{
			// If the host's binary version differs from its cluster version, the
			// tenant logical version becomes the host's minimum supported binary
			// version; this override makes sure the tenant is created at v0.
			TenantLogicalVersionKeyOverride: v0,
		},
	}
	host := serverutils.StartServerOnly(t, base.TestServerArgs{
		DefaultTestTenant: base.TestControlsTenantsExplicitly,
		Settings:          hostSettings,
		Knobs:             hostKnobs,
	})
	defer host.Stopper().Stop(ctx)
	hostSQL := sqlutils.MakeSQLRunner(host.SQLConn(t, ""))

	// NOTE(review): tenantSettings is initialized but not handed to the tenant
	// args below; confirm whether it is still needed.
	tenantSettings := cluster.MakeTestingClusterSettingsWithVersions(
		targetVersion, startVersion, false /* initializeVersion */)
	require.NoError(t, clusterversion.Initialize(ctx, startVersion, &tenantSettings.SV))

	// Start a shared process tenant (and its SQL server) whose auto-upgrade
	// statuses are published on statusCh.
	statusCh := make(chan autoUpgradeInfo, 1)
	_, tenantConn, err := host.TenantController().StartSharedProcessTenant(ctx,
		base.TestSharedProcessTenantArgs{
			TenantName: roachpb.TenantName("hola-crdb"),
			Knobs: base.TestingKnobs{
				Server: &server.TestingKnobs{
					TenantAutoUpgradeInfo: statusCh,
					AllowTenantAutoUpgradeOnInternalVersionChanges: true,
					BootstrapVersionKeyOverride:                    v0,
					BinaryVersionOverride:                          startVersion,
				},
			},
		})
	require.NoError(t, err)
	tenantSQL := sqlutils.MakeSQLRunner(tenantConn)

	// Sanity-check the tenant before the host upgrade.
	tenantSQL.Exec(t, "CREATE TABLE t (i INT PRIMARY KEY)")
	tenantSQL.Exec(t, "INSERT INTO t VALUES (1), (2)")

	// Move the host cluster to the target version.
	hostSQL.Exec(t, "SET CLUSTER SETTING version = $1", targetVersion.String())

	// The tenant must keep serving queries after the host upgrade.
	tenantSQL.CheckQueryResults(t, "SELECT * FROM t", [][]string{{"1"}, {"2"}})

	waitLimit := 20 * time.Second
	if util.RaceEnabled {
		waitLimit = 60 * time.Second
	}
	wantStatus := int(server.UpgradeAllowed)

	// Drain statuses from the knob until the expected one shows up. The timeout
	// is re-armed on every iteration, so it bounds the gap between consecutive
	// statuses.
	for {
		select {
		case got := <-statusCh:
			if got.UpgradeTo != targetVersion || got.Status != wantStatus {
				continue
			}
			return
		case <-time.After(waitLimit):
			t.Fatalf("failed to receive the right auto upgrade status after %d seconds", int(waitLimit.Seconds()))
		}
	}
}

// TestTenantUpgrade exercises the case where a system tenant is in a
// non-finalized version state and creates a tenant. The test ensures
// that the newly created tenant begins in that same version.
Expand All @@ -48,6 +264,7 @@ import (
func TestTenantUpgrade(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)
skip.UnderStressRace(t)
ctx := context.Background()

v1 := clusterversion.TestingBinaryMinSupportedVersion
Expand Down Expand Up @@ -93,6 +310,9 @@ func TestTenantUpgrade(t *testing.T) {
TestingKnobs: base.TestingKnobs{
// Make the upgrade faster by accelerating jobs.
JobsTestingKnobs: jobs.NewTestingKnobsWithShortIntervals(),
Server: &server.TestingKnobs{
DisableAutomaticVersionUpgrade: make(chan struct{}),
},
},
Settings: settings,
}
Expand Down Expand Up @@ -129,6 +349,11 @@ func TestTenantUpgrade(t *testing.T) {
t.Log("restart the tenant")
tenantServer.AppStopper().Stop(ctx)
tenantServer, err := ts.TenantController().StartTenant(ctx, base.TestTenantArgs{
TestingKnobs: base.TestingKnobs{
Server: &server.TestingKnobs{
DisableAutomaticVersionUpgrade: make(chan struct{}),
},
},
TenantID: roachpb.MustMakeTenantID(initialTenantID),
})
require.NoError(t, err)
Expand Down Expand Up @@ -246,6 +471,9 @@ func TestTenantUpgradeFailure(t *testing.T) {
SpanConfig: &spanconfig.TestingKnobs{
ManagerDisableJobCreation: true,
},
Server: &server.TestingKnobs{
DisableAutomaticVersionUpgrade: make(chan struct{}),
},
UpgradeManager: &upgradebase.TestingKnobs{
DontUseJobs: true,
RegistryOverride: func(v roachpb.Version) (upgradebase.Upgrade, bool) {
Expand Down
Expand Up @@ -150,6 +150,9 @@ func runTest(t *testing.T, variant sharedtestutil.TestVariant, test sharedtestut
tenantArgs := base.TestTenantArgs{
TenantID: id,
TestingKnobs: base.TestingKnobs{
Server: &server.TestingKnobs{
DisableAutomaticVersionUpgrade: make(chan struct{}),
},
JobsTestingKnobs: jobs.NewTestingKnobsWithShortIntervals(),
UpgradeManager: &upgradebase.TestingKnobs{
InterlockPausePoint: test.PausePoint,
Expand Down Expand Up @@ -283,6 +286,11 @@ func runTest(t *testing.T, variant sharedtestutil.TestVariant, test sharedtestut
Stopper: otherServerStopper,
TenantID: tenantID,
Settings: otherServerSettings,
TestingKnobs: base.TestingKnobs{
Server: &server.TestingKnobs{
DisableAutomaticVersionUpgrade: make(chan struct{}),
},
},
})

var otherTenantRunner *sqlutils.SQLRunner
Expand Down
10 changes: 10 additions & 0 deletions pkg/clusterversion/setting.go
Expand Up @@ -306,3 +306,13 @@ func MakeMetricsAndRegisterOnVersionChangeCallback(sv *settings.Values) Metrics
PreserveDowngradeLastUpdated: gauge,
}
}

// AutoUpgradeEnabled is the application-level boolean cluster setting
// "cluster.auto_upgrade.enabled", used to enable and disable automatic
// upgrade. It is registered with a default of true and is marked reportable
// and public.
//
// NOTE(review): the description string below reads "disable automatic cluster
// version upgrade ...", which describes the effect of setting this to false
// rather than what the setting enables. Confirm whether that wording is
// intentional before changing it; the generated settings docs carry the same
// text.
var AutoUpgradeEnabled = settings.RegisterBoolSetting(
settings.ApplicationLevel,
"cluster.auto_upgrade.enabled",
"disable automatic cluster version upgrade until reset",
true,
settings.WithReportable(true),
settings.WithPublic,
)
1 change: 1 addition & 0 deletions pkg/server/BUILD.bazel
Expand Up @@ -76,6 +76,7 @@ go_library(
"stop_trigger.go",
"tcp_keepalive_manager.go",
"tenant.go",
"tenant_auto_upgrade.go",
"tenant_migration.go",
"testing_knobs.go",
"testserver.go",
Expand Down