Skip to content

Commit

Permalink
upgrade: use high priority txn's to update the cluster version
Browse files Browse the repository at this point in the history
Previously, it was possible for the leasing subsystem to starve out
attempts to set the cluster version during upgrades, since the leasing
subsystem uses high priority txn for renewals. To address this, this
patch makes the logic to set the cluster version high priority so
it can't be pushed out by lease renewals.

Fixes: cockroachdb#113908

Release note (bug fix): Addressed a bug that could cause cluster version
finalization to get starved out by descriptor lease renewals on larger
clusters.
  • Loading branch information
fqazi committed Nov 9, 2023
1 parent 8fb7b97 commit a38c253
Show file tree
Hide file tree
Showing 3 changed files with 135 additions and 0 deletions.
19 changes: 19 additions & 0 deletions pkg/sql/set_cluster_setting.go
Original file line number Diff line number Diff line change
Expand Up @@ -617,6 +617,25 @@ func setVersionSetting(
return err
}
return db.Txn(ctx, func(ctx context.Context, txn isql.Txn) error {
// Run the version bump inside the upgrade as high priority, since
// lease manager ends up reading the version row (with high priority)
// inside the settings table when refreshing leases. On larger clusters
// or with a large number of descriptors its possible for normal
// transactions to be starved (#95227).
// This is safe because we expected this transaction only do the following:
// 1) We expect this transaction to only read and write to the
// version key in the system.settings table.
// 2) Reads from the system.sql_instances table to confirm all SQL servers
// have been upgraded in multi-tenant environments.
// 3) Other transactions will use a normal priority and get pushed out by
// this one.
// 4) There is a potential danger to conflict with lease renewal of the
// settings table and the upgrade, but this cannot occur in practice,
// because the lease will always be acquired before we try to write to
// the settings table.
if err := txn.KV().SetUserPriority(roachpb.MaxUserPriority); err != nil {
return err
}
// Confirm if the version has actually changed on us.
datums, err := txn.QueryRowEx(
ctx, "retrieve-prev-setting", txn.KV(),
Expand Down
1 change: 1 addition & 0 deletions pkg/upgrade/upgrades/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ go_test(
"system_exec_insights_test.go",
"system_rbr_indexes_test.go",
"upgrades_test.go",
"version_starvation_test.go",
],
data = glob(["testdata/**"]),
embed = [":upgrades"],
Expand Down
115 changes: 115 additions & 0 deletions pkg/upgrade/upgrades/version_starvation_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
// Copyright 2023 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package upgrades_test

import (
"context"
"sync/atomic"
"testing"

"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/clusterversion"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/server"
clustersettings "github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/lease"
"github.com/cockroachdb/cockroach/pkg/testutils/testcluster"
"github.com/cockroachdb/cockroach/pkg/upgrade/upgradebase"
"github.com/cockroachdb/cockroach/pkg/upgrade/upgrades"
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
"github.com/stretchr/testify/require"
)

// TestLeasingClusterVersionStarvation validates that setting
// the cluster version is done with a high priority txn and cannot
// be pushed out. Previously, this would be normal priority and
// get pushed by the leasing by the leasing code, leading to starvation
// when leases were acquired with sufficiently high frequency.
// Note: This test just confirms its not normal priority by checking
// if it can push other txns.
func TestLeasingClusterVersionStarvation(t *testing.T) {
defer leaktest.AfterTest(t)()
ctx := context.Background()

routineChan := make(chan error)
startRoutineOnce := atomic.Bool{}
var bgRoutine func()
clusterArgs := base.TestClusterArgs{
ServerArgs: base.TestServerArgs{
Knobs: base.TestingKnobs{
UpgradeManager: &upgradebase.TestingKnobs{
ListBetweenOverride: func(from, to roachpb.Version) []roachpb.Version {
// Using this to detect when the upgrade actually starts.
if startRoutineOnce.Swap(false) {
go bgRoutine()
}
return clusterversion.ListBetween(from, to)
},
},
Server: &server.TestingKnobs{
DisableAutomaticVersionUpgrade: make(chan struct{}),
BinaryVersionOverride: clusterversion.ByKey(
clusterversion.V23_2),
},
},
},
}

// Disable lease renewals intentionally, so that we validate
// no deadlock risk exists with the settings table.
st := clustersettings.MakeTestingClusterSettingsWithVersions(
clusterversion.TestingBinaryVersion,
clusterversion.TestingBinaryMinSupportedVersion,
false)
lease.LeaseDuration.Override(ctx, &st.SV, 0)
lease.LeaseRenewalDuration.Override(ctx, &st.SV, 0)
clusterArgs.ServerArgs.Settings = st

tc := testcluster.StartTestCluster(t, 1, clusterArgs)

defer tc.Stopper().Stop(ctx)
db := tc.ServerConn(0)
defer db.Close()

proceedWithCommit := make(chan struct{})
// Start a background transaction that will have an intent
// on the version key inside the settings table, with a
// normal priority (which should get pushed by the upgrade).
bgRoutine = func() {
tx, err := db.Begin()
if err != nil {
routineChan <- err
return
}
_, err = tx.Exec("SELECT name from system.settings where name='version' FOR UPDATE")
if err != nil {
routineChan <- err
return
}
<-proceedWithCommit
routineChan <- tx.Commit()
}
startRoutineOnce.Swap(true)

upgrades.Upgrade(
t,
db,
clusterversion.V24_1,
nil,
false,
)

// Our txn should have been pushed by the upgrade,
// which has a higher txn priority.
close(proceedWithCommit)
require.ErrorContainsf(t, <-routineChan, "pq: restart transaction: TransactionRetryWithProtoRefreshError:",
"upgrade was not able to push transaction")
}

0 comments on commit a38c253

Please sign in to comment.