Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 28 additions & 23 deletions pkg/cmd/roachtest/tests/multi_store_remove.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,15 @@ import (
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec"
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
"github.com/cockroachdb/cockroach/pkg/util/retry"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/cockroachdb/errors"
"github.com/stretchr/testify/require"
)

const (
Expand Down Expand Up @@ -68,6 +71,7 @@ func runMultiStoreRemove(ctx context.Context, t test.Test, c cluster.Cluster) {
)

c.Start(ctx, t.L(), startOpts, startSettings, c.Range(1, 3))
require.NoError(t, roachtestutil.WaitFor3XReplication(ctx, t.L(), c.Conn(ctx, t.L(), 1)))

// Confirm that there are 6 stores live.
t.Status("store setup")
Expand Down Expand Up @@ -134,6 +138,9 @@ func runMultiStoreRemove(ctx context.Context, t test.Test, c cluster.Cluster) {
if err := c.StartE(ctx, t.L(), startOpts, startSettings, node); err != nil {
t.Fatalf("restarting node: %s", err)
}
// Example: with 12 stores per node, since we removed n1's last one, we did remove
// s12.
removedStoreID := multiStoreStoresPerNode

// Wait for the store to be marked as dead.
t.Status("awaiting store death")
Expand All @@ -151,32 +158,30 @@ func runMultiStoreRemove(ctx context.Context, t test.Test, c cluster.Cluster) {
t.Fatalf("awaiting store death: %s", err)
}

// Wait for up-replication.
// NOTE: At the time of writing, under-replicated ranges are computed using
// node liveness, rather than store liveness, so we instead compare the range
// count to the current per-store replica count to compute whether all there
// are still under-replicated ranges from the dead store.
// TODO(travers): Once #123561 is solved, re-work this.
t.Status("awaiting up-replication")
tStart := timeutil.Now()
var oldReplicas int
// Wait until no ranges remain on the removed store.
t.Status("awaiting range relocation")
sr := sqlutils.MakeSQLRunner(conn)
for {
var ranges, replicas int
if err := conn.QueryRowContext(ctx,
`SELECT
(SELECT count(1) FROM crdB_internal.ranges) AS ranges
, (SELECT sum(range_count) FROM crdb_internal.kv_store_status) AS replicas`,
).Scan(&ranges, &replicas); err != nil {
t.Fatalf("replication status: %s", err)
}
if replicas == 3*ranges {
t.L().Printf("up-replication complete")
res := sr.QueryStr(t, `
SELECT
(SELECT count(*) FROM crdb_internal.ranges_no_leases
WHERE $1::INT = ANY(replicas)) AS cnt,
ARRAY(
SELECT range_id
FROM crdb_internal.ranges_no_leases
WHERE $1::INT = ANY(replicas)
ORDER BY range_id
LIMIT 10) AS sample;
`, removedStoreID)
cnt := res[0][0]
sample := res[0][1]

if cnt == "0" {
t.L().Printf("all ranges moved off removed store")
break
}
if timeutil.Since(tStart) > 30*time.Second || oldReplicas != replicas {
t.L().Printf("still waiting for replication (%d / %d)", replicas, 3*ranges)
}
oldReplicas = replicas

t.L().Printf("%s ranges remain on removed store; for example: %s", cnt, sample)
time.Sleep(5 * time.Second)
}
t.Status("done")
Expand Down
Loading