Skip to content

Commit

Permalink
allocatorimpl: fix replace constraints check function
Browse files Browse the repository at this point in the history
Previously, it was possible for replica replacement to not return a
valid target when some constraint was initially undersatisfied in
addition to the replica being replaced satisfying a separate constraint.

When this occurred, it could stall decommissioning a node, as replicas
become stuck with the allocator returning no valid replacement target.

Update the `replaceConstraintsCheck` function to correctly consider
whether the replacement store satisfies the constraint when computing if
the replacement is necessary.

Fixes: #117886
Part of: #117891
Release note (bug fix): Decommissioning replicas which are part of a
mis-replicated range will no longer get stuck on a rebalance operation
that was falsely determined to be unsafe. This bug was introduced in
23.1.0.
  • Loading branch information
kvoli committed Jan 24, 2024
1 parent 0f13536 commit 3d731c5
Show file tree
Hide file tree
Showing 4 changed files with 254 additions and 42 deletions.
51 changes: 20 additions & 31 deletions pkg/kv/kvserver/allocator/allocatorimpl/allocator_scorer.go
Original file line number Diff line number Diff line change
Expand Up @@ -1116,20 +1116,9 @@ func rankedCandidateListForAllocation(
continue
}

constraintsOK, necessary := constraintsCheck(s)
if !constraintsOK {
if necessary {
log.KvDistribution.VEventf(
ctx,
3,
"cannot allocate necessary %s on s%d",
targetType,
s.StoreID,
)
}
continue
if constraintsOK, _ := constraintsCheck(s); constraintsOK {
validCandidateStores = append(validCandidateStores, s)
}
validCandidateStores = append(validCandidateStores, s)
}

// Create a new store list, which will update the average for each stat to
Expand Down Expand Up @@ -2074,7 +2063,8 @@ func allocateConstraintsCheck(
// that is not already overly satisfied by existing replicas (other than the
// replacement), then it's necessary. If there are any necessary constraints
// that are not satisfied by the candidate when the existing store did satisfy
// that constraint, then the candidate is considered invalid entirely.
// that constraint, then the candidate is considered invalid and unnecessary
// entirely.
func replaceConstraintsCheck(
store, existingStore roachpb.StoreDescriptor, analyzed constraint.AnalyzedConstraints,
) (valid bool, necessary bool) {
Expand All @@ -2089,23 +2079,22 @@ func replaceConstraintsCheck(
satisfiedByCandidateStore := constraint.CheckStoreConjunction(store, constraints.Constraints)
if satisfiedByCandidateStore {
valid = true
}

// If the constraint is not already satisfied, it's necessary.
// Additionally, if the constraint is only just satisfied by the existing
// store being replaced, since that store is going away, the constraint is
// also marked as necessary.
if len(matchingStores) < int(constraints.NumReplicas) ||
(len(matchingStores) == int(constraints.NumReplicas) &&
satisfiedByExistingStore) {
necessary = true
}

// Check if existing store matches a constraint that isn't overly satisfied.
// If so, then only replacing it with a satisfying store is valid to ensure
// that the constraint stays fully satisfied.
if necessary && satisfiedByExistingStore && !satisfiedByCandidateStore {
return false, necessary
// If the constraint is not already satisfied, it's necessary.
// Additionally, if the constraint is only just satisfied by the existing
// store being replaced, since that store is going away, the constraint is
// also marked as necessary.
if len(matchingStores) < int(constraints.NumReplicas) ||
(len(matchingStores) == int(constraints.NumReplicas) &&
satisfiedByExistingStore) {
necessary = true
}
} else if satisfiedByExistingStore {
// Check if existing store matches a constraint that isn't overly satisfied.
// If so, then only replacing it with a satisfying store is valid to ensure
// that the constraint stays fully satisfied.
if len(matchingStores) <= int(constraints.NumReplicas) {
return false, false
}
}
}

Expand Down
233 changes: 233 additions & 0 deletions pkg/kv/kvserver/allocator/allocatorimpl/allocator_scorer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1089,7 +1089,240 @@ func TestRemoveConstraintsCheck(t *testing.T) {
}
})
}
}

func TestReplaceConstraintsCheck(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)

testCases := []struct {
name string
constraints []roachpb.ConstraintsConjunction
numReplicas int32
existing []roachpb.StoreID
replacingStore roachpb.StoreID
expectedValid map[roachpb.StoreID]bool
expectedNecessary map[roachpb.StoreID]bool
}{
{
name: "required constraint",
constraints: []roachpb.ConstraintsConjunction{
{
Constraints: []roachpb.Constraint{
{Value: "a", Type: roachpb.Constraint_REQUIRED},
},
},
},
existing: []roachpb.StoreID{testStoreUSa15},
replacingStore: testStoreUSa15,
expectedValid: map[roachpb.StoreID]bool{
testStoreUSa15Dupe: true,
testStoreUSa1: true,
testStoreUSb: true,
testStoreEurope: false,
},
expectedNecessary: map[roachpb.StoreID]bool{
// NB: No stores are considered necessary replacements as the
// num_replicas is not specified for the constraint conjunction.
testStoreUSa15Dupe: false,
testStoreUSa1: false,
testStoreUSb: false,
testStoreEurope: false,
},
},
{
name: "prohibited constraint",
constraints: []roachpb.ConstraintsConjunction{
{
Constraints: []roachpb.Constraint{
{Value: "b", Type: roachpb.Constraint_PROHIBITED},
},
},
},
existing: []roachpb.StoreID{testStoreUSa15},
replacingStore: testStoreUSa15,
expectedValid: map[roachpb.StoreID]bool{
testStoreUSa15Dupe: true,
testStoreUSa1: false,
testStoreUSb: false,
testStoreEurope: true,
},
expectedNecessary: map[roachpb.StoreID]bool{
// NB: No stores are considered necessary replacements as the
// num_replicas is not specified for the constraint conjunction.
testStoreUSa15Dupe: false,
testStoreUSa1: false,
testStoreUSb: false,
testStoreEurope: false,
},
},
{
name: "single per-replica constraint existing=num_replicas",
constraints: []roachpb.ConstraintsConjunction{
{
Constraints: []roachpb.Constraint{
{Value: "a", Type: roachpb.Constraint_REQUIRED},
},
NumReplicas: 1,
},
},
existing: []roachpb.StoreID{testStoreUSa15},
replacingStore: testStoreUSa15,
expectedValid: map[roachpb.StoreID]bool{
testStoreUSa15Dupe: true,
testStoreUSa1: true,
testStoreUSb: true,
testStoreEurope: false,
},
expectedNecessary: map[roachpb.StoreID]bool{
testStoreUSa15Dupe: true,
testStoreUSa1: true,
testStoreUSb: true,
testStoreEurope: false,
},
},
{
name: "single per-replica constraint existing < num_replicas",
constraints: []roachpb.ConstraintsConjunction{
{
Constraints: []roachpb.Constraint{
{Value: "c", Type: roachpb.Constraint_REQUIRED},
},
NumReplicas: 1,
},
},
existing: []roachpb.StoreID{testStoreEurope},
replacingStore: testStoreEurope,
expectedValid: map[roachpb.StoreID]bool{
testStoreUSa15: false,
testStoreUSa15Dupe: false,
testStoreUSa1: false,
testStoreUSb: true,
},
expectedNecessary: map[roachpb.StoreID]bool{
testStoreUSa15: false,
testStoreUSa15Dupe: false,
testStoreUSa1: false,
testStoreUSb: true,
},
},
{
name: "single per-replica constraint existing > num_replicas",
constraints: []roachpb.ConstraintsConjunction{
{
Constraints: []roachpb.Constraint{
{Value: "a", Type: roachpb.Constraint_REQUIRED},
},
NumReplicas: 1,
},
},
existing: []roachpb.StoreID{testStoreUSa15, testStoreUSa15Dupe},
replacingStore: testStoreUSa15,
expectedValid: map[roachpb.StoreID]bool{
testStoreUSa15Dupe: true,
testStoreUSa1: true,
testStoreUSb: true,
testStoreEurope: false,
},
expectedNecessary: map[roachpb.StoreID]bool{
// NB: The constraint is over-satisfied, no store should be considered
// necessary as a replacement for testStoreUSa15.
testStoreUSa15Dupe: false,
testStoreUSa1: false,
testStoreUSb: false,
testStoreEurope: false,
},
},
{
name: "multiple per-replica constraint existing < num_replicas",
constraints: []roachpb.ConstraintsConjunction{
{
Constraints: []roachpb.Constraint{
{Value: "a", Type: roachpb.Constraint_REQUIRED},
},
NumReplicas: 1,
},
{
Constraints: []roachpb.Constraint{
{Value: "b", Type: roachpb.Constraint_REQUIRED},
},
NumReplicas: 1,
},
},
numReplicas: 2,
// We are missing a replica which satisfies the "b" constraint here.
existing: []roachpb.StoreID{testStoreUSa15, testStoreUSa15Dupe},
replacingStore: testStoreUSa15,
expectedValid: map[roachpb.StoreID]bool{
testStoreUSa15Dupe: true,
testStoreUSa1: true,
testStoreUSb: true,
testStoreEurope: false,
},
expectedNecessary: map[roachpb.StoreID]bool{
testStoreUSa15Dupe: false,
testStoreUSa1: true,
testStoreUSb: true,
testStoreEurope: false,
},
},
{
name: "multiple per-replica constraint existing == num_replicas unconstrained",
constraints: []roachpb.ConstraintsConjunction{
{
Constraints: []roachpb.Constraint{
{Value: "a", Type: roachpb.Constraint_REQUIRED},
},
NumReplicas: 1,
},
{
Constraints: []roachpb.Constraint{
{Value: "b", Type: roachpb.Constraint_REQUIRED},
},
NumReplicas: 1,
},
},
// One replica is unconstrained (sum(constraint_num_replicas) !=
// num_replicas).
numReplicas: 3,
existing: []roachpb.StoreID{testStoreUSa1},
replacingStore: testStoreUSa1,
expectedValid: map[roachpb.StoreID]bool{
testStoreUSa15: false,
testStoreUSa15Dupe: false,
testStoreUSb: true,
testStoreEurope: false,
},
expectedNecessary: map[roachpb.StoreID]bool{
testStoreUSa15: false,
testStoreUSa15Dupe: false,
testStoreUSb: true,
testStoreEurope: false,
},
},
}

for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
conf := roachpb.SpanConfig{
Constraints: tc.constraints,
NumReplicas: tc.numReplicas,
}
analyzed := constraint.AnalyzeConstraints(mockStoreResolver{}, testStoreReplicas(tc.existing), conf.NumReplicas, conf.Constraints)
for storeID, s := range testStores {
if storeID == tc.replacingStore {
continue
}
t.Run(fmt.Sprintf("%s/candidate=s%d,replacing=s%d", tc.name, s.StoreID, tc.replacingStore), func(t *testing.T) {
valid, necessary := replaceConstraintsCheck(s, testStores[tc.replacingStore], analyzed)
require.Equal(t, tc.expectedValid[s.StoreID], valid,
"mismatch replaceConstraintsCheck(s%d,s%d).valid", s.StoreID, tc.replacingStore)
require.Equal(t, tc.expectedNecessary[s.StoreID], necessary,
"mismatch replaceConstraintsCheck(s%d,s%d).necessary", s.StoreID, tc.replacingStore)
})
}
})
}
}

func TestShouldRebalanceDiversity(t *testing.T) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,15 +40,7 @@ assertion type=conformance under=0 over=0 unavailable=0 violating=0

eval duration=20m
----
failed assertion sample 1
conformance unavailable=0 under=0 over=0 violating=0
actual unavailable=0 under=0, over=0 violating=5 lease-violating=0 lease-less-preferred=0
violating constraints:
r1:000000{0000-2000} [(n1,s1):1, (n4,s4):4, (n3,s3):3] applying constraints=[+region=a:1 +region=b:1 +region=c:1]
r2:000000{2000-4000} [(n4,s4):1, (n1,s1):2, (n3,s3):4] applying constraints=[+region=a:1 +region=b:1 +region=c:1]
r3:000000{4000-6000} [(n4,s4):1, (n3,s3):2, (n1,s1):3] applying constraints=[+region=a:1 +region=b:1 +region=c:1]
r4:000000{6000-8000} [(n4,s4):1, (n3,s3):2, (n1,s1):4] applying constraints=[+region=a:1 +region=b:1 +region=c:1]
r5:00000{08000-10000} [(n4,s4):1, (n3,s3):2, (n1,s1):4] applying constraints=[+region=a:1 +region=b:1 +region=c:1]
OK

topology
----
Expand Down
2 changes: 0 additions & 2 deletions pkg/kv/kvserver/store_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3794,7 +3794,6 @@ func TestAllocatorCheckRange(t *testing.T) {
expectedAction: allocatorimpl.AllocatorReplaceDecommissioningVoter,
expectAllocatorErr: true,
expectedErrStr: "replicas must match constraints",
expectedLogMessage: "cannot allocate necessary voter on s3",
},
{
name: "decommissioning without satisfying multiple partial constraints",
Expand Down Expand Up @@ -3834,7 +3833,6 @@ func TestAllocatorCheckRange(t *testing.T) {
expectedAction: allocatorimpl.AllocatorReplaceDecommissioningVoter,
expectAllocatorErr: true,
expectedErrStr: "replicas must match constraints",
expectedLogMessage: "cannot allocate necessary voter on s3",
},
{
name: "decommissioning during upreplication with partial constraints",
Expand Down

0 comments on commit 3d731c5

Please sign in to comment.