Skip to content
Browse files
Bug Fix: Ensure that followers wait before deleting tablet (#4602)
In #4496, we introduced the concept of waiting for latest membership status, before the source Alpha proposes deletion of the tablet (/predicate). However, that fix only applied that check on the source Alpha group's leader. It should have been applied on each member of the Alpha group.

This PR fixes that by making the expected membership checksum part of the `CleanPredicate` proposal. That way, each Alpha can block until they see the latest membership update which says that they no longer serve that tablet, before deleting it. This fixes a bunch of `bank/move-tablet` failures, but we still see very rare `bank/move-tablet` violations.

Another fix this PR contains is to avoid doing posting list splits, which were causing `uid-set/move-tablet` failures, fixing #4538.

* Martin's Split code has a bug. Let's not split posting lists.
* Better messaging around turning off split
* Make ExpectedChecksum part of the CleanPredicate proposal, so all followers also block deletion of the predicate until they have seen the latest membership update.

(cherry picked from commit 2540a93)
  • Loading branch information
manishrjain authored and danielmai committed Jan 27, 2020
1 parent 79be69d commit d61eb6905ba45872629569e6d42416a2106cad68
Showing 6 changed files with 324 additions and 281 deletions.
@@ -270,6 +270,7 @@ message Proposal {
OracleDelta delta = 8;
Snapshot snapshot = 9; // Used to tell the group when to take snapshot.
uint64 index = 10; // Used to store Raft index, in raft.Ready.
uint64 expected_checksum = 11; // Block an operation until membership reaches this checksum.

message KVS {

Large diffs are not rendered by default.

@@ -364,6 +364,22 @@ func (n *node) applyCommitted(proposal *pb.Proposal) error {

case len(proposal.CleanPredicate) > 0:
n.elog.Printf("Cleaning predicate: %s", proposal.CleanPredicate)
end := time.Now().Add(10 * time.Second)
for proposal.ExpectedChecksum > 0 && time.Now().Before(end) {
cur := atomic.LoadUint64(&groups().membershipChecksum)
if proposal.ExpectedChecksum == cur {
time.Sleep(100 * time.Millisecond)
glog.Infof("Waiting for checksums to match. Expected: %d. Current: %d\n",
proposal.ExpectedChecksum, cur)
if time.Now().After(end) {
"Giving up on predicate deletion: %q due to timeout. Wanted checksum: %d.",
proposal.CleanPredicate, proposal.ExpectedChecksum)
return nil
return posting.DeletePredicate(ctx, proposal.CleanPredicate)

case proposal.Delta != nil:
@@ -328,6 +328,7 @@ func (g *groupi) applyState(state *pb.MembershipState) {
// removing a freshly added node.

for _, member := range g.state.GetRemoved() {
// TODO: This leader check can be done once instead of repeatedly.
if member.GetGroupId() == g.Node.gid && g.Node.AmLeader() {
go func() {
// Don't try to remove a member if it's already marked as removed in
@@ -20,8 +20,6 @@ import (

@@ -193,25 +191,14 @@ func (w *grpcWorker) MovePredicate(ctx context.Context,
return &emptyPayload, errEmptyPredicate

// This loop ensures that we have seen the latest membership update following the move where
// this predicate now belongs to another group. Without this check, this group could end up
// serving transactions asking for this predicate, even after this tablet has been deleted. This
// issue is known to have caused Jepsen failures.
for in.ExpectedChecksum > 0 {
cur := atomic.LoadUint64(&groups().membershipChecksum)
if in.ExpectedChecksum == cur {
if ctx.Err() != nil {
return &emptyPayload, ctx.Err()
glog.Infof("Waiting for checksums to match. Expected: %d. Current: %d\n",
in.ExpectedChecksum, cur)
if in.DestGid == 0 {
glog.Infof("Was instructed to delete tablet: %v", in.Predicate)
p := &pb.Proposal{CleanPredicate: in.Predicate}
// Expected Checksum ensures that all the members of this group would block until they get
// the latest membership status where this predicate now belongs to another group. So they
// know that they are no longer serving this predicate, before they delete it from their
// state. Without this checksum, the members could end up deleting the predicate and then
// serve a request asking for that predicate, causing Jepsen failures.
p := &pb.Proposal{CleanPredicate: in.Predicate, ExpectedChecksum: in.ExpectedChecksum}
return &emptyPayload, groups().Node.proposeAndWait(ctx, p)
if err := posting.Oracle().WaitForTs(ctx, in.TxnTs); err != nil {
@@ -796,7 +796,8 @@ func processTask(ctx context.Context, q *pb.Query, gid uint32) (*pb.Result, erro
stop := x.SpanTimer(span, "processTask"+q.Attr)
defer stop()

span.Annotatef(nil, "Waiting for startTs: %d", q.ReadTs)
span.Annotatef(nil, "Waiting for startTs: %d at node: %d, gid: %d",
q.ReadTs, groups().Node.Id, gid)
if err := posting.Oracle().WaitForTs(ctx, q.ReadTs); err != nil {
return &pb.Result{}, err

0 comments on commit d61eb69

Please sign in to comment.