Merge #35261
35261: storage: Improve handling of lease index retries r=tbg,nvanbenschoten a=bdarnell

Pipelined writes wait for commands to be evaluated, then detach before
they apply. Retryable errors generated at evaluation time are handled
correctly (by DistSender or above) even for pipelined writes, but
pipelined writes have lost the ability to handle apply-time retry
conditions in the executeWriteBatch loop (there used to be more of
these, but an illegal lease index is now the only condition retried at
this level). To remedy this, this commit moves reproposals due to
illegal lease indexes below raft (but only on the proposing node).
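
As a rough sketch of that apply-time path (all identifiers here are
hypothetical illustrations, not the actual pkg/storage names):

package sketch

// proposal models a command together with the lease index it was
// assigned at evaluation time.
type proposal struct {
	leaseIndex      uint64
	proposedLocally bool // true only on the node that proposed it
}

type replica struct {
	appliedLeaseIndex uint64
}

// applyCommand runs below raft, in log order, on every replica. It
// reports whether the command's effects should be applied.
func (r *replica) applyCommand(p *proposal, repropose func(*proposal)) bool {
	if p.leaseIndex <= r.appliedLeaseIndex {
		// Illegal (out-of-order) lease index: the command must not
		// apply. Rather than return a retryable error to a client that
		// may have detached, the proposing node reproposes the command
		// with a fresh lease index; other replicas simply reject it,
		// since the proposer's retry will reach them through raft.
		if p.proposedLocally {
			repropose(p)
		}
		return false
	}
	r.appliedLeaseIndex = p.leaseIndex
	return true
}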

In practice, this can happen to writes that race with a raft
leadership transfer. This is observable immediately after table
creation, since table creation performs a split and may then perform
a lease transfer to balance load (and if the lease is transferred,
raft leadership is transferred afterward).

Specifically,
1. Lease is transferred from store s1 to s2. s1 is still raft leader.
2. Write w1 evaluates on store s2 and is assigned lease index i1. s2
forwards the proposal to s1.
3. s1 initiates raft leader transfer to s2. This puts it into a
temporarily leaderless state so it drops the forwarded proposal.
4. s2 is elected raft leader, completing the transfer.
5. A second write w2 evaluates on s2, is assigned lease index i2, and
goes right into the raft log since s2 is both leaseholder and leader.
6. s2 refreshes proposals as a side effect of becoming leader, and
writes w1 to the log with lease index i1.
7. s2 applies w2, then w1. w1 fails because of its out-of-order lease
index (see the toy simulation below).
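
A toy simulation of the rejection in step 7, under the simplified
assumption that a command applies only if its lease index exceeds the
last one applied:

package main

import "fmt"

func main() {
	appliedLeaseIndex := uint64(0)

	apply := func(name string, leaseIndex uint64) {
		if leaseIndex <= appliedLeaseIndex {
			fmt.Printf("%s rejected: lease index %d <= applied %d\n",
				name, leaseIndex, appliedLeaseIndex)
			return
		}
		appliedLeaseIndex = leaseIndex
		fmt.Printf("%s applied at lease index %d\n", name, leaseIndex)
	}

	// w1 was assigned lease index i1=1 at evaluation time but reaches
	// the log after w2 (i2=2) because of the leadership transfer.
	apply("w2", 2) // w2 applied at lease index 2
	apply("w1", 1) // w1 rejected: lease index 1 <= applied 2
}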

If w1 was pipelined, the client stops listening after step 2, and
won't learn of the failure until it tries to commit. At this point the
commit returns an ASYNC_WRITE_FAILURE retry to the client.
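
A rough model of why the failure surfaces only at commit (simplified;
the names are illustrative, not CockroachDB's actual client API):

package sketch

import "fmt"

// pipelinedTxn tracks writes that were evaluated but whose application
// was never confirmed.
type pipelinedTxn struct {
	inFlight []string
}

// write returns as soon as evaluation succeeds (after step 2 above);
// it does not wait for the command to apply.
func (t *pipelinedTxn) write(key string) {
	t.inFlight = append(t.inFlight, key)
}

// commit must prove that every in-flight write actually applied; a
// write lost to an apply-time error is discovered only here.
func (t *pipelinedTxn) commit(applied func(string) bool) error {
	for _, key := range t.inFlight {
		if !applied(key) {
			return fmt.Errorf("ASYNC_WRITE_FAILURE: write to %s never applied", key)
		}
	}
	return nil
}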

Note that in some cases, NotLeaseHolderError can be generated at apply
time. These errors cannot be retried (since the proposer has lost its
lease) so they will unavoidably result in an ASYNC_WRITE_FAILURE error
to the client. However, this is uncommon: most NotLeaseHolderErrors
are generated at evaluation time, which is compatible with pipelined
writes.
Fixes #28876

The fourth commit is the important part of this change. The fifth is optional; it's a simplification but it changes an under-tested error path. Assuming we like the fifth commit, I'd like to add a sixth that rips out proposalReevaluationReason and the executeWriteBatch/tryExecuteWriteBatch loop altogether. 

Co-authored-by: Ben Darnell <ben@bendarnell.com>
craig[bot] and bdarnell committed Mar 19, 2019
2 parents 01f9662 + f7dc847 commit cf449c6
Showing 9 changed files with 462 additions and 205 deletions.
111 changes: 111 additions & 0 deletions pkg/storage/batcheval/cmd_lease_test.go
@@ -0,0 +1,111 @@
// Copyright 2019 The Cockroach Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License. See the AUTHORS file
// for names of contributors.

package batcheval

import (
	"context"
	"testing"
	"time"

	"github.com/cockroachdb/cockroach/pkg/base"
	"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
	"github.com/cockroachdb/cockroach/pkg/util/log"
)

// TestLeaseTransferWithPipelinedWrite verifies that pipelined writes
// do not cause retry errors to be leaked to clients when the error
// can be handled internally. Pipelining dissociates a write from its
// caller, so internally-generated errors (specifically out-of-order
// lease indexes) must be retried below that level.
//
// This issue was observed in practice to affect the first insert
// after table creation with high probability.
func TestLeaseTransferWithPipelinedWrite(t *testing.T) {
	defer leaktest.AfterTest(t)()

	ctx := context.Background()

	tc := serverutils.StartTestCluster(t, 3, base.TestClusterArgs{})
	defer tc.Stopper().Stop(ctx)

	db := tc.ServerConn(0)

	// More than 30 iterations is flaky under stressrace on teamcity.
	for iter := 0; iter < 30; iter++ {
		log.Infof(ctx, "iter %d", iter)
		if _, err := db.ExecContext(ctx, "drop table if exists test"); err != nil {
			t.Fatal(err)
		}
		if _, err := db.ExecContext(ctx, "create table test (a int, b int, primary key (a, b))"); err != nil {
			t.Fatal(err)
		}

		workerErrCh := make(chan error, 1)
		go func() {
			workerErrCh <- func() error {
				for i := 0; i < 1; i++ {
					tx, err := db.BeginTx(ctx, nil)
					if err != nil {
						return err
					}
					defer func() {
						if tx != nil {
							if err := tx.Rollback(); err != nil {
								log.Warningf(ctx, "error rolling back: %s", err)
							}
						}
					}()
					// Run two inserts in a transaction to ensure that we have
					// pipelined writes that cannot be retried at the SQL layer
					// due to the first-statement rule.
					if _, err := tx.ExecContext(ctx, "INSERT INTO test (a, b) VALUES ($1, $2)", i, 1); err != nil {
						return err
					}
					if _, err := tx.ExecContext(ctx, "INSERT INTO test (a, b) VALUES ($1, $2)", i, 2); err != nil {
						return err
					}
					if err := tx.Commit(); err != nil {
						return err
					}
					tx = nil
				}
				return nil
			}()
		}()

		// TODO(bdarnell): This test reliably reproduced the issue when
		// introduced, because table creation causes splits and repeated
		// table creation leads to lease transfers due to rebalancing.
		// This is a subtle thing to rely on and the test might become
		// more reliable if we ran more iterations in the worker goroutine
		// and added a second goroutine to explicitly transfer leases back
		// and forth.

		select {
		case <-time.After(15 * time.Second):
			// TODO(bdarnell): The test seems flaky under stress with a 5s
			// timeout. Why? I'm giving it a high timeout since hanging
			// isn't a failure mode we're particularly concerned about here,
			// but it shouldn't be taking this long even with stress.
			t.Fatal("timed out")
		case err := <-workerErrCh:
			if err != nil {
				t.Fatalf("worker failed: %s", err)
			}
		}
	}
}
6 changes: 2 additions & 4 deletions pkg/storage/replica.go
@@ -988,13 +988,11 @@ type endCmds struct {
 
 // done releases the latches acquired by the command and updates
 // the timestamp cache using the final timestamp of each command.
-func (ec *endCmds) done(
-	br *roachpb.BatchResponse, pErr *roachpb.Error, retry proposalReevaluationReason,
-) {
+func (ec *endCmds) done(br *roachpb.BatchResponse, pErr *roachpb.Error) {
 	// Update the timestamp cache if the request is not being re-evaluated. Each
 	// request is considered in turn; only those marked as affecting the cache are
 	// processed. Inconsistent reads are excluded.
-	if retry == proposalNoReevaluation && ec.ba.ReadConsistency == roachpb.CONSISTENT {
+	if ec.ba.ReadConsistency == roachpb.CONSISTENT {
 		ec.repl.updateTimestampCache(&ec.ba, br, pErr)
 	}
 
14 changes: 6 additions & 8 deletions pkg/storage/replica_proposal.go
@@ -110,7 +110,7 @@ type ProposalData struct {
 // counted on to invoke endCmds itself.)
 func (proposal *ProposalData) finishApplication(pr proposalResult) {
 	if proposal.endCmds != nil {
-		proposal.endCmds.done(pr.Reply, pr.Err, pr.ProposalRetry)
+		proposal.endCmds.done(pr.Reply, pr.Err)
 		proposal.endCmds = nil
 	}
 	if proposal.sp != nil {
@@ -808,14 +808,12 @@ func (r *Replica) handleEvalResultRaftMuLocked(
 }
 
 // proposalResult indicates the result of a proposal. Exactly one of
-// Reply, Err and ProposalRetry is set, and it represents the result of
-// the proposal.
+// Reply and Err is set, and it represents the result of the proposal.
 type proposalResult struct {
-	Reply         *roachpb.BatchResponse
-	Err           *roachpb.Error
-	ProposalRetry proposalReevaluationReason
-	Intents       []result.IntentsWithArg
-	EndTxns       []result.EndTxnIntents
+	Reply   *roachpb.BatchResponse
+	Err     *roachpb.Error
+	Intents []result.IntentsWithArg
+	EndTxns []result.EndTxnIntents
 }
 
 // evaluateProposal generates a Result from the given request by

// evaluateProposal generates a Result from the given request by